From 2bf6d93547e516b6b2b2051c0fb1b47ea4acc8a4 Mon Sep 17 00:00:00 2001
From: Martin Liska
Date: Mon, 6 May 2019 09:18:26 +0200
Subject: [PATCH] Split i386.c.

2019-05-06  Martin Liska

	* config/i386/i386-builtins.c: New file.
	* config/i386/i386-builtins.h: New file.
	* config/i386/i386-expand.c: New file.
	* config/i386/i386-expand.h: New file.
	* config/i386/i386-features.c: New file.
	* config/i386/i386-features.h: New file.
	* config/i386/i386-options.c: New file.
	* config/i386/i386-options.h: New file.
	* config.gcc: Add new files into extra_objs and
	target_gtfiles.
	* config/i386/i386.c: Split content of the file
	into newly introduced files.
	* config/i386/i386.h: Declare common variables
	and macros.
	* config/i386/t-i386: Define dependencies for new files.

From-SVN: r270895
---
 gcc/ChangeLog                   |    18 +
 gcc/config.gcc                  |     6 +-
 gcc/config/i386/i386-builtins.c |  2539 ++
 gcc/config/i386/i386-builtins.h |   330 +
 gcc/config/i386/i386-expand.c   | 19840 ++++++++++
 gcc/config/i386/i386-expand.h   |    58 +
 gcc/config/i386/i386-features.c |  2742 ++
 gcc/config/i386/i386-features.h |   201 +
 gcc/config/i386/i386-options.c  |  3688 ++
 gcc/config/i386/i386-options.h  |    95 +
 gcc/config/i386/i386.c          | 63106 ++++++++----------
 gcc/config/i386/i386.h          |     9 +
 gcc/config/i386/t-i386          |    16 +
 13 files changed, 46639 insertions(+), 46009 deletions(-)
 create mode 100644 gcc/config/i386/i386-builtins.c
 create mode 100644 gcc/config/i386/i386-builtins.h
 create mode 100644 gcc/config/i386/i386-expand.c
 create mode 100644 gcc/config/i386/i386-expand.h
 create mode 100644 gcc/config/i386/i386-features.c
 create mode 100644 gcc/config/i386/i386-features.h
 create mode 100644 gcc/config/i386/i386-options.c
 create mode 100644 gcc/config/i386/i386-options.h

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 1a4bcebb6ad..4ca21620723 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,21 @@
+2019-05-06  Martin Liska
+
+	* config/i386/i386-builtins.c: New file.
+	* config/i386/i386-builtins.h: New file.
+	* config/i386/i386-expand.c: New file.
+	* config/i386/i386-expand.h: New file.
+	* config/i386/i386-features.c: New file.
+	* config/i386/i386-features.h: New file.
+	* config/i386/i386-options.c: New file.
+	* config/i386/i386-options.h: New file.
+	* config.gcc: Add new files into extra_objs and
+	target_gtfiles.
+	* config/i386/i386.c: Split content of the file
+	into newly introduced files.
+	* config/i386/i386.h: Declare common variables
+	and macros.
+	* config/i386/t-i386: Define dependencies for new files.
+ 2019-05-06 Li Jia He * tree-ssa-phiopt.c (two_value_replacement): Fix a typo in parameter diff --git a/gcc/config.gcc b/gcc/config.gcc index 67d68d90be6..5124ea00792 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -382,7 +382,8 @@ i[34567]86-*-*) c_target_objs="i386-c.o" cxx_target_objs="i386-c.o" d_target_objs="i386-d.o" - extra_objs="x86-tune-sched.o x86-tune-sched-bd.o x86-tune-sched-atom.o x86-tune-sched-core.o" + extra_objs="x86-tune-sched.o x86-tune-sched-bd.o x86-tune-sched-atom.o x86-tune-sched-core.o i386-options.o i386-builtins.o i386-expand.o i386-features.o" + target_gtfiles="\$(srcdir)/config/i386/i386-builtins.c \$(srcdir)/config/i386/i386-expand.c \$(srcdir)/config/i386/i386-options.c" extra_options="${extra_options} fused-madd.opt" extra_headers="cpuid.h mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h @@ -414,7 +415,8 @@ x86_64-*-*) cxx_target_objs="i386-c.o" d_target_objs="i386-d.o" extra_options="${extra_options} fused-madd.opt" - extra_objs="x86-tune-sched.o x86-tune-sched-bd.o x86-tune-sched-atom.o x86-tune-sched-core.o" + extra_objs="x86-tune-sched.o x86-tune-sched-bd.o x86-tune-sched-atom.o x86-tune-sched-core.o i386-options.o i386-builtins.o i386-expand.o i386-features.o" + target_gtfiles="\$(srcdir)/config/i386/i386-builtins.c \$(srcdir)/config/i386/i386-expand.c \$(srcdir)/config/i386/i386-options.c" extra_headers="cpuid.h mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h diff --git a/gcc/config/i386/i386-builtins.c b/gcc/config/i386/i386-builtins.c new file mode 100644 index 00000000000..9779727480f --- /dev/null +++ b/gcc/config/i386/i386-builtins.c @@ -0,0 +1,2539 @@ +/* Copyright (C) 1988-2019 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. 
*/ + +#define IN_TARGET_CODE 1 + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "rtl.h" +#include "tree.h" +#include "memmodel.h" +#include "gimple.h" +#include "cfghooks.h" +#include "cfgloop.h" +#include "df.h" +#include "tm_p.h" +#include "stringpool.h" +#include "expmed.h" +#include "optabs.h" +#include "regs.h" +#include "emit-rtl.h" +#include "recog.h" +#include "cgraph.h" +#include "diagnostic.h" +#include "cfgbuild.h" +#include "alias.h" +#include "fold-const.h" +#include "attribs.h" +#include "calls.h" +#include "stor-layout.h" +#include "varasm.h" +#include "output.h" +#include "insn-attr.h" +#include "flags.h" +#include "except.h" +#include "explow.h" +#include "expr.h" +#include "cfgrtl.h" +#include "common/common-target.h" +#include "langhooks.h" +#include "reload.h" +#include "gimplify.h" +#include "dwarf2.h" +#include "tm-constrs.h" +#include "params.h" +#include "cselib.h" +#include "sched-int.h" +#include "opts.h" +#include "tree-pass.h" +#include "context.h" +#include "pass_manager.h" +#include "target-globals.h" +#include "gimple-iterator.h" +#include "tree-vectorizer.h" +#include "shrink-wrap.h" +#include "builtins.h" +#include "rtl-iter.h" +#include "tree-iterator.h" +#include "dbgcnt.h" +#include "case-cfn-macros.h" +#include "dojump.h" +#include "fold-const-call.h" +#include "tree-vrp.h" +#include "tree-ssanames.h" +#include "selftest.h" +#include "selftest-rtl.h" +#include "print-rtl.h" +#include "intl.h" +#include "ifcvt.h" +#include "symbol-summary.h" +#include "ipa-prop.h" +#include "ipa-fnsummary.h" +#include "wide-int-bitmask.h" +#include "tree-vector-builder.h" +#include "debug.h" +#include "dwarf2out.h" +#include "i386-builtins.h" + +#undef BDESC +#undef BDESC_FIRST +#undef BDESC_END + +/* Macros for verification of enum ix86_builtins order. */ +#define BDESC_VERIFY(x, y, z) \ + gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z))) +#define BDESC_VERIFYS(x, y, z) \ + STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z))) + +BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST, + IX86_BUILTIN__BDESC_COMI_LAST, 1); +BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST, + IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1); +BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, + IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1); +BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST, + IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1); +BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, + IX86_BUILTIN__BDESC_ARGS_LAST, 1); +BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, + IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1); +BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST, + IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1); +BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, + IX86_BUILTIN__BDESC_CET_LAST, 1); +BDESC_VERIFYS (IX86_BUILTIN_MAX, + IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1); + + +/* Table for the ix86 builtin non-function types. */ +static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1]; + +/* Retrieve an element from the above table, building some of + the types lazily. 
*/ + +static tree +ix86_get_builtin_type (enum ix86_builtin_type tcode) +{ + unsigned int index; + tree type, itype; + + gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab)); + + type = ix86_builtin_type_tab[(int) tcode]; + if (type != NULL) + return type; + + gcc_assert (tcode > IX86_BT_LAST_PRIM); + if (tcode <= IX86_BT_LAST_VECT) + { + machine_mode mode; + + index = tcode - IX86_BT_LAST_PRIM - 1; + itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]); + mode = ix86_builtin_type_vect_mode[index]; + + type = build_vector_type_for_mode (itype, mode); + } + else + { + int quals; + + index = tcode - IX86_BT_LAST_VECT - 1; + if (tcode <= IX86_BT_LAST_PTR) + quals = TYPE_UNQUALIFIED; + else + quals = TYPE_QUAL_CONST; + + itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]); + if (quals != TYPE_UNQUALIFIED) + itype = build_qualified_type (itype, quals); + + type = build_pointer_type (itype); + } + + ix86_builtin_type_tab[(int) tcode] = type; + return type; +} + +/* Table for the ix86 builtin function types. */ +static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1]; + +/* Retrieve an element from the above table, building some of + the types lazily. */ + +static tree +ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode) +{ + tree type; + + gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab)); + + type = ix86_builtin_func_type_tab[(int) tcode]; + if (type != NULL) + return type; + + if (tcode <= IX86_BT_LAST_FUNC) + { + unsigned start = ix86_builtin_func_start[(int) tcode]; + unsigned after = ix86_builtin_func_start[(int) tcode + 1]; + tree rtype, atype, args = void_list_node; + unsigned i; + + rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]); + for (i = after - 1; i > start; --i) + { + atype = ix86_get_builtin_type (ix86_builtin_func_args[i]); + args = tree_cons (NULL, atype, args); + } + + type = build_function_type (rtype, args); + } + else + { + unsigned index = tcode - IX86_BT_LAST_FUNC - 1; + enum ix86_builtin_func_type icode; + + icode = ix86_builtin_func_alias_base[index]; + type = ix86_get_builtin_func_type (icode); + } + + ix86_builtin_func_type_tab[(int) tcode] = type; + return type; +} + +/* Table for the ix86 builtin decls. */ +static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX]; + +struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX]; + +tree get_ix86_builtin (enum ix86_builtins c) +{ + return ix86_builtins[c]; +} + +/* Bits that can still enable any inclusion of a builtin. */ +HOST_WIDE_INT deferred_isa_values = 0; +HOST_WIDE_INT deferred_isa_values2 = 0; + +/* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the + MASK and MASK2 of which isa_flags and ix86_isa_flags2 to use in the + ix86_builtins_isa array. Stores the function decl in the ix86_builtins + array. Returns the function decl or NULL_TREE, if the builtin was not + added. + + If the front end has a special hook for builtin functions, delay adding + builtin functions that aren't in the current ISA until the ISA is changed + with function specific optimization. Doing so, can save about 300K for the + default compiler. When the builtin is expanded, check at that time whether + it is valid. + + If the front end doesn't have a special hook, record all builtins, even if + it isn't an instruction set in the current ISA in case the user uses + function specific options for a different ISA, so that we don't get scope + errors if a builtin is added in the middle of a function scope. 
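   As a user-level sketch of the scenario this supports (assuming the usual
   <immintrin.h> intrinsics, which expand to these builtins): a translation
   unit compiled without -mavx2 can still use an AVX2 intrinsic inside a
   function that enables the ISA through the target attribute, because the
   deferred builtin is materialized once the attribute switches the ISA:

     #include <immintrin.h>

     __attribute__ ((target ("avx2")))
     __m256i
     add_epi32 (__m256i a, __m256i b)
     {
       return _mm256_add_epi32 (a, b);
     }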
*/ + +static inline tree +def_builtin (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, + const char *name, + enum ix86_builtin_func_type tcode, + enum ix86_builtins code) +{ + tree decl = NULL_TREE; + + /* An instruction may be 64bit only regardless of ISAs. */ + if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT) + { + ix86_builtins_isa[(int) code].isa = mask; + ix86_builtins_isa[(int) code].isa2 = mask2; + + mask &= ~OPTION_MASK_ISA_64BIT; + + /* Filter out the masks most often ored together with others. */ + if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL) + && mask != OPTION_MASK_ISA_AVX512VL) + mask &= ~OPTION_MASK_ISA_AVX512VL; + if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512BW) + && mask != OPTION_MASK_ISA_AVX512BW) + mask &= ~OPTION_MASK_ISA_AVX512BW; + + if (((mask2 == 0 || (mask2 & ix86_isa_flags2) != 0) + && (mask == 0 || (mask & ix86_isa_flags) != 0)) + || (lang_hooks.builtin_function + == lang_hooks.builtin_function_ext_scope)) + { + tree type = ix86_get_builtin_func_type (tcode); + decl = add_builtin_function (name, type, code, BUILT_IN_MD, + NULL, NULL_TREE); + ix86_builtins[(int) code] = decl; + ix86_builtins_isa[(int) code].set_and_not_built_p = false; + } + else + { + /* Just MASK and MASK2 where set_and_not_built_p == true can potentially + include a builtin. */ + deferred_isa_values |= mask; + deferred_isa_values2 |= mask2; + ix86_builtins[(int) code] = NULL_TREE; + ix86_builtins_isa[(int) code].tcode = tcode; + ix86_builtins_isa[(int) code].name = name; + ix86_builtins_isa[(int) code].const_p = false; + ix86_builtins_isa[(int) code].pure_p = false; + ix86_builtins_isa[(int) code].set_and_not_built_p = true; + } + } + + return decl; +} + +/* Like def_builtin, but also marks the function decl "const". */ + +static inline tree +def_builtin_const (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, const char *name, + enum ix86_builtin_func_type tcode, enum ix86_builtins code) +{ + tree decl = def_builtin (mask, mask2, name, tcode, code); + if (decl) + TREE_READONLY (decl) = 1; + else + ix86_builtins_isa[(int) code].const_p = true; + + return decl; +} + +/* Like def_builtin, but also marks the function decl "pure". */ + +static inline tree +def_builtin_pure (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, const char *name, + enum ix86_builtin_func_type tcode, enum ix86_builtins code) +{ + tree decl = def_builtin (mask, mask2, name, tcode, code); + if (decl) + DECL_PURE_P (decl) = 1; + else + ix86_builtins_isa[(int) code].pure_p = true; + + return decl; +} + +/* Add any new builtin functions for a given ISA that may not have been + declared. This saves a bit of space compared to adding all of the + declarations to the tree, even if we didn't use them. */ + +void +ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2) +{ + isa &= ~OPTION_MASK_ISA_64BIT; + + if ((isa & deferred_isa_values) == 0 + && (isa2 & deferred_isa_values2) == 0) + return; + + /* Bits in ISA value can be removed from potential isa values. */ + deferred_isa_values &= ~isa; + deferred_isa_values2 &= ~isa2; + + int i; + tree saved_current_target_pragma = current_target_pragma; + current_target_pragma = NULL_TREE; + + for (i = 0; i < (int)IX86_BUILTIN_MAX; i++) + { + if (((ix86_builtins_isa[i].isa & isa) != 0 + || (ix86_builtins_isa[i].isa2 & isa2) != 0) + && ix86_builtins_isa[i].set_and_not_built_p) + { + tree decl, type; + + /* Don't define the builtin again. 
*/ + ix86_builtins_isa[i].set_and_not_built_p = false; + + type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode); + decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name, + type, i, BUILT_IN_MD, NULL, + NULL_TREE); + + ix86_builtins[i] = decl; + if (ix86_builtins_isa[i].const_p) + TREE_READONLY (decl) = 1; + } + } + + current_target_pragma = saved_current_target_pragma; +} + +/* TM vector builtins. */ + +/* Reuse the existing x86-specific `struct builtin_description' cause + we're lazy. Add casts to make them fit. */ +static const struct builtin_description bdesc_tm[] = +{ + { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, + { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, + { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, + { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, + { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, + { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, + { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, + + { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, + { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, + { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, + { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, + { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, + { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, + { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, + + { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, + { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, + { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, + { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, + { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, + { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) 
BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, + { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, + + { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID }, + { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID }, + { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID }, +}; + +/* Initialize the transactional memory vector load/store builtins. */ + +static void +ix86_init_tm_builtins (void) +{ + enum ix86_builtin_func_type ftype; + const struct builtin_description *d; + size_t i; + tree decl; + tree attrs_load, attrs_type_load, attrs_store, attrs_type_store; + tree attrs_log, attrs_type_log; + + if (!flag_tm) + return; + + /* If there are no builtins defined, we must be compiling in a + language without trans-mem support. */ + if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1)) + return; + + /* Use whatever attributes a normal TM load has. */ + decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1); + attrs_load = DECL_ATTRIBUTES (decl); + attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl)); + /* Use whatever attributes a normal TM store has. */ + decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1); + attrs_store = DECL_ATTRIBUTES (decl); + attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl)); + /* Use whatever attributes a normal TM log has. */ + decl = builtin_decl_explicit (BUILT_IN_TM_LOG); + attrs_log = DECL_ATTRIBUTES (decl); + attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl)); + + for (i = 0, d = bdesc_tm; + i < ARRAY_SIZE (bdesc_tm); + i++, d++) + { + if ((d->mask & ix86_isa_flags) != 0 + || (lang_hooks.builtin_function + == lang_hooks.builtin_function_ext_scope)) + { + tree type, attrs, attrs_type; + enum built_in_function code = (enum built_in_function) d->code; + + ftype = (enum ix86_builtin_func_type) d->flag; + type = ix86_get_builtin_func_type (ftype); + + if (BUILTIN_TM_LOAD_P (code)) + { + attrs = attrs_load; + attrs_type = attrs_type_load; + } + else if (BUILTIN_TM_STORE_P (code)) + { + attrs = attrs_store; + attrs_type = attrs_type_store; + } + else + { + attrs = attrs_log; + attrs_type = attrs_type_log; + } + decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL, + /* The builtin without the prefix for + calling it directly. */ + d->name + strlen ("__builtin_"), + attrs); + /* add_builtin_function() will set the DECL_ATTRIBUTES, now + set the TYPE_ATTRIBUTES. */ + decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN); + + set_builtin_decl (code, decl, false); + } + } +} + +/* Set up all the MMX/SSE builtins, even builtins for instructions that are not + in the current target ISA to allow the user to compile particular modules + with different target specific options that differ from the command line + options. */ +static void +ix86_init_mmx_sse_builtins (void) +{ + const struct builtin_description * d; + enum ix86_builtin_func_type ftype; + size_t i; + + /* Add all special builtins with variable number of operands. 
*/ + for (i = 0, d = bdesc_special_args; + i < ARRAY_SIZE (bdesc_special_args); + i++, d++) + { + BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i); + if (d->name == 0) + continue; + + ftype = (enum ix86_builtin_func_type) d->flag; + def_builtin (d->mask, d->mask2, d->name, ftype, d->code); + } + BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, + IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, + ARRAY_SIZE (bdesc_special_args) - 1); + + /* Add all builtins with variable number of operands. */ + for (i = 0, d = bdesc_args; + i < ARRAY_SIZE (bdesc_args); + i++, d++) + { + BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i); + if (d->name == 0) + continue; + + ftype = (enum ix86_builtin_func_type) d->flag; + def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); + } + BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST, + IX86_BUILTIN__BDESC_ARGS_FIRST, + ARRAY_SIZE (bdesc_args) - 1); + + /* Add all builtins with rounding. */ + for (i = 0, d = bdesc_round_args; + i < ARRAY_SIZE (bdesc_round_args); + i++, d++) + { + BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i); + if (d->name == 0) + continue; + + ftype = (enum ix86_builtin_func_type) d->flag; + def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); + } + BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, + IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, + ARRAY_SIZE (bdesc_round_args) - 1); + + /* pcmpestr[im] insns. */ + for (i = 0, d = bdesc_pcmpestr; + i < ARRAY_SIZE (bdesc_pcmpestr); + i++, d++) + { + BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i); + if (d->code == IX86_BUILTIN_PCMPESTRM128) + ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT; + else + ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT; + def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); + } + BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST, + IX86_BUILTIN__BDESC_PCMPESTR_FIRST, + ARRAY_SIZE (bdesc_pcmpestr) - 1); + + /* pcmpistr[im] insns. */ + for (i = 0, d = bdesc_pcmpistr; + i < ARRAY_SIZE (bdesc_pcmpistr); + i++, d++) + { + BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i); + if (d->code == IX86_BUILTIN_PCMPISTRM128) + ftype = V16QI_FTYPE_V16QI_V16QI_INT; + else + ftype = INT_FTYPE_V16QI_V16QI_INT; + def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); + } + BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST, + IX86_BUILTIN__BDESC_PCMPISTR_FIRST, + ARRAY_SIZE (bdesc_pcmpistr) - 1); + + /* comi/ucomi insns. */ + for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++) + { + BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i); + if (d->mask == OPTION_MASK_ISA_SSE2) + ftype = INT_FTYPE_V2DF_V2DF; + else + ftype = INT_FTYPE_V4SF_V4SF; + def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); + } + BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST, + IX86_BUILTIN__BDESC_COMI_FIRST, + ARRAY_SIZE (bdesc_comi) - 1); + + /* SSE */ + def_builtin (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_ldmxcsr", + VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR); + def_builtin_pure (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_stmxcsr", + UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR); + + /* SSE or 3DNow!A */ + def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A + /* As it uses V4HImode, we have to require -mmmx too. 
*/ + | OPTION_MASK_ISA_MMX, 0, + "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR, + IX86_BUILTIN_MASKMOVQ); + + /* SSE2 */ + def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_maskmovdqu", + VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU); + + def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_clflush", + VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH); + x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_mfence", + VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE); + + /* SSE3. */ + def_builtin (OPTION_MASK_ISA_SSE3, 0, "__builtin_ia32_monitor", + VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR); + def_builtin (OPTION_MASK_ISA_SSE3, 0, "__builtin_ia32_mwait", + VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT); + + /* AES */ + def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, + "__builtin_ia32_aesenc128", + V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128); + def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, + "__builtin_ia32_aesenclast128", + V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128); + def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, + "__builtin_ia32_aesdec128", + V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128); + def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, + "__builtin_ia32_aesdeclast128", + V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128); + def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, + "__builtin_ia32_aesimc128", + V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128); + def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, + "__builtin_ia32_aeskeygenassist128", + V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128); + + /* PCLMUL */ + def_builtin_const (OPTION_MASK_ISA_PCLMUL | OPTION_MASK_ISA_SSE2, 0, + "__builtin_ia32_pclmulqdq128", + V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128); + + /* RDRND */ + def_builtin (OPTION_MASK_ISA_RDRND, 0, "__builtin_ia32_rdrand16_step", + INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP); + def_builtin (OPTION_MASK_ISA_RDRND, 0, "__builtin_ia32_rdrand32_step", + INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP); + def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT, 0, + "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG, + IX86_BUILTIN_RDRAND64_STEP); + + /* AVX2 */ + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv2df", + V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT, + IX86_BUILTIN_GATHERSIV2DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4df", + V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT, + IX86_BUILTIN_GATHERSIV4DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv2df", + V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT, + IX86_BUILTIN_GATHERDIV2DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4df", + V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT, + IX86_BUILTIN_GATHERDIV4DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4sf", + V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT, + IX86_BUILTIN_GATHERSIV4SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv8sf", + V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT, + IX86_BUILTIN_GATHERSIV8SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4sf", + V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT, + IX86_BUILTIN_GATHERDIV4SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4sf256", + V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT, + IX86_BUILTIN_GATHERDIV8SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv2di", 
+ V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT, + IX86_BUILTIN_GATHERSIV2DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4di", + V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT, + IX86_BUILTIN_GATHERSIV4DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv2di", + V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT, + IX86_BUILTIN_GATHERDIV2DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4di", + V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT, + IX86_BUILTIN_GATHERDIV4DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4si", + V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT, + IX86_BUILTIN_GATHERSIV4SI); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv8si", + V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT, + IX86_BUILTIN_GATHERSIV8SI); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4si", + V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT, + IX86_BUILTIN_GATHERDIV4SI); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4si256", + V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT, + IX86_BUILTIN_GATHERDIV8SI); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltsiv4df ", + V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT, + IX86_BUILTIN_GATHERALTSIV4DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltdiv8sf ", + V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT, + IX86_BUILTIN_GATHERALTDIV8SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltsiv4di ", + V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT, + IX86_BUILTIN_GATHERALTSIV4DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltdiv8si ", + V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT, + IX86_BUILTIN_GATHERALTDIV8SI); + + /* AVX512F */ + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv16sf", + V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT, + IX86_BUILTIN_GATHER3SIV16SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv8df", + V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT, + IX86_BUILTIN_GATHER3SIV8DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv16sf", + V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT, + IX86_BUILTIN_GATHER3DIV16SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv8df", + V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT, + IX86_BUILTIN_GATHER3DIV8DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv16si", + V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT, + IX86_BUILTIN_GATHER3SIV16SI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv8di", + V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT, + IX86_BUILTIN_GATHER3SIV8DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv16si", + V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT, + IX86_BUILTIN_GATHER3DIV16SI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv8di", + V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT, + IX86_BUILTIN_GATHER3DIV8DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altsiv8df ", + V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT, + IX86_BUILTIN_GATHER3ALTSIV8DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altdiv16sf ", + V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT, + IX86_BUILTIN_GATHER3ALTDIV16SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altsiv8di ", + V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT, + IX86_BUILTIN_GATHER3ALTSIV8DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altdiv16si ", + 
V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT, + IX86_BUILTIN_GATHER3ALTDIV16SI); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv16sf", + VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT, + IX86_BUILTIN_SCATTERSIV16SF); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv8df", + VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT, + IX86_BUILTIN_SCATTERSIV8DF); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv16sf", + VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT, + IX86_BUILTIN_SCATTERDIV16SF); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv8df", + VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT, + IX86_BUILTIN_SCATTERDIV8DF); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv16si", + VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT, + IX86_BUILTIN_SCATTERSIV16SI); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv8di", + VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT, + IX86_BUILTIN_SCATTERSIV8DI); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv16si", + VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT, + IX86_BUILTIN_SCATTERDIV16SI); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv8di", + VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT, + IX86_BUILTIN_SCATTERDIV8DI); + + /* AVX512VL */ + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv2df", + V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT, + IX86_BUILTIN_GATHER3SIV2DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4df", + V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT, + IX86_BUILTIN_GATHER3SIV4DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div2df", + V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT, + IX86_BUILTIN_GATHER3DIV2DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4df", + V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT, + IX86_BUILTIN_GATHER3DIV4DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4sf", + V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT, + IX86_BUILTIN_GATHER3SIV4SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv8sf", + V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT, + IX86_BUILTIN_GATHER3SIV8SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4sf", + V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT, + IX86_BUILTIN_GATHER3DIV4SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div8sf", + V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT, + IX86_BUILTIN_GATHER3DIV8SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv2di", + V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT, + IX86_BUILTIN_GATHER3SIV2DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4di", + V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT, + IX86_BUILTIN_GATHER3SIV4DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div2di", + V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT, + IX86_BUILTIN_GATHER3DIV2DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4di", + V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT, + IX86_BUILTIN_GATHER3DIV4DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4si", + V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT, + IX86_BUILTIN_GATHER3SIV4SI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv8si", + V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT, + IX86_BUILTIN_GATHER3SIV8SI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4si", + V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT, + IX86_BUILTIN_GATHER3DIV4SI); + + 
def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div8si", + V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT, + IX86_BUILTIN_GATHER3DIV8SI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altsiv4df ", + V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT, + IX86_BUILTIN_GATHER3ALTSIV4DF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altdiv8sf ", + V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT, + IX86_BUILTIN_GATHER3ALTDIV8SF); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altsiv4di ", + V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT, + IX86_BUILTIN_GATHER3ALTSIV4DI); + + def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altdiv8si ", + V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT, + IX86_BUILTIN_GATHER3ALTDIV8SI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv8sf", + VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT, + IX86_BUILTIN_SCATTERSIV8SF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4sf", + VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT, + IX86_BUILTIN_SCATTERSIV4SF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4df", + VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT, + IX86_BUILTIN_SCATTERSIV4DF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv2df", + VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT, + IX86_BUILTIN_SCATTERSIV2DF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv8sf", + VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT, + IX86_BUILTIN_SCATTERDIV8SF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4sf", + VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT, + IX86_BUILTIN_SCATTERDIV4SF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4df", + VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT, + IX86_BUILTIN_SCATTERDIV4DF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv2df", + VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT, + IX86_BUILTIN_SCATTERDIV2DF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv8si", + VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT, + IX86_BUILTIN_SCATTERSIV8SI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4si", + VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT, + IX86_BUILTIN_SCATTERSIV4SI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4di", + VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT, + IX86_BUILTIN_SCATTERSIV4DI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv2di", + VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT, + IX86_BUILTIN_SCATTERSIV2DI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv8si", + VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT, + IX86_BUILTIN_SCATTERDIV8SI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4si", + VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT, + IX86_BUILTIN_SCATTERDIV4SI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4di", + VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT, + IX86_BUILTIN_SCATTERDIV4DI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv2di", + VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT, + IX86_BUILTIN_SCATTERDIV2DI); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltsiv8df ", + VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT, + IX86_BUILTIN_SCATTERALTSIV8DF); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltdiv16sf ", + VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT, + IX86_BUILTIN_SCATTERALTDIV16SF); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltsiv8di ", + 
VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT, + IX86_BUILTIN_SCATTERALTSIV8DI); + + def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltdiv16si ", + VOID_FTYPE_PINT_HI_V8DI_V16SI_INT, + IX86_BUILTIN_SCATTERALTDIV16SI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv4df ", + VOID_FTYPE_PDOUBLE_QI_V8SI_V4DF_INT, + IX86_BUILTIN_SCATTERALTSIV4DF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv8sf ", + VOID_FTYPE_PFLOAT_QI_V4DI_V8SF_INT, + IX86_BUILTIN_SCATTERALTDIV8SF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv4di ", + VOID_FTYPE_PLONGLONG_QI_V8SI_V4DI_INT, + IX86_BUILTIN_SCATTERALTSIV4DI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv8si ", + VOID_FTYPE_PINT_QI_V4DI_V8SI_INT, + IX86_BUILTIN_SCATTERALTDIV8SI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv2df ", + VOID_FTYPE_PDOUBLE_QI_V4SI_V2DF_INT, + IX86_BUILTIN_SCATTERALTSIV2DF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv4sf ", + VOID_FTYPE_PFLOAT_QI_V2DI_V4SF_INT, + IX86_BUILTIN_SCATTERALTDIV4SF); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv2di ", + VOID_FTYPE_PLONGLONG_QI_V4SI_V2DI_INT, + IX86_BUILTIN_SCATTERALTSIV2DI); + + def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv4si ", + VOID_FTYPE_PINT_QI_V2DI_V4SI_INT, + IX86_BUILTIN_SCATTERALTDIV4SI); + + /* AVX512PF */ + def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfdpd", + VOID_FTYPE_QI_V8SI_PCVOID_INT_INT, + IX86_BUILTIN_GATHERPFDPD); + def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfdps", + VOID_FTYPE_HI_V16SI_PCVOID_INT_INT, + IX86_BUILTIN_GATHERPFDPS); + def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfqpd", + VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, + IX86_BUILTIN_GATHERPFQPD); + def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfqps", + VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, + IX86_BUILTIN_GATHERPFQPS); + def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfdpd", + VOID_FTYPE_QI_V8SI_PCVOID_INT_INT, + IX86_BUILTIN_SCATTERPFDPD); + def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfdps", + VOID_FTYPE_HI_V16SI_PCVOID_INT_INT, + IX86_BUILTIN_SCATTERPFDPS); + def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfqpd", + VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, + IX86_BUILTIN_SCATTERPFQPD); + def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfqps", + VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, + IX86_BUILTIN_SCATTERPFQPS); + + /* SHA */ + def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1msg1", + V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1); + def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1msg2", + V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2); + def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1nexte", + V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE); + def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1rnds4", + V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4); + def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256msg1", + V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1); + def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256msg2", + V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2); + def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256rnds2", + V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2); + + /* RTM. 
*/ + def_builtin (OPTION_MASK_ISA_RTM, 0, "__builtin_ia32_xabort", + VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT); + + /* MMX access to the vec_init patterns. */ + def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_init_v2si", + V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI); + + def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_init_v4hi", + V4HI_FTYPE_HI_HI_HI_HI, + IX86_BUILTIN_VEC_INIT_V4HI); + + def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_init_v8qi", + V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI, + IX86_BUILTIN_VEC_INIT_V8QI); + + /* Access to the vec_extract patterns. */ + def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v2df", + DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF); + def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v2di", + DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI); + def_builtin_const (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_vec_ext_v4sf", + FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF); + def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v4si", + SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI); + def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v8hi", + HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI); + + def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A + /* As it uses V4HImode, we have to require -mmmx too. */ + | OPTION_MASK_ISA_MMX, 0, + "__builtin_ia32_vec_ext_v4hi", + HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI); + + def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_ext_v2si", + SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI); + + def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v16qi", + QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI); + + /* Access to the vec_set patterns. */ + def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT, 0, + "__builtin_ia32_vec_set_v2di", + V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI); + + def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v4sf", + V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF); + + def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v4si", + V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI); + + def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_set_v8hi", + V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI); + + def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A + /* As it uses V4HImode, we have to require -mmmx too. 
*/ + | OPTION_MASK_ISA_MMX, 0, + "__builtin_ia32_vec_set_v4hi", + V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI); + + def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v16qi", + V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI); + + /* RDSEED */ + def_builtin (OPTION_MASK_ISA_RDSEED, 0, "__builtin_ia32_rdseed_hi_step", + INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP); + def_builtin (OPTION_MASK_ISA_RDSEED, 0, "__builtin_ia32_rdseed_si_step", + INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP); + def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT, 0, + "__builtin_ia32_rdseed_di_step", + INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP); + + /* ADCX */ + def_builtin (0, 0, "__builtin_ia32_addcarryx_u32", + UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32); + def_builtin (OPTION_MASK_ISA_64BIT, 0, + "__builtin_ia32_addcarryx_u64", + UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG, + IX86_BUILTIN_ADDCARRYX64); + + /* SBB */ + def_builtin (0, 0, "__builtin_ia32_sbb_u32", + UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32); + def_builtin (OPTION_MASK_ISA_64BIT, 0, + "__builtin_ia32_sbb_u64", + UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG, + IX86_BUILTIN_SBB64); + + /* Read/write FLAGS. */ + if (TARGET_64BIT) + { + def_builtin (OPTION_MASK_ISA_64BIT, 0, "__builtin_ia32_readeflags_u64", + UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS); + def_builtin (OPTION_MASK_ISA_64BIT, 0, "__builtin_ia32_writeeflags_u64", + VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS); + } + else + { + def_builtin (0, 0, "__builtin_ia32_readeflags_u32", + UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS); + def_builtin (0, 0, "__builtin_ia32_writeeflags_u32", + VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS); + } + + /* CLFLUSHOPT. */ + def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, 0, "__builtin_ia32_clflushopt", + VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT); + + /* CLWB. */ + def_builtin (OPTION_MASK_ISA_CLWB, 0, "__builtin_ia32_clwb", + VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB); + + /* MONITORX and MWAITX. */ + def_builtin (0, OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx", + VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX); + def_builtin (0, OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx", + VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX); + + /* CLZERO. */ + def_builtin (0, OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero", + VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO); + + /* WAITPKG. */ + def_builtin (0, OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_umonitor", + VOID_FTYPE_PVOID, IX86_BUILTIN_UMONITOR); + def_builtin (0, OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_umwait", + UINT8_FTYPE_UNSIGNED_UINT64, IX86_BUILTIN_UMWAIT); + def_builtin (0, OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_tpause", + UINT8_FTYPE_UNSIGNED_UINT64, IX86_BUILTIN_TPAUSE); + + /* CLDEMOTE. */ + def_builtin (0, OPTION_MASK_ISA_CLDEMOTE, "__builtin_ia32_cldemote", + VOID_FTYPE_PCVOID, IX86_BUILTIN_CLDEMOTE); + + /* Add FMA4 multi-arg argument instructions */ + for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++) + { + BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i); + if (d->name == 0) + continue; + + ftype = (enum ix86_builtin_func_type) d->flag; + def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); + } + BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST, + IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, + ARRAY_SIZE (bdesc_multi_arg) - 1); + + /* Add CET inrinsics. 
*/ + for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++) + { + BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i); + if (d->name == 0) + continue; + + ftype = (enum ix86_builtin_func_type) d->flag; + def_builtin (d->mask, d->mask2, d->name, ftype, d->code); + } + BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST, + IX86_BUILTIN__BDESC_CET_FIRST, + ARRAY_SIZE (bdesc_cet) - 1); + + for (i = 0, d = bdesc_cet_rdssp; + i < ARRAY_SIZE (bdesc_cet_rdssp); + i++, d++) + { + BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i); + if (d->name == 0) + continue; + + ftype = (enum ix86_builtin_func_type) d->flag; + def_builtin (d->mask, d->mask2, d->name, ftype, d->code); + } + BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST, + IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, + ARRAY_SIZE (bdesc_cet_rdssp) - 1); +} + +#undef BDESC_VERIFY +#undef BDESC_VERIFYS + +/* Make builtins to detect cpu type and features supported. NAME is + the builtin name, CODE is the builtin code, and FTYPE is the function + type of the builtin. */ + +static void +make_cpu_type_builtin (const char* name, int code, + enum ix86_builtin_func_type ftype, bool is_const) +{ + tree decl; + tree type; + + type = ix86_get_builtin_func_type (ftype); + decl = add_builtin_function (name, type, code, BUILT_IN_MD, + NULL, NULL_TREE); + gcc_assert (decl != NULL_TREE); + ix86_builtins[(int) code] = decl; + TREE_READONLY (decl) = is_const; +} + +/* Make builtins to get CPU type and features supported. The created + builtins are : + + __builtin_cpu_init (), to detect cpu type and features, + __builtin_cpu_is (""), to check if cpu is of type , + __builtin_cpu_supports (""), to check if cpu supports + */ + +static void +ix86_init_platform_type_builtins (void) +{ + make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT, + INT_FTYPE_VOID, false); + make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS, + INT_FTYPE_PCCHAR, true); + make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS, + INT_FTYPE_PCCHAR, true); +} + +/* Internal method for ix86_init_builtins. 
*/ + +static void +ix86_init_builtins_va_builtins_abi (void) +{ + tree ms_va_ref, sysv_va_ref; + tree fnvoid_va_end_ms, fnvoid_va_end_sysv; + tree fnvoid_va_start_ms, fnvoid_va_start_sysv; + tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv; + tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE; + + if (!TARGET_64BIT) + return; + fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE); + fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE); + ms_va_ref = build_reference_type (ms_va_list_type_node); + sysv_va_ref = build_pointer_type (TREE_TYPE (sysv_va_list_type_node)); + + fnvoid_va_end_ms = build_function_type_list (void_type_node, ms_va_ref, + NULL_TREE); + fnvoid_va_start_ms + = build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE); + fnvoid_va_end_sysv + = build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE); + fnvoid_va_start_sysv + = build_varargs_function_type_list (void_type_node, sysv_va_ref, + NULL_TREE); + fnvoid_va_copy_ms + = build_function_type_list (void_type_node, ms_va_ref, + ms_va_list_type_node, NULL_TREE); + fnvoid_va_copy_sysv + = build_function_type_list (void_type_node, sysv_va_ref, + sysv_va_ref, NULL_TREE); + + add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms, + BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms); + add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms, + BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms); + add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms, + BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms); + add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv, + BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv); + add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv, + BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv); + add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv, + BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv); +} + +static void +ix86_init_builtin_types (void) +{ + tree float80_type_node, const_string_type_node; + + /* The __float80 type. */ + float80_type_node = long_double_type_node; + if (TYPE_MODE (float80_type_node) != XFmode) + { + if (float64x_type_node != NULL_TREE + && TYPE_MODE (float64x_type_node) == XFmode) + float80_type_node = float64x_type_node; + else + { + /* The __float80 type. */ + float80_type_node = make_node (REAL_TYPE); + + TYPE_PRECISION (float80_type_node) = 80; + layout_type (float80_type_node); + } + } + lang_hooks.types.register_builtin_type (float80_type_node, "__float80"); + + /* The __float128 type. The node has already been created as + _Float128, so we only need to register the __float128 name for + it. */ + lang_hooks.types.register_builtin_type (float128_type_node, "__float128"); + + const_string_type_node + = build_pointer_type (build_qualified_type + (char_type_node, TYPE_QUAL_CONST)); + + /* This macro is built by i386-builtin-types.awk. */ + DEFINE_BUILTIN_PRIMITIVE_TYPES; +} + +void +ix86_init_builtins (void) +{ + tree ftype, decl; + + ix86_init_builtin_types (); + + /* Builtins to get CPU type and features. */ + ix86_init_platform_type_builtins (); + + /* TFmode support builtins. 
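   These operate on the __float128 (TFmode) type registered just above.  A
   user-level sketch, assuming the q/Q constant suffix that GCC documents
   for this type:

     __float128 one = 1.0q;
     __float128 inf = __builtin_infq ();
     __float128 mag = __builtin_fabsq (-one);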
*/ + def_builtin_const (0, 0, "__builtin_infq", + FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ); + def_builtin_const (0, 0, "__builtin_huge_valq", + FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ); + + ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING); + decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ, + BUILT_IN_MD, "nanq", NULL_TREE); + TREE_READONLY (decl) = 1; + ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl; + + decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ, + BUILT_IN_MD, "nansq", NULL_TREE); + TREE_READONLY (decl) = 1; + ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl; + + /* We will expand them to normal call if SSE isn't available since + they are used by libgcc. */ + ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128); + decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ, + BUILT_IN_MD, "__fabstf2", NULL_TREE); + TREE_READONLY (decl) = 1; + ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl; + + ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128); + decl = add_builtin_function ("__builtin_copysignq", ftype, + IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD, + "__copysigntf3", NULL_TREE); + TREE_READONLY (decl) = 1; + ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl; + + ix86_init_tm_builtins (); + ix86_init_mmx_sse_builtins (); + + if (TARGET_LP64) + ix86_init_builtins_va_builtins_abi (); + +#ifdef SUBTARGET_INIT_BUILTINS + SUBTARGET_INIT_BUILTINS; +#endif +} + +/* Return the ix86 builtin for CODE. */ + +tree +ix86_builtin_decl (unsigned code, bool) +{ + if (code >= IX86_BUILTIN_MAX) + return error_mark_node; + + return ix86_builtins[code]; +} + +/* This returns the target-specific builtin with code CODE if + current_function_decl has visibility on this builtin, which is checked + using isa flags. Returns NULL_TREE otherwise. */ + +static tree ix86_get_builtin (enum ix86_builtins code) +{ + struct cl_target_option *opts; + tree target_tree = NULL_TREE; + + /* Determine the isa flags of current_function_decl. */ + + if (current_function_decl) + target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl); + + if (target_tree == NULL) + target_tree = target_option_default_node; + + opts = TREE_TARGET_OPTION (target_tree); + + if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags) + || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2)) + return ix86_builtin_decl (code, true); + else + return NULL_TREE; +} + +/* Vectorization library interface and handlers. */ +tree (*ix86_veclib_handler) (combined_fn, tree, tree); + +/* Returns a function decl for a vectorized version of the combined function + with combined_fn code FN and the result vector type TYPE, or NULL_TREE + if it is not available. */ + +tree +ix86_builtin_vectorized_function (unsigned int fn, tree type_out, + tree type_in) +{ + machine_mode in_mode, out_mode; + int in_n, out_n; + + if (TREE_CODE (type_out) != VECTOR_TYPE + || TREE_CODE (type_in) != VECTOR_TYPE) + return NULL_TREE; + + out_mode = TYPE_MODE (TREE_TYPE (type_out)); + out_n = TYPE_VECTOR_SUBPARTS (type_out); + in_mode = TYPE_MODE (TREE_TYPE (type_in)); + in_n = TYPE_VECTOR_SUBPARTS (type_in); + + switch (fn) + { + CASE_CFN_EXP2: + if (out_mode == SFmode && in_mode == SFmode) + { + if (out_n == 16 && in_n == 16) + return ix86_get_builtin (IX86_BUILTIN_EXP2PS); + } + break; + + CASE_CFN_IFLOOR: + CASE_CFN_LFLOOR: + CASE_CFN_LLFLOOR: + /* The round insn does not trap on denormals. 
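   A sketch of what this mapping enables (assuming something like
   -O3 -msse4.1 -ffast-math, since flag_trapping_math or a missing
   TARGET_SSE4_1 makes the checks below bail out): a loop such as

     void
     vec_ifloor (int *r, const double *x, int n)
     {
       for (int i = 0; i < n; i++)
         r[i] = (int) __builtin_floor (x[i]);
     }

   may then be vectorized by resolving the ifloor combined function to one
   of the IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX* builtins returned here.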
*/ + if (flag_trapping_math || !TARGET_SSE4_1) + break; + + if (out_mode == SImode && in_mode == DFmode) + { + if (out_n == 4 && in_n == 2) + return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX); + else if (out_n == 8 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256); + else if (out_n == 16 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512); + } + if (out_mode == SImode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256); + else if (out_n == 16 && in_n == 16) + return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512); + } + break; + + CASE_CFN_ICEIL: + CASE_CFN_LCEIL: + CASE_CFN_LLCEIL: + /* The round insn does not trap on denormals. */ + if (flag_trapping_math || !TARGET_SSE4_1) + break; + + if (out_mode == SImode && in_mode == DFmode) + { + if (out_n == 4 && in_n == 2) + return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX); + else if (out_n == 8 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256); + else if (out_n == 16 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512); + } + if (out_mode == SImode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256); + else if (out_n == 16 && in_n == 16) + return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512); + } + break; + + CASE_CFN_IRINT: + CASE_CFN_LRINT: + CASE_CFN_LLRINT: + if (out_mode == SImode && in_mode == DFmode) + { + if (out_n == 4 && in_n == 2) + return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX); + else if (out_n == 8 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256); + else if (out_n == 16 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512); + } + if (out_mode == SImode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256); + else if (out_n == 16 && in_n == 16) + return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512); + } + break; + + CASE_CFN_IROUND: + CASE_CFN_LROUND: + CASE_CFN_LLROUND: + /* The round insn does not trap on denormals. */ + if (flag_trapping_math || !TARGET_SSE4_1) + break; + + if (out_mode == SImode && in_mode == DFmode) + { + if (out_n == 4 && in_n == 2) + return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX); + else if (out_n == 8 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256); + else if (out_n == 16 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512); + } + if (out_mode == SImode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256); + else if (out_n == 16 && in_n == 16) + return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512); + } + break; + + CASE_CFN_FLOOR: + /* The round insn does not trap on denormals. 
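So the only gate between a scalar floor () loop and the packed round path is the trapping-math default plus SSE4.1. A sketch, with the option set purely illustrative and the function name made up:

#include <math.h>

// Compiled with something like -O3 -msse4.1 -fno-trapping-math, the
// vectorizer can be handed one of the FLOORPD/FLOORPS builtins here; with
// the default -ftrapping-math the hook gives up and the calls stay scalar.
void
floor_all (double *dst, const double *src, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = floor (src[i]);
}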
*/ + if (flag_trapping_math || !TARGET_SSE4_1) + break; + + if (out_mode == DFmode && in_mode == DFmode) + { + if (out_n == 2 && in_n == 2) + return ix86_get_builtin (IX86_BUILTIN_FLOORPD); + else if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_FLOORPD256); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_FLOORPD512); + } + if (out_mode == SFmode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_FLOORPS); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_FLOORPS256); + else if (out_n == 16 && in_n == 16) + return ix86_get_builtin (IX86_BUILTIN_FLOORPS512); + } + break; + + CASE_CFN_CEIL: + /* The round insn does not trap on denormals. */ + if (flag_trapping_math || !TARGET_SSE4_1) + break; + + if (out_mode == DFmode && in_mode == DFmode) + { + if (out_n == 2 && in_n == 2) + return ix86_get_builtin (IX86_BUILTIN_CEILPD); + else if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_CEILPD256); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_CEILPD512); + } + if (out_mode == SFmode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_CEILPS); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_CEILPS256); + else if (out_n == 16 && in_n == 16) + return ix86_get_builtin (IX86_BUILTIN_CEILPS512); + } + break; + + CASE_CFN_TRUNC: + /* The round insn does not trap on denormals. */ + if (flag_trapping_math || !TARGET_SSE4_1) + break; + + if (out_mode == DFmode && in_mode == DFmode) + { + if (out_n == 2 && in_n == 2) + return ix86_get_builtin (IX86_BUILTIN_TRUNCPD); + else if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512); + } + if (out_mode == SFmode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_TRUNCPS); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256); + else if (out_n == 16 && in_n == 16) + return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512); + } + break; + + CASE_CFN_RINT: + /* The round insn does not trap on denormals. */ + if (flag_trapping_math || !TARGET_SSE4_1) + break; + + if (out_mode == DFmode && in_mode == DFmode) + { + if (out_n == 2 && in_n == 2) + return ix86_get_builtin (IX86_BUILTIN_RINTPD); + else if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_RINTPD256); + } + if (out_mode == SFmode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_RINTPS); + else if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_RINTPS256); + } + break; + + CASE_CFN_FMA: + if (out_mode == DFmode && in_mode == DFmode) + { + if (out_n == 2 && in_n == 2) + return ix86_get_builtin (IX86_BUILTIN_VFMADDPD); + if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256); + } + if (out_mode == SFmode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_get_builtin (IX86_BUILTIN_VFMADDPS); + if (out_n == 8 && in_n == 8) + return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256); + } + break; + + default: + break; + } + + /* Dispatch to a handler for a vectorization library. 
*/ + if (ix86_veclib_handler) + return ix86_veclib_handler (combined_fn (fn), type_out, type_in); + + return NULL_TREE; +} + +/* Returns a decl of a function that implements gather load with + memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE. + Return NULL_TREE if it is not available. */ + +tree +ix86_vectorize_builtin_gather (const_tree mem_vectype, + const_tree index_type, int scale) +{ + bool si; + enum ix86_builtins code; + + if (! TARGET_AVX2 || !TARGET_USE_GATHER) + return NULL_TREE; + + if ((TREE_CODE (index_type) != INTEGER_TYPE + && !POINTER_TYPE_P (index_type)) + || (TYPE_MODE (index_type) != SImode + && TYPE_MODE (index_type) != DImode)) + return NULL_TREE; + + if (TYPE_PRECISION (index_type) > POINTER_SIZE) + return NULL_TREE; + + /* v*gather* insn sign extends index to pointer mode. */ + if (TYPE_PRECISION (index_type) < POINTER_SIZE + && TYPE_UNSIGNED (index_type)) + return NULL_TREE; + + if (scale <= 0 + || scale > 8 + || (scale & (scale - 1)) != 0) + return NULL_TREE; + + si = TYPE_MODE (index_type) == SImode; + switch (TYPE_MODE (mem_vectype)) + { + case E_V2DFmode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF; + else + code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF; + break; + case E_V4DFmode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF; + else + code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF; + break; + case E_V2DImode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI; + else + code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI; + break; + case E_V4DImode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI; + else + code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI; + break; + case E_V4SFmode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF; + else + code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF; + break; + case E_V8SFmode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF; + else + code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF; + break; + case E_V4SImode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI; + else + code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI; + break; + case E_V8SImode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI; + else + code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI; + break; + case E_V8DFmode: + if (TARGET_AVX512F) + code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF; + else + return NULL_TREE; + break; + case E_V8DImode: + if (TARGET_AVX512F) + code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI; + else + return NULL_TREE; + break; + case E_V16SFmode: + if (TARGET_AVX512F) + code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF; + else + return NULL_TREE; + break; + case E_V16SImode: + if (TARGET_AVX512F) + code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI; + else + return NULL_TREE; + break; + default: + return NULL_TREE; + } + + return ix86_get_builtin (code); +} + +/* Returns a code for a target-specific builtin that implements + reciprocal of the function, or NULL_TREE if not available. 
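This is the hook behind -mrecip; a hedged sketch of the kind of loop it can affect (the flags and the function name are examples, not requirements of this patch):

#include <math.h>

// With something like -O3 -ffast-math -mrecip, the vectorized sqrtf calls
// may be rewritten to RSQRTPS plus a Newton-Raphson refinement, using the
// SQRTPS_NR to RSQRTPS_NR mapping this hook provides just below.
void
root_all (float *dst, const float *src, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = sqrtf (src[i]);
}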
*/ + +tree +ix86_builtin_reciprocal (tree fndecl) +{ + enum ix86_builtins fn_code + = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl); + switch (fn_code) + { + /* Vectorized version of sqrt to rsqrt conversion. */ + case IX86_BUILTIN_SQRTPS_NR: + return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR); + + case IX86_BUILTIN_SQRTPS_NR256: + return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256); + + default: + return NULL_TREE; + } +} + +/* Priority of i386 features, greater value is higher priority. This is + used to decide the order in which function dispatch must happen. For + instance, a version specialized for SSE4.2 should be checked for dispatch + before a version for SSE3, as SSE4.2 implies SSE3. */ +enum feature_priority +{ + P_ZERO = 0, + P_MMX, + P_SSE, + P_SSE2, + P_SSE3, + P_SSSE3, + P_PROC_SSSE3, + P_SSE4_A, + P_PROC_SSE4_A, + P_SSE4_1, + P_SSE4_2, + P_PROC_SSE4_2, + P_POPCNT, + P_AES, + P_PCLMUL, + P_AVX, + P_PROC_AVX, + P_BMI, + P_PROC_BMI, + P_FMA4, + P_XOP, + P_PROC_XOP, + P_FMA, + P_PROC_FMA, + P_BMI2, + P_AVX2, + P_PROC_AVX2, + P_AVX512F, + P_PROC_AVX512F +}; + +/* This is the order of bit-fields in __processor_features in cpuinfo.c */ +enum processor_features +{ + F_CMOV = 0, + F_MMX, + F_POPCNT, + F_SSE, + F_SSE2, + F_SSE3, + F_SSSE3, + F_SSE4_1, + F_SSE4_2, + F_AVX, + F_AVX2, + F_SSE4_A, + F_FMA4, + F_XOP, + F_FMA, + F_AVX512F, + F_BMI, + F_BMI2, + F_AES, + F_PCLMUL, + F_AVX512VL, + F_AVX512BW, + F_AVX512DQ, + F_AVX512CD, + F_AVX512ER, + F_AVX512PF, + F_AVX512VBMI, + F_AVX512IFMA, + F_AVX5124VNNIW, + F_AVX5124FMAPS, + F_AVX512VPOPCNTDQ, + F_AVX512VBMI2, + F_GFNI, + F_VPCLMULQDQ, + F_AVX512VNNI, + F_AVX512BITALG, + F_MAX +}; + +/* These are the values for vendor types and cpu types and subtypes + in cpuinfo.c. Cpu types and subtypes should be subtracted by + the corresponding start value. 
*/ +enum processor_model +{ + M_INTEL = 1, + M_AMD, + M_CPU_TYPE_START, + M_INTEL_BONNELL, + M_INTEL_CORE2, + M_INTEL_COREI7, + M_AMDFAM10H, + M_AMDFAM15H, + M_INTEL_SILVERMONT, + M_INTEL_KNL, + M_AMD_BTVER1, + M_AMD_BTVER2, + M_AMDFAM17H, + M_INTEL_KNM, + M_INTEL_GOLDMONT, + M_INTEL_GOLDMONT_PLUS, + M_INTEL_TREMONT, + M_CPU_SUBTYPE_START, + M_INTEL_COREI7_NEHALEM, + M_INTEL_COREI7_WESTMERE, + M_INTEL_COREI7_SANDYBRIDGE, + M_AMDFAM10H_BARCELONA, + M_AMDFAM10H_SHANGHAI, + M_AMDFAM10H_ISTANBUL, + M_AMDFAM15H_BDVER1, + M_AMDFAM15H_BDVER2, + M_AMDFAM15H_BDVER3, + M_AMDFAM15H_BDVER4, + M_AMDFAM17H_ZNVER1, + M_INTEL_COREI7_IVYBRIDGE, + M_INTEL_COREI7_HASWELL, + M_INTEL_COREI7_BROADWELL, + M_INTEL_COREI7_SKYLAKE, + M_INTEL_COREI7_SKYLAKE_AVX512, + M_INTEL_COREI7_CANNONLAKE, + M_INTEL_COREI7_ICELAKE_CLIENT, + M_INTEL_COREI7_ICELAKE_SERVER, + M_AMDFAM17H_ZNVER2, + M_INTEL_COREI7_CASCADELAKE +}; + +struct _arch_names_table +{ + const char *const name; + const enum processor_model model; +}; + +static const _arch_names_table arch_names_table[] = +{ + {"amd", M_AMD}, + {"intel", M_INTEL}, + {"atom", M_INTEL_BONNELL}, + {"slm", M_INTEL_SILVERMONT}, + {"core2", M_INTEL_CORE2}, + {"corei7", M_INTEL_COREI7}, + {"nehalem", M_INTEL_COREI7_NEHALEM}, + {"westmere", M_INTEL_COREI7_WESTMERE}, + {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE}, + {"ivybridge", M_INTEL_COREI7_IVYBRIDGE}, + {"haswell", M_INTEL_COREI7_HASWELL}, + {"broadwell", M_INTEL_COREI7_BROADWELL}, + {"skylake", M_INTEL_COREI7_SKYLAKE}, + {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512}, + {"cannonlake", M_INTEL_COREI7_CANNONLAKE}, + {"icelake-client", M_INTEL_COREI7_ICELAKE_CLIENT}, + {"icelake-server", M_INTEL_COREI7_ICELAKE_SERVER}, + {"cascadelake", M_INTEL_COREI7_CASCADELAKE}, + {"bonnell", M_INTEL_BONNELL}, + {"silvermont", M_INTEL_SILVERMONT}, + {"goldmont", M_INTEL_GOLDMONT}, + {"goldmont-plus", M_INTEL_GOLDMONT_PLUS}, + {"tremont", M_INTEL_TREMONT}, + {"knl", M_INTEL_KNL}, + {"knm", M_INTEL_KNM}, + {"amdfam10h", M_AMDFAM10H}, + {"barcelona", M_AMDFAM10H_BARCELONA}, + {"shanghai", M_AMDFAM10H_SHANGHAI}, + {"istanbul", M_AMDFAM10H_ISTANBUL}, + {"btver1", M_AMD_BTVER1}, + {"amdfam15h", M_AMDFAM15H}, + {"bdver1", M_AMDFAM15H_BDVER1}, + {"bdver2", M_AMDFAM15H_BDVER2}, + {"bdver3", M_AMDFAM15H_BDVER3}, + {"bdver4", M_AMDFAM15H_BDVER4}, + {"btver2", M_AMD_BTVER2}, + {"amdfam17h", M_AMDFAM17H}, + {"znver1", M_AMDFAM17H_ZNVER1}, + {"znver2", M_AMDFAM17H_ZNVER2}, +}; + +/* These are the target attribute strings for which a dispatcher is + available, from fold_builtin_cpu. 
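They are the same strings a user hands to __builtin_cpu_supports or lists in target_clones; a small usage sketch (nothing below is added by this patch, and the function names are invented):

// Dispatch on a feature bit; the string must appear in isa_names_table.
int
use_avx2_path (void)
{
  return __builtin_cpu_supports ("avx2");
}

// target_clones builds one body per listed string plus an automatically
// generated resolver that relies on the same kind of priorities computed
// by get_builtin_code_for_version below.
__attribute__ ((target_clones ("default", "sse4.2", "avx2")))
int
dot (const int *a, const int *b, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    s += a[i] * b[i];
  return s;
}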
*/ +struct _isa_names_table +{ + const char *const name; + const enum processor_features feature; + const enum feature_priority priority; +}; + +static const _isa_names_table isa_names_table[] = +{ + {"cmov", F_CMOV, P_ZERO}, + {"mmx", F_MMX, P_MMX}, + {"popcnt", F_POPCNT, P_POPCNT}, + {"sse", F_SSE, P_SSE}, + {"sse2", F_SSE2, P_SSE2}, + {"sse3", F_SSE3, P_SSE3}, + {"ssse3", F_SSSE3, P_SSSE3}, + {"sse4a", F_SSE4_A, P_SSE4_A}, + {"sse4.1", F_SSE4_1, P_SSE4_1}, + {"sse4.2", F_SSE4_2, P_SSE4_2}, + {"avx", F_AVX, P_AVX}, + {"fma4", F_FMA4, P_FMA4}, + {"xop", F_XOP, P_XOP}, + {"fma", F_FMA, P_FMA}, + {"avx2", F_AVX2, P_AVX2}, + {"avx512f", F_AVX512F, P_AVX512F}, + {"bmi", F_BMI, P_BMI}, + {"bmi2", F_BMI2, P_BMI2}, + {"aes", F_AES, P_AES}, + {"pclmul", F_PCLMUL, P_PCLMUL}, + {"avx512vl",F_AVX512VL, P_ZERO}, + {"avx512bw",F_AVX512BW, P_ZERO}, + {"avx512dq",F_AVX512DQ, P_ZERO}, + {"avx512cd",F_AVX512CD, P_ZERO}, + {"avx512er",F_AVX512ER, P_ZERO}, + {"avx512pf",F_AVX512PF, P_ZERO}, + {"avx512vbmi",F_AVX512VBMI, P_ZERO}, + {"avx512ifma",F_AVX512IFMA, P_ZERO}, + {"avx5124vnniw",F_AVX5124VNNIW, P_ZERO}, + {"avx5124fmaps",F_AVX5124FMAPS, P_ZERO}, + {"avx512vpopcntdq",F_AVX512VPOPCNTDQ, P_ZERO}, + {"avx512vbmi2", F_AVX512VBMI2, P_ZERO}, + {"gfni", F_GFNI, P_ZERO}, + {"vpclmulqdq", F_VPCLMULQDQ, P_ZERO}, + {"avx512vnni", F_AVX512VNNI, P_ZERO}, + {"avx512bitalg", F_AVX512BITALG, P_ZERO} +}; + +/* This parses the attribute arguments to target in DECL and determines + the right builtin to use to match the platform specification. + It returns the priority value for this version decl. If PREDICATE_LIST + is not NULL, it stores the list of cpu features that need to be checked + before dispatching this function. */ + +unsigned int +get_builtin_code_for_version (tree decl, tree *predicate_list) +{ + tree attrs; + struct cl_target_option cur_target; + tree target_node; + struct cl_target_option *new_target; + const char *arg_str = NULL; + const char *attrs_str = NULL; + char *tok_str = NULL; + char *token; + + enum feature_priority priority = P_ZERO; + + static unsigned int NUM_FEATURES + = sizeof (isa_names_table) / sizeof (_isa_names_table); + + unsigned int i; + + tree predicate_chain = NULL_TREE; + tree predicate_decl, predicate_arg; + + attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl)); + gcc_assert (attrs != NULL); + + attrs = TREE_VALUE (TREE_VALUE (attrs)); + + gcc_assert (TREE_CODE (attrs) == STRING_CST); + attrs_str = TREE_STRING_POINTER (attrs); + + /* Return priority zero for default function. */ + if (strcmp (attrs_str, "default") == 0) + return 0; + + /* Handle arch= if specified. For priority, set it to be 1 more than + the best instruction set the processor can handle. For instance, if + there is a version for atom and a version for ssse3 (the highest ISA + priority for atom), the atom version must be checked for dispatch + before the ssse3 version. 
*/ + if (strstr (attrs_str, "arch=") != NULL) + { + cl_target_option_save (&cur_target, &global_options); + target_node + = ix86_valid_target_attribute_tree (decl, attrs, &global_options, + &global_options_set, 0); + + gcc_assert (target_node); + if (target_node == error_mark_node) + return 0; + new_target = TREE_TARGET_OPTION (target_node); + gcc_assert (new_target); + + if (new_target->arch_specified && new_target->arch > 0) + { + switch (new_target->arch) + { + case PROCESSOR_CORE2: + arg_str = "core2"; + priority = P_PROC_SSSE3; + break; + case PROCESSOR_NEHALEM: + if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_PCLMUL) + { + arg_str = "westmere"; + priority = P_PCLMUL; + } + else + { + /* We translate "arch=corei7" and "arch=nehalem" to + "corei7" so that it will be mapped to M_INTEL_COREI7 + as cpu type to cover all M_INTEL_COREI7_XXXs. */ + arg_str = "corei7"; + priority = P_PROC_SSE4_2; + } + break; + case PROCESSOR_SANDYBRIDGE: + if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C) + arg_str = "ivybridge"; + else + arg_str = "sandybridge"; + priority = P_PROC_AVX; + break; + case PROCESSOR_HASWELL: + if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX) + arg_str = "broadwell"; + else + arg_str = "haswell"; + priority = P_PROC_AVX2; + break; + case PROCESSOR_SKYLAKE: + arg_str = "skylake"; + priority = P_PROC_AVX2; + break; + case PROCESSOR_SKYLAKE_AVX512: + arg_str = "skylake-avx512"; + priority = P_PROC_AVX512F; + break; + case PROCESSOR_CANNONLAKE: + arg_str = "cannonlake"; + priority = P_PROC_AVX512F; + break; + case PROCESSOR_ICELAKE_CLIENT: + arg_str = "icelake-client"; + priority = P_PROC_AVX512F; + break; + case PROCESSOR_ICELAKE_SERVER: + arg_str = "icelake-server"; + priority = P_PROC_AVX512F; + break; + case PROCESSOR_CASCADELAKE: + arg_str = "cascadelake"; + priority = P_PROC_AVX512F; + break; + case PROCESSOR_BONNELL: + arg_str = "bonnell"; + priority = P_PROC_SSSE3; + break; + case PROCESSOR_KNL: + arg_str = "knl"; + priority = P_PROC_AVX512F; + break; + case PROCESSOR_KNM: + arg_str = "knm"; + priority = P_PROC_AVX512F; + break; + case PROCESSOR_SILVERMONT: + arg_str = "silvermont"; + priority = P_PROC_SSE4_2; + break; + case PROCESSOR_GOLDMONT: + arg_str = "goldmont"; + priority = P_PROC_SSE4_2; + break; + case PROCESSOR_GOLDMONT_PLUS: + arg_str = "goldmont-plus"; + priority = P_PROC_SSE4_2; + break; + case PROCESSOR_TREMONT: + arg_str = "tremont"; + priority = P_PROC_SSE4_2; + break; + case PROCESSOR_AMDFAM10: + arg_str = "amdfam10h"; + priority = P_PROC_SSE4_A; + break; + case PROCESSOR_BTVER1: + arg_str = "btver1"; + priority = P_PROC_SSE4_A; + break; + case PROCESSOR_BTVER2: + arg_str = "btver2"; + priority = P_PROC_BMI; + break; + case PROCESSOR_BDVER1: + arg_str = "bdver1"; + priority = P_PROC_XOP; + break; + case PROCESSOR_BDVER2: + arg_str = "bdver2"; + priority = P_PROC_FMA; + break; + case PROCESSOR_BDVER3: + arg_str = "bdver3"; + priority = P_PROC_FMA; + break; + case PROCESSOR_BDVER4: + arg_str = "bdver4"; + priority = P_PROC_AVX2; + break; + case PROCESSOR_ZNVER1: + arg_str = "znver1"; + priority = P_PROC_AVX2; + break; + case PROCESSOR_ZNVER2: + arg_str = "znver2"; + priority = P_PROC_AVX2; + break; + } + } + + cl_target_option_restore (&global_options, &cur_target); + + if (predicate_list && arg_str == NULL) + { + error_at (DECL_SOURCE_LOCATION (decl), + "no dispatcher found for the versioning attributes"); + return 0; + } + + if (predicate_list) + { + predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS]; + /* For a C string literal 
the length includes the trailing NULL. */ + predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str); + predicate_chain = tree_cons (predicate_decl, predicate_arg, + predicate_chain); + } + } + + /* Process feature name. */ + tok_str = (char *) xmalloc (strlen (attrs_str) + 1); + strcpy (tok_str, attrs_str); + token = strtok (tok_str, ","); + predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS]; + + while (token != NULL) + { + /* Do not process "arch=" */ + if (strncmp (token, "arch=", 5) == 0) + { + token = strtok (NULL, ","); + continue; + } + for (i = 0; i < NUM_FEATURES; ++i) + { + if (strcmp (token, isa_names_table[i].name) == 0) + { + if (predicate_list) + { + predicate_arg = build_string_literal ( + strlen (isa_names_table[i].name) + 1, + isa_names_table[i].name); + predicate_chain = tree_cons (predicate_decl, predicate_arg, + predicate_chain); + } + /* Find the maximum priority feature. */ + if (isa_names_table[i].priority > priority) + priority = isa_names_table[i].priority; + + break; + } + } + if (predicate_list && priority == P_ZERO) + { + error_at (DECL_SOURCE_LOCATION (decl), + "ISA %qs is not supported in % attribute, " + "use % syntax", token); + return 0; + } + token = strtok (NULL, ","); + } + free (tok_str); + + if (predicate_list && predicate_chain == NULL_TREE) + { + error_at (DECL_SOURCE_LOCATION (decl), + "no dispatcher found for the versioning attributes: %s", + attrs_str); + return 0; + } + else if (predicate_list) + { + predicate_chain = nreverse (predicate_chain); + *predicate_list = predicate_chain; + } + + return priority; +} + +/* This builds the processor_model struct type defined in + libgcc/config/i386/cpuinfo.c */ + +static tree +build_processor_model_struct (void) +{ + const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype", + "__cpu_features"}; + tree field = NULL_TREE, field_chain = NULL_TREE; + int i; + tree type = make_node (RECORD_TYPE); + + /* The first 3 fields are unsigned int. */ + for (i = 0; i < 3; ++i) + { + field = build_decl (UNKNOWN_LOCATION, FIELD_DECL, + get_identifier (field_name[i]), unsigned_type_node); + if (field_chain != NULL_TREE) + DECL_CHAIN (field) = field_chain; + field_chain = field; + } + + /* The last field is an array of unsigned integers of size one. */ + field = build_decl (UNKNOWN_LOCATION, FIELD_DECL, + get_identifier (field_name[3]), + build_array_type (unsigned_type_node, + build_index_type (size_one_node))); + if (field_chain != NULL_TREE) + DECL_CHAIN (field) = field_chain; + field_chain = field; + + finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE); + return type; +} + +/* Returns a extern, comdat VAR_DECL of type TYPE and name NAME. 
*/ + +static tree +make_var_decl (tree type, const char *name) +{ + tree new_decl; + + new_decl = build_decl (UNKNOWN_LOCATION, + VAR_DECL, + get_identifier(name), + type); + + DECL_EXTERNAL (new_decl) = 1; + TREE_STATIC (new_decl) = 1; + TREE_PUBLIC (new_decl) = 1; + DECL_INITIAL (new_decl) = 0; + DECL_ARTIFICIAL (new_decl) = 0; + DECL_PRESERVE_P (new_decl) = 1; + + make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl)); + assemble_variable (new_decl, 0, 0, 0); + + return new_decl; +} + +/* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded + into an integer defined in libgcc/config/i386/cpuinfo.c */ + +tree +fold_builtin_cpu (tree fndecl, tree *args) +{ + unsigned int i; + enum ix86_builtins fn_code = (enum ix86_builtins) + DECL_FUNCTION_CODE (fndecl); + tree param_string_cst = NULL; + + tree __processor_model_type = build_processor_model_struct (); + tree __cpu_model_var = make_var_decl (__processor_model_type, + "__cpu_model"); + + + varpool_node::add (__cpu_model_var); + + gcc_assert ((args != NULL) && (*args != NULL)); + + param_string_cst = *args; + while (param_string_cst + && TREE_CODE (param_string_cst) != STRING_CST) + { + /* *args must be a expr that can contain other EXPRS leading to a + STRING_CST. */ + if (!EXPR_P (param_string_cst)) + { + error ("parameter to builtin must be a string constant or literal"); + return integer_zero_node; + } + param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0); + } + + gcc_assert (param_string_cst); + + if (fn_code == IX86_BUILTIN_CPU_IS) + { + tree ref; + tree field; + tree final; + + unsigned int field_val = 0; + unsigned int NUM_ARCH_NAMES + = sizeof (arch_names_table) / sizeof (struct _arch_names_table); + + for (i = 0; i < NUM_ARCH_NAMES; i++) + if (strcmp (arch_names_table[i].name, + TREE_STRING_POINTER (param_string_cst)) == 0) + break; + + if (i == NUM_ARCH_NAMES) + { + error ("parameter to builtin not valid: %s", + TREE_STRING_POINTER (param_string_cst)); + return integer_zero_node; + } + + field = TYPE_FIELDS (__processor_model_type); + field_val = arch_names_table[i].model; + + /* CPU types are stored in the next field. */ + if (field_val > M_CPU_TYPE_START + && field_val < M_CPU_SUBTYPE_START) + { + field = DECL_CHAIN (field); + field_val -= M_CPU_TYPE_START; + } + + /* CPU subtypes are stored in the next field. */ + if (field_val > M_CPU_SUBTYPE_START) + { + field = DECL_CHAIN ( DECL_CHAIN (field)); + field_val -= M_CPU_SUBTYPE_START; + } + + /* Get the appropriate field in __cpu_model. */ + ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var, + field, NULL_TREE); + + /* Check the value. 
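Concretely, the fold boils down to a single field comparison; for illustration only, using the model constants defined earlier in this file and an invented caller:

// Roughly what the fold produces for a few arguments:
//   __builtin_cpu_is ("intel")   ->  __cpu_model.__cpu_vendor  == M_INTEL
//   __builtin_cpu_is ("corei7")  ->  __cpu_model.__cpu_type    == M_INTEL_COREI7 - M_CPU_TYPE_START
//   __builtin_cpu_is ("haswell") ->  __cpu_model.__cpu_subtype == M_INTEL_COREI7_HASWELL - M_CPU_SUBTYPE_START
int
pick_kernel (void)
{
  if (__builtin_cpu_is ("haswell"))
    return 1;   // model-specific path
  return 0;     // generic path
}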
*/ + final = build2 (EQ_EXPR, unsigned_type_node, ref, + build_int_cstu (unsigned_type_node, field_val)); + return build1 (CONVERT_EXPR, integer_type_node, final); + } + else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS) + { + tree ref; + tree array_elt; + tree field; + tree final; + + unsigned int field_val = 0; + unsigned int NUM_ISA_NAMES + = sizeof (isa_names_table) / sizeof (struct _isa_names_table); + + for (i = 0; i < NUM_ISA_NAMES; i++) + if (strcmp (isa_names_table[i].name, + TREE_STRING_POINTER (param_string_cst)) == 0) + break; + + if (i == NUM_ISA_NAMES) + { + error ("parameter to builtin not valid: %s", + TREE_STRING_POINTER (param_string_cst)); + return integer_zero_node; + } + + if (isa_names_table[i].feature >= 32) + { + tree __cpu_features2_var = make_var_decl (unsigned_type_node, + "__cpu_features2"); + + varpool_node::add (__cpu_features2_var); + field_val = (1U << (isa_names_table[i].feature - 32)); + /* Return __cpu_features2 & field_val */ + final = build2 (BIT_AND_EXPR, unsigned_type_node, + __cpu_features2_var, + build_int_cstu (unsigned_type_node, field_val)); + return build1 (CONVERT_EXPR, integer_type_node, final); + } + + field = TYPE_FIELDS (__processor_model_type); + /* Get the last field, which is __cpu_features. */ + while (DECL_CHAIN (field)) + field = DECL_CHAIN (field); + + /* Get the appropriate field: __cpu_model.__cpu_features */ + ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var, + field, NULL_TREE); + + /* Access the 0th element of __cpu_features array. */ + array_elt = build4 (ARRAY_REF, unsigned_type_node, ref, + integer_zero_node, NULL_TREE, NULL_TREE); + + field_val = (1U << isa_names_table[i].feature); + /* Return __cpu_model.__cpu_features[0] & field_val */ + final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt, + build_int_cstu (unsigned_type_node, field_val)); + return build1 (CONVERT_EXPR, integer_type_node, final); + } + gcc_unreachable (); +} + +#include "gt-i386-builtins.h" diff --git a/gcc/config/i386/i386-builtins.h b/gcc/config/i386/i386-builtins.h new file mode 100644 index 00000000000..c0264e5bf1d --- /dev/null +++ b/gcc/config/i386/i386-builtins.h @@ -0,0 +1,330 @@ +/* Copyright (C) 1988-2019 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#ifndef GCC_I386_BUILTINS_H +#define GCC_I386_BUILTINS_H + +/* The following file contains several enumerations and data structures + built from the definitions in i386-builtin-types.def. */ + +#include "i386-builtin-types.inc" + +/* Codes for all the SSE/MMX builtins. Builtins not mentioned in any + bdesc_* arrays below should come first, then builtins for each bdesc_* + array in ascending order, so that we can use direct array accesses. 
*/ +enum ix86_builtins +{ + IX86_BUILTIN_MASKMOVQ, + IX86_BUILTIN_LDMXCSR, + IX86_BUILTIN_STMXCSR, + IX86_BUILTIN_MASKMOVDQU, + IX86_BUILTIN_PSLLDQ128, + IX86_BUILTIN_CLFLUSH, + IX86_BUILTIN_MONITOR, + IX86_BUILTIN_MWAIT, + IX86_BUILTIN_UMONITOR, + IX86_BUILTIN_UMWAIT, + IX86_BUILTIN_TPAUSE, + IX86_BUILTIN_CLZERO, + IX86_BUILTIN_CLDEMOTE, + IX86_BUILTIN_VEC_INIT_V2SI, + IX86_BUILTIN_VEC_INIT_V4HI, + IX86_BUILTIN_VEC_INIT_V8QI, + IX86_BUILTIN_VEC_EXT_V2DF, + IX86_BUILTIN_VEC_EXT_V2DI, + IX86_BUILTIN_VEC_EXT_V4SF, + IX86_BUILTIN_VEC_EXT_V4SI, + IX86_BUILTIN_VEC_EXT_V8HI, + IX86_BUILTIN_VEC_EXT_V2SI, + IX86_BUILTIN_VEC_EXT_V4HI, + IX86_BUILTIN_VEC_EXT_V16QI, + IX86_BUILTIN_VEC_SET_V2DI, + IX86_BUILTIN_VEC_SET_V4SF, + IX86_BUILTIN_VEC_SET_V4SI, + IX86_BUILTIN_VEC_SET_V8HI, + IX86_BUILTIN_VEC_SET_V4HI, + IX86_BUILTIN_VEC_SET_V16QI, + IX86_BUILTIN_GATHERSIV2DF, + IX86_BUILTIN_GATHERSIV4DF, + IX86_BUILTIN_GATHERDIV2DF, + IX86_BUILTIN_GATHERDIV4DF, + IX86_BUILTIN_GATHERSIV4SF, + IX86_BUILTIN_GATHERSIV8SF, + IX86_BUILTIN_GATHERDIV4SF, + IX86_BUILTIN_GATHERDIV8SF, + IX86_BUILTIN_GATHERSIV2DI, + IX86_BUILTIN_GATHERSIV4DI, + IX86_BUILTIN_GATHERDIV2DI, + IX86_BUILTIN_GATHERDIV4DI, + IX86_BUILTIN_GATHERSIV4SI, + IX86_BUILTIN_GATHERSIV8SI, + IX86_BUILTIN_GATHERDIV4SI, + IX86_BUILTIN_GATHERDIV8SI, + IX86_BUILTIN_GATHER3SIV8SF, + IX86_BUILTIN_GATHER3SIV4SF, + IX86_BUILTIN_GATHER3SIV4DF, + IX86_BUILTIN_GATHER3SIV2DF, + IX86_BUILTIN_GATHER3DIV8SF, + IX86_BUILTIN_GATHER3DIV4SF, + IX86_BUILTIN_GATHER3DIV4DF, + IX86_BUILTIN_GATHER3DIV2DF, + IX86_BUILTIN_GATHER3SIV8SI, + IX86_BUILTIN_GATHER3SIV4SI, + IX86_BUILTIN_GATHER3SIV4DI, + IX86_BUILTIN_GATHER3SIV2DI, + IX86_BUILTIN_GATHER3DIV8SI, + IX86_BUILTIN_GATHER3DIV4SI, + IX86_BUILTIN_GATHER3DIV4DI, + IX86_BUILTIN_GATHER3DIV2DI, + IX86_BUILTIN_SCATTERSIV8SF, + IX86_BUILTIN_SCATTERSIV4SF, + IX86_BUILTIN_SCATTERSIV4DF, + IX86_BUILTIN_SCATTERSIV2DF, + IX86_BUILTIN_SCATTERDIV8SF, + IX86_BUILTIN_SCATTERDIV4SF, + IX86_BUILTIN_SCATTERDIV4DF, + IX86_BUILTIN_SCATTERDIV2DF, + IX86_BUILTIN_SCATTERSIV8SI, + IX86_BUILTIN_SCATTERSIV4SI, + IX86_BUILTIN_SCATTERSIV4DI, + IX86_BUILTIN_SCATTERSIV2DI, + IX86_BUILTIN_SCATTERDIV8SI, + IX86_BUILTIN_SCATTERDIV4SI, + IX86_BUILTIN_SCATTERDIV4DI, + IX86_BUILTIN_SCATTERDIV2DI, + /* Alternate 4 and 8 element gather/scatter for the vectorizer + where all operands are 32-byte or 64-byte wide respectively. 
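They are what ix86_vectorize_builtin_gather, earlier in this patch, returns when the data and index element widths differ; an illustrative loop (flags and the function name are examples only):

// With something like -O3 -mavx2, and gathers enabled for the active tuning,
// four doubles gathered through 32-bit indices can map to a
// GATHERALTSIV4DF-style builtin: 32-byte data vector, narrower index vector.
void
gather_vals (double *dst, const double *table, const int *idx, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = table[idx[i]];
}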
*/ + IX86_BUILTIN_GATHERALTSIV4DF, + IX86_BUILTIN_GATHERALTDIV8SF, + IX86_BUILTIN_GATHERALTSIV4DI, + IX86_BUILTIN_GATHERALTDIV8SI, + IX86_BUILTIN_GATHER3ALTDIV16SF, + IX86_BUILTIN_GATHER3ALTDIV16SI, + IX86_BUILTIN_GATHER3ALTSIV4DF, + IX86_BUILTIN_GATHER3ALTDIV8SF, + IX86_BUILTIN_GATHER3ALTSIV4DI, + IX86_BUILTIN_GATHER3ALTDIV8SI, + IX86_BUILTIN_GATHER3ALTSIV8DF, + IX86_BUILTIN_GATHER3ALTSIV8DI, + IX86_BUILTIN_GATHER3DIV16SF, + IX86_BUILTIN_GATHER3DIV16SI, + IX86_BUILTIN_GATHER3DIV8DF, + IX86_BUILTIN_GATHER3DIV8DI, + IX86_BUILTIN_GATHER3SIV16SF, + IX86_BUILTIN_GATHER3SIV16SI, + IX86_BUILTIN_GATHER3SIV8DF, + IX86_BUILTIN_GATHER3SIV8DI, + IX86_BUILTIN_SCATTERALTSIV8DF, + IX86_BUILTIN_SCATTERALTDIV16SF, + IX86_BUILTIN_SCATTERALTSIV8DI, + IX86_BUILTIN_SCATTERALTDIV16SI, + IX86_BUILTIN_SCATTERALTSIV4DF, + IX86_BUILTIN_SCATTERALTDIV8SF, + IX86_BUILTIN_SCATTERALTSIV4DI, + IX86_BUILTIN_SCATTERALTDIV8SI, + IX86_BUILTIN_SCATTERALTSIV2DF, + IX86_BUILTIN_SCATTERALTDIV4SF, + IX86_BUILTIN_SCATTERALTSIV2DI, + IX86_BUILTIN_SCATTERALTDIV4SI, + IX86_BUILTIN_SCATTERDIV16SF, + IX86_BUILTIN_SCATTERDIV16SI, + IX86_BUILTIN_SCATTERDIV8DF, + IX86_BUILTIN_SCATTERDIV8DI, + IX86_BUILTIN_SCATTERSIV16SF, + IX86_BUILTIN_SCATTERSIV16SI, + IX86_BUILTIN_SCATTERSIV8DF, + IX86_BUILTIN_SCATTERSIV8DI, + IX86_BUILTIN_GATHERPFQPD, + IX86_BUILTIN_GATHERPFDPS, + IX86_BUILTIN_GATHERPFDPD, + IX86_BUILTIN_GATHERPFQPS, + IX86_BUILTIN_SCATTERPFDPD, + IX86_BUILTIN_SCATTERPFDPS, + IX86_BUILTIN_SCATTERPFQPD, + IX86_BUILTIN_SCATTERPFQPS, + IX86_BUILTIN_CLWB, + IX86_BUILTIN_CLFLUSHOPT, + IX86_BUILTIN_INFQ, + IX86_BUILTIN_HUGE_VALQ, + IX86_BUILTIN_NANQ, + IX86_BUILTIN_NANSQ, + IX86_BUILTIN_XABORT, + IX86_BUILTIN_ADDCARRYX32, + IX86_BUILTIN_ADDCARRYX64, + IX86_BUILTIN_SBB32, + IX86_BUILTIN_SBB64, + IX86_BUILTIN_RDRAND16_STEP, + IX86_BUILTIN_RDRAND32_STEP, + IX86_BUILTIN_RDRAND64_STEP, + IX86_BUILTIN_RDSEED16_STEP, + IX86_BUILTIN_RDSEED32_STEP, + IX86_BUILTIN_RDSEED64_STEP, + IX86_BUILTIN_MONITORX, + IX86_BUILTIN_MWAITX, + IX86_BUILTIN_CFSTRING, + IX86_BUILTIN_CPU_INIT, + IX86_BUILTIN_CPU_IS, + IX86_BUILTIN_CPU_SUPPORTS, + IX86_BUILTIN_READ_FLAGS, + IX86_BUILTIN_WRITE_FLAGS, + + /* All the remaining builtins are tracked in bdesc_* arrays in + i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after + this point. */ +#define BDESC(mask, mask2, icode, name, code, comparison, flag) \ + code, +#define BDESC_FIRST(kind, kindu, mask, mask2, icode, name, code, comparison, flag) \ + code, \ + IX86_BUILTIN__BDESC_##kindu##_FIRST = code, +#define BDESC_END(kind, next_kind) + +#include "i386-builtin.def" + +#undef BDESC +#undef BDESC_FIRST +#undef BDESC_END + + IX86_BUILTIN_MAX, + + IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX, + + /* Now just the aliases for bdesc_* start/end. */ +#define BDESC(mask, mask2, icode, name, code, comparison, flag) +#define BDESC_FIRST(kind, kindu, mask, mask2, icode, name, code, comparison, flag) +#define BDESC_END(kind, next_kind) \ + IX86_BUILTIN__BDESC_##kind##_LAST \ + = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1, + +#include "i386-builtin.def" + +#undef BDESC +#undef BDESC_FIRST +#undef BDESC_END + + /* Just to make sure there is no comma after the last enumerator. */ + IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST +}; + +/* Table of all of the builtin functions that are possible with different ISA's + but are waiting to be built until a function is declared to use that + ISA. 
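That deferral is what allows a translation unit built without, say, -mavx2 to still use AVX2 intrinsics inside a function that opts in through the target attribute; a minimal sketch, not introduced by this change:

#include <immintrin.h>

// The file itself need not be compiled with -mavx2: declaring the ISA on the
// function makes the deferred AVX2 builtins, and the intrinsics wrapping
// them, available in its body.
__attribute__ ((target ("avx2")))
__m256i
add_epi32_avx2 (__m256i a, __m256i b)
{
  return _mm256_add_epi32 (a, b);
}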
*/ +struct builtin_isa { + HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */ + HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */ + const char *name; /* function name */ + enum ix86_builtin_func_type tcode; /* type to use in the declaration */ + unsigned char const_p:1; /* true if the declaration is constant */ + unsigned char pure_p:1; /* true if the declaration has pure attribute */ + bool set_and_not_built_p; +}; + +/* Bits for builtin_description.flag. */ + +/* Set when we don't support the comparison natively, and should + swap_comparison in order to support it. */ +#define BUILTIN_DESC_SWAP_OPERANDS 1 + +struct builtin_description +{ + const HOST_WIDE_INT mask; + const HOST_WIDE_INT mask2; + const enum insn_code icode; + const char *const name; + const enum ix86_builtins code; + const enum rtx_code comparison; + const int flag; +}; + +#define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT +#define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT +#define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT +#define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT +#define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF +#define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF +#define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF +#define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF +#define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI +#define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI +#define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI +#define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI +#define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI +#define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI +#define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI +#define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI +#define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI +#define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI +#define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF +#define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF +#define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI +#define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI +#define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI +#define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI +#define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI +#define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI +#define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI +#define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI +#define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP +#define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP +#define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP +#define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP +#define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF +#define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF +#define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF +#define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF +#define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF +#define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF +#define MULTI_ARG_1_SF V4SF_FTYPE_V4SF +#define MULTI_ARG_1_DF V2DF_FTYPE_V2DF +#define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF +#define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF +#define MULTI_ARG_1_DI V2DI_FTYPE_V2DI +#define MULTI_ARG_1_SI V4SI_FTYPE_V4SI +#define MULTI_ARG_1_HI V8HI_FTYPE_V8HI +#define MULTI_ARG_1_QI V16QI_FTYPE_V16QI +#define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI +#define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI +#define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI +#define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI +#define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI +#define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI + +#define BDESC(mask, mask2, icode, name, code, comparison, flag) \ + { mask, mask2, icode, name, code, comparison, flag }, 
+#define BDESC_FIRST(kind, kindu, mask, mask2, icode, name, code, comparison, flag) \ +static const struct builtin_description bdesc_##kind[] = \ +{ \ + BDESC (mask, mask2, icode, name, code, comparison, flag) +#define BDESC_END(kind, next_kind) \ +}; + +#include "i386-builtin.def" + +extern builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX]; + +tree ix86_builtin_vectorized_function (unsigned int fn, tree type_out, + tree type_in); +void ix86_init_builtins (void); +tree ix86_vectorize_builtin_gather (const_tree mem_vectype, + const_tree index_type, int scale); +tree ix86_builtin_decl (unsigned code, bool); +tree ix86_builtin_reciprocal (tree fndecl); +unsigned int get_builtin_code_for_version (tree decl, tree *predicate_list); +tree fold_builtin_cpu (tree fndecl, tree *args); +tree get_ix86_builtin (enum ix86_builtins c); + +#endif /* GCC_I386_BUILTINS_H */ diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c new file mode 100644 index 00000000000..0835ebf74b7 --- /dev/null +++ b/gcc/config/i386/i386-expand.c @@ -0,0 +1,19840 @@ +/* Copyright (C) 1988-2019 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#define IN_TARGET_CODE 1 + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "rtl.h" +#include "tree.h" +#include "memmodel.h" +#include "gimple.h" +#include "cfghooks.h" +#include "cfgloop.h" +#include "df.h" +#include "tm_p.h" +#include "stringpool.h" +#include "expmed.h" +#include "optabs.h" +#include "regs.h" +#include "emit-rtl.h" +#include "recog.h" +#include "cgraph.h" +#include "diagnostic.h" +#include "cfgbuild.h" +#include "alias.h" +#include "fold-const.h" +#include "attribs.h" +#include "calls.h" +#include "stor-layout.h" +#include "varasm.h" +#include "output.h" +#include "insn-attr.h" +#include "flags.h" +#include "except.h" +#include "explow.h" +#include "expr.h" +#include "cfgrtl.h" +#include "common/common-target.h" +#include "langhooks.h" +#include "reload.h" +#include "gimplify.h" +#include "dwarf2.h" +#include "tm-constrs.h" +#include "params.h" +#include "cselib.h" +#include "sched-int.h" +#include "opts.h" +#include "tree-pass.h" +#include "context.h" +#include "pass_manager.h" +#include "target-globals.h" +#include "gimple-iterator.h" +#include "tree-vectorizer.h" +#include "shrink-wrap.h" +#include "builtins.h" +#include "rtl-iter.h" +#include "tree-iterator.h" +#include "dbgcnt.h" +#include "case-cfn-macros.h" +#include "dojump.h" +#include "fold-const-call.h" +#include "tree-vrp.h" +#include "tree-ssanames.h" +#include "selftest.h" +#include "selftest-rtl.h" +#include "print-rtl.h" +#include "intl.h" +#include "ifcvt.h" +#include "symbol-summary.h" +#include "ipa-prop.h" +#include "ipa-fnsummary.h" +#include "wide-int-bitmask.h" +#include "tree-vector-builder.h" +#include "debug.h" +#include "dwarf2out.h" +#include "i386-options.h" +#include "i386-builtins.h" +#include "i386-expand.h" + +/* Split one or more 
double-mode RTL references into pairs of half-mode + references. The RTL can be REG, offsettable MEM, integer constant, or + CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to + split and "num" is its length. lo_half and hi_half are output arrays + that parallel "operands". */ + +void +split_double_mode (machine_mode mode, rtx operands[], + int num, rtx lo_half[], rtx hi_half[]) +{ + machine_mode half_mode; + unsigned int byte; + + switch (mode) + { + case E_TImode: + half_mode = DImode; + break; + case E_DImode: + half_mode = SImode; + break; + default: + gcc_unreachable (); + } + + byte = GET_MODE_SIZE (half_mode); + + while (num--) + { + rtx op = operands[num]; + + /* simplify_subreg refuse to split volatile memory addresses, + but we still have to handle it. */ + if (MEM_P (op)) + { + lo_half[num] = adjust_address (op, half_mode, 0); + hi_half[num] = adjust_address (op, half_mode, byte); + } + else + { + lo_half[num] = simplify_gen_subreg (half_mode, op, + GET_MODE (op) == VOIDmode + ? mode : GET_MODE (op), 0); + hi_half[num] = simplify_gen_subreg (half_mode, op, + GET_MODE (op) == VOIDmode + ? mode : GET_MODE (op), byte); + } + } +} + +/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate + for the target. */ + +void +ix86_expand_clear (rtx dest) +{ + rtx tmp; + + /* We play register width games, which are only valid after reload. */ + gcc_assert (reload_completed); + + /* Avoid HImode and its attendant prefix byte. */ + if (GET_MODE_SIZE (GET_MODE (dest)) < 4) + dest = gen_rtx_REG (SImode, REGNO (dest)); + tmp = gen_rtx_SET (dest, const0_rtx); + + if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ()) + { + rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); + tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob)); + } + + emit_insn (tmp); +} + +void +ix86_expand_move (machine_mode mode, rtx operands[]) +{ + rtx op0, op1; + rtx tmp, addend = NULL_RTX; + enum tls_model model; + + op0 = operands[0]; + op1 = operands[1]; + + switch (GET_CODE (op1)) + { + case CONST: + tmp = XEXP (op1, 0); + + if (GET_CODE (tmp) != PLUS + || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF) + break; + + op1 = XEXP (tmp, 0); + addend = XEXP (tmp, 1); + /* FALLTHRU */ + + case SYMBOL_REF: + model = SYMBOL_REF_TLS_MODEL (op1); + + if (model) + op1 = legitimize_tls_address (op1, model, true); + else if (ix86_force_load_from_GOT_p (op1)) + { + /* Load the external function address via GOT slot to avoid PLT. */ + op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1), + (TARGET_64BIT + ? UNSPEC_GOTPCREL + : UNSPEC_GOT)); + op1 = gen_rtx_CONST (Pmode, op1); + op1 = gen_const_mem (Pmode, op1); + set_mem_alias_set (op1, ix86_GOT_alias_set ()); + } + else + { + tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX); + if (tmp) + { + op1 = tmp; + if (!addend) + break; + } + else + { + op1 = operands[1]; + break; + } + } + + if (addend) + { + op1 = force_operand (op1, NULL_RTX); + op1 = expand_simple_binop (Pmode, PLUS, op1, addend, + op0, 1, OPTAB_DIRECT); + } + else + op1 = force_operand (op1, op0); + + if (op1 == op0) + return; + + op1 = convert_to_mode (mode, op1, 1); + + default: + break; + } + + if ((flag_pic || MACHOPIC_INDIRECT) + && symbolic_operand (op1, mode)) + { + if (TARGET_MACHO && !TARGET_64BIT) + { +#if TARGET_MACHO + /* dynamic-no-pic */ + if (MACHOPIC_INDIRECT) + { + rtx temp = (op0 && REG_P (op0) && mode == Pmode) + ? 
op0 : gen_reg_rtx (Pmode); + op1 = machopic_indirect_data_reference (op1, temp); + if (MACHOPIC_PURE) + op1 = machopic_legitimize_pic_address (op1, mode, + temp == op1 ? 0 : temp); + } + if (op0 != op1 && GET_CODE (op0) != MEM) + { + rtx insn = gen_rtx_SET (op0, op1); + emit_insn (insn); + return; + } + if (GET_CODE (op0) == MEM) + op1 = force_reg (Pmode, op1); + else + { + rtx temp = op0; + if (GET_CODE (temp) != REG) + temp = gen_reg_rtx (Pmode); + temp = legitimize_pic_address (op1, temp); + if (temp == op0) + return; + op1 = temp; + } + /* dynamic-no-pic */ +#endif + } + else + { + if (MEM_P (op0)) + op1 = force_reg (mode, op1); + else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode))) + { + rtx reg = can_create_pseudo_p () ? NULL_RTX : op0; + op1 = legitimize_pic_address (op1, reg); + if (op0 == op1) + return; + op1 = convert_to_mode (mode, op1, 1); + } + } + } + else + { + if (MEM_P (op0) + && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode) + || !push_operand (op0, mode)) + && MEM_P (op1)) + op1 = force_reg (mode, op1); + + if (push_operand (op0, mode) + && ! general_no_elim_operand (op1, mode)) + op1 = copy_to_mode_reg (mode, op1); + + /* Force large constants in 64bit compilation into register + to get them CSEed. */ + if (can_create_pseudo_p () + && (mode == DImode) && TARGET_64BIT + && immediate_operand (op1, mode) + && !x86_64_zext_immediate_operand (op1, VOIDmode) + && !register_operand (op0, mode) + && optimize) + op1 = copy_to_mode_reg (mode, op1); + + if (can_create_pseudo_p () + && CONST_DOUBLE_P (op1)) + { + /* If we are loading a floating point constant to a register, + force the value to memory now, since we'll get better code + out the back end. */ + + op1 = validize_mem (force_const_mem (mode, op1)); + if (!register_operand (op0, mode)) + { + rtx temp = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (temp, op1)); + emit_move_insn (op0, temp); + return; + } + } + } + + emit_insn (gen_rtx_SET (op0, op1)); +} + +void +ix86_expand_vector_move (machine_mode mode, rtx operands[]) +{ + rtx op0 = operands[0], op1 = operands[1]; + /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU + psABI since the biggest alignment is 4 byte for IA MCU psABI. */ + unsigned int align = (TARGET_IAMCU + ? GET_MODE_BITSIZE (mode) + : GET_MODE_ALIGNMENT (mode)); + + if (push_operand (op0, VOIDmode)) + op0 = emit_move_resolve_push (mode, op0); + + /* Force constants other than zero into memory. We do not know how + the instructions used to build constants modify the upper 64 bits + of the register, once we have that information we may be able + to handle some of them more efficiently. */ + if (can_create_pseudo_p () + && (CONSTANT_P (op1) + || (SUBREG_P (op1) + && CONSTANT_P (SUBREG_REG (op1)))) + && ((register_operand (op0, mode) + && !standard_sse_constant_p (op1, mode)) + /* ix86_expand_vector_move_misalign() does not like constants. */ + || (SSE_REG_MODE_P (mode) + && MEM_P (op0) + && MEM_ALIGN (op0) < align))) + { + if (SUBREG_P (op1)) + { + machine_mode imode = GET_MODE (SUBREG_REG (op1)); + rtx r = force_const_mem (imode, SUBREG_REG (op1)); + if (r) + r = validize_mem (r); + else + r = force_reg (imode, SUBREG_REG (op1)); + op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1)); + } + else + op1 = validize_mem (force_const_mem (mode, op1)); + } + + /* We need to check memory alignment for SSE mode since attribute + can make operands unaligned. 
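For example (a user-level sketch only), a packed struct drops a vector field below its natural 16-byte alignment, so its loads have to take the misaligned path:

typedef float v4sf __attribute__ ((vector_size (16)));

struct s
{
  char c;
  v4sf v;            // only byte-aligned inside the packed struct
} __attribute__ ((packed));

v4sf
load_vec (struct s *p)
{
  return p->v;       // needs an unaligned move (e.g. movups), not movaps
}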
*/ + if (can_create_pseudo_p () + && SSE_REG_MODE_P (mode) + && ((MEM_P (op0) && (MEM_ALIGN (op0) < align)) + || (MEM_P (op1) && (MEM_ALIGN (op1) < align)))) + { + rtx tmp[2]; + + /* ix86_expand_vector_move_misalign() does not like both + arguments in memory. */ + if (!register_operand (op0, mode) + && !register_operand (op1, mode)) + op1 = force_reg (mode, op1); + + tmp[0] = op0; tmp[1] = op1; + ix86_expand_vector_move_misalign (mode, tmp); + return; + } + + /* Make operand1 a register if it isn't already. */ + if (can_create_pseudo_p () + && !register_operand (op0, mode) + && !register_operand (op1, mode)) + { + emit_move_insn (op0, force_reg (GET_MODE (op0), op1)); + return; + } + + emit_insn (gen_rtx_SET (op0, op1)); +} + +/* Split 32-byte AVX unaligned load and store if needed. */ + +static void +ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1) +{ + rtx m; + rtx (*extract) (rtx, rtx, rtx); + machine_mode mode; + + if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD) + || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE)) + { + emit_insn (gen_rtx_SET (op0, op1)); + return; + } + + rtx orig_op0 = NULL_RTX; + mode = GET_MODE (op0); + switch (GET_MODE_CLASS (mode)) + { + case MODE_VECTOR_INT: + case MODE_INT: + if (mode != V32QImode) + { + if (!MEM_P (op0)) + { + orig_op0 = op0; + op0 = gen_reg_rtx (V32QImode); + } + else + op0 = gen_lowpart (V32QImode, op0); + op1 = gen_lowpart (V32QImode, op1); + mode = V32QImode; + } + break; + case MODE_VECTOR_FLOAT: + break; + default: + gcc_unreachable (); + } + + switch (mode) + { + default: + gcc_unreachable (); + case E_V32QImode: + extract = gen_avx_vextractf128v32qi; + mode = V16QImode; + break; + case E_V8SFmode: + extract = gen_avx_vextractf128v8sf; + mode = V4SFmode; + break; + case E_V4DFmode: + extract = gen_avx_vextractf128v4df; + mode = V2DFmode; + break; + } + + if (MEM_P (op1)) + { + rtx r = gen_reg_rtx (mode); + m = adjust_address (op1, mode, 0); + emit_move_insn (r, m); + m = adjust_address (op1, mode, 16); + r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m); + emit_move_insn (op0, r); + } + else if (MEM_P (op0)) + { + m = adjust_address (op0, mode, 0); + emit_insn (extract (m, op1, const0_rtx)); + m = adjust_address (op0, mode, 16); + emit_insn (extract (m, copy_rtx (op1), const1_rtx)); + } + else + gcc_unreachable (); + + if (orig_op0) + emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0)); +} + +/* Implement the movmisalign patterns for SSE. Non-SSE modes go + straight to ix86_expand_vector_move. 
*/ +/* Code generation for scalar reg-reg moves of single and double precision data: + if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true) + movaps reg, reg + else + movss reg, reg + if (x86_sse_partial_reg_dependency == true) + movapd reg, reg + else + movsd reg, reg + + Code generation for scalar loads of double precision data: + if (x86_sse_split_regs == true) + movlpd mem, reg (gas syntax) + else + movsd mem, reg + + Code generation for unaligned packed loads of single precision data + (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency): + if (x86_sse_unaligned_move_optimal) + movups mem, reg + + if (x86_sse_partial_reg_dependency == true) + { + xorps reg, reg + movlps mem, reg + movhps mem+8, reg + } + else + { + movlps mem, reg + movhps mem+8, reg + } + + Code generation for unaligned packed loads of double precision data + (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs): + if (x86_sse_unaligned_move_optimal) + movupd mem, reg + + if (x86_sse_split_regs == true) + { + movlpd mem, reg + movhpd mem+8, reg + } + else + { + movsd mem, reg + movhpd mem+8, reg + } + */ + +void +ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[]) +{ + rtx op0, op1, m; + + op0 = operands[0]; + op1 = operands[1]; + + /* Use unaligned load/store for AVX512 or when optimizing for size. */ + if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ()) + { + emit_insn (gen_rtx_SET (op0, op1)); + return; + } + + if (TARGET_AVX) + { + if (GET_MODE_SIZE (mode) == 32) + ix86_avx256_split_vector_move_misalign (op0, op1); + else + /* Always use 128-bit mov_internal pattern for AVX. */ + emit_insn (gen_rtx_SET (op0, op1)); + return; + } + + if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL + || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + { + emit_insn (gen_rtx_SET (op0, op1)); + return; + } + + /* ??? If we have typed data, then it would appear that using + movdqu is the only way to get unaligned data loaded with + integer type. */ + if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + { + emit_insn (gen_rtx_SET (op0, op1)); + return; + } + + if (MEM_P (op1)) + { + if (TARGET_SSE2 && mode == V2DFmode) + { + rtx zero; + + /* When SSE registers are split into halves, we can avoid + writing to the top half twice. */ + if (TARGET_SSE_SPLIT_REGS) + { + emit_clobber (op0); + zero = op0; + } + else + { + /* ??? Not sure about the best option for the Intel chips. + The following would seem to satisfy; the register is + entirely cleared, breaking the dependency chain. We + then store to the upper half, with a dependency depth + of one. A rumor has it that Intel recommends two movsd + followed by an unpacklpd, but this is unconfirmed. And + given that the dependency depth of the unpacklpd would + still be one, I'm not sure why this would be better. 
*/ + zero = CONST0_RTX (V2DFmode); + } + + m = adjust_address (op1, DFmode, 0); + emit_insn (gen_sse2_loadlpd (op0, zero, m)); + m = adjust_address (op1, DFmode, 8); + emit_insn (gen_sse2_loadhpd (op0, op0, m)); + } + else + { + rtx t; + + if (mode != V4SFmode) + t = gen_reg_rtx (V4SFmode); + else + t = op0; + + if (TARGET_SSE_PARTIAL_REG_DEPENDENCY) + emit_move_insn (t, CONST0_RTX (V4SFmode)); + else + emit_clobber (t); + + m = adjust_address (op1, V2SFmode, 0); + emit_insn (gen_sse_loadlps (t, t, m)); + m = adjust_address (op1, V2SFmode, 8); + emit_insn (gen_sse_loadhps (t, t, m)); + if (mode != V4SFmode) + emit_move_insn (op0, gen_lowpart (mode, t)); + } + } + else if (MEM_P (op0)) + { + if (TARGET_SSE2 && mode == V2DFmode) + { + m = adjust_address (op0, DFmode, 0); + emit_insn (gen_sse2_storelpd (m, op1)); + m = adjust_address (op0, DFmode, 8); + emit_insn (gen_sse2_storehpd (m, op1)); + } + else + { + if (mode != V4SFmode) + op1 = gen_lowpart (V4SFmode, op1); + + m = adjust_address (op0, V2SFmode, 0); + emit_insn (gen_sse_storelps (m, op1)); + m = adjust_address (op0, V2SFmode, 8); + emit_insn (gen_sse_storehps (m, copy_rtx (op1))); + } + } + else + gcc_unreachable (); +} + +/* Helper function of ix86_fixup_binary_operands to canonicalize + operand order. Returns true if the operands should be swapped. */ + +static bool +ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode, + rtx operands[]) +{ + rtx dst = operands[0]; + rtx src1 = operands[1]; + rtx src2 = operands[2]; + + /* If the operation is not commutative, we can't do anything. */ + if (GET_RTX_CLASS (code) != RTX_COMM_ARITH + && GET_RTX_CLASS (code) != RTX_COMM_COMPARE) + return false; + + /* Highest priority is that src1 should match dst. */ + if (rtx_equal_p (dst, src1)) + return false; + if (rtx_equal_p (dst, src2)) + return true; + + /* Next highest priority is that immediate constants come second. */ + if (immediate_operand (src2, mode)) + return false; + if (immediate_operand (src1, mode)) + return true; + + /* Lowest priority is that memory references should come second. */ + if (MEM_P (src2)) + return false; + if (MEM_P (src1)) + return true; + + return false; +} + + +/* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the + destination to use for the operation. If different from the true + destination in operands[0], a copy operation will be required. */ + +rtx +ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode, + rtx operands[]) +{ + rtx dst = operands[0]; + rtx src1 = operands[1]; + rtx src2 = operands[2]; + + /* Canonicalize operand order. */ + if (ix86_swap_binary_operands_p (code, mode, operands)) + { + /* It is invalid to swap operands of different modes. */ + gcc_assert (GET_MODE (src1) == GET_MODE (src2)); + + std::swap (src1, src2); + } + + /* Both source operands cannot be in memory. */ + if (MEM_P (src1) && MEM_P (src2)) + { + /* Optimization: Only read from memory once. */ + if (rtx_equal_p (src1, src2)) + { + src2 = force_reg (mode, src2); + src1 = src2; + } + else if (rtx_equal_p (dst, src1)) + src2 = force_reg (mode, src2); + else + src1 = force_reg (mode, src1); + } + + /* If the destination is memory, and we do not have matching source + operands, do things in registers. */ + if (MEM_P (dst) && !rtx_equal_p (dst, src1)) + dst = gen_reg_rtx (mode); + + /* Source 1 cannot be a constant. */ + if (CONSTANT_P (src1)) + src1 = force_reg (mode, src1); + + /* Source 1 cannot be a non-matching memory. 
*/ + if (MEM_P (src1) && !rtx_equal_p (dst, src1)) + src1 = force_reg (mode, src1); + + /* Improve address combine. */ + if (code == PLUS + && GET_MODE_CLASS (mode) == MODE_INT + && MEM_P (src2)) + src2 = force_reg (mode, src2); + + operands[1] = src1; + operands[2] = src2; + return dst; +} + +/* Similarly, but assume that the destination has already been + set up properly. */ + +void +ix86_fixup_binary_operands_no_copy (enum rtx_code code, + machine_mode mode, rtx operands[]) +{ + rtx dst = ix86_fixup_binary_operands (code, mode, operands); + gcc_assert (dst == operands[0]); +} + +/* Attempt to expand a binary operator. Make the expansion closer to the + actual machine, then just general_operand, which will allow 3 separate + memory references (one output, two input) in a single insn. */ + +void +ix86_expand_binary_operator (enum rtx_code code, machine_mode mode, + rtx operands[]) +{ + rtx src1, src2, dst, op, clob; + + dst = ix86_fixup_binary_operands (code, mode, operands); + src1 = operands[1]; + src2 = operands[2]; + + /* Emit the instruction. */ + + op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2)); + + if (reload_completed + && code == PLUS + && !rtx_equal_p (dst, src1)) + { + /* This is going to be an LEA; avoid splitting it later. */ + emit_insn (op); + } + else + { + clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); + emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); + } + + /* Fix up the destination if needed. */ + if (dst != operands[0]) + emit_move_insn (operands[0], dst); +} + +/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with + the given OPERANDS. */ + +void +ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode, + rtx operands[]) +{ + rtx op1 = NULL_RTX, op2 = NULL_RTX; + if (SUBREG_P (operands[1])) + { + op1 = operands[1]; + op2 = operands[2]; + } + else if (SUBREG_P (operands[2])) + { + op1 = operands[2]; + op2 = operands[1]; + } + /* Optimize (__m128i) d | (__m128i) e and similar code + when d and e are float vectors into float vector logical + insn. In C/C++ without using intrinsics there is no other way + to express vector logical operation on float vectors than + to cast them temporarily to integer vectors. 
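The user-level pattern this case recognizes, written with GCC's generic vector extensions; the typedefs and the helper name are invented for this sketch.

#include <assert.h>

typedef float v4sf __attribute__ ((vector_size (16)));
typedef int   v4si __attribute__ ((vector_size (16)));

static v4sf
or_float_vectors (v4sf a, v4sf b)
{
  // C has no '|' on float vectors, so the source casts to an integer
  // vector and back; the expander tries to turn this into orps again.
  v4si ai = (v4si) a;
  v4si bi = (v4si) b;
  return (v4sf) (ai | bi);
}

int main (void)
{
  v4sf a = { 1.0f, -2.0f, 3.5f, -4.25f };
  v4sf zero = { 0.0f, 0.0f, 0.0f, 0.0f };
  v4sf r = or_float_vectors (a, zero);   // OR with an all-zero pattern is identity
  for (int i = 0; i < 4; i++)
    assert (r[i] == a[i]);
  return 0;
}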
*/ + if (op1 + && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL + && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR) + && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT + && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode) + && SUBREG_BYTE (op1) == 0 + && (GET_CODE (op2) == CONST_VECTOR + || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2)) + && SUBREG_BYTE (op2) == 0)) + && can_create_pseudo_p ()) + { + rtx dst; + switch (GET_MODE (SUBREG_REG (op1))) + { + case E_V4SFmode: + case E_V8SFmode: + case E_V16SFmode: + case E_V2DFmode: + case E_V4DFmode: + case E_V8DFmode: + dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1))); + if (GET_CODE (op2) == CONST_VECTOR) + { + op2 = gen_lowpart (GET_MODE (dst), op2); + op2 = force_reg (GET_MODE (dst), op2); + } + else + { + op1 = operands[1]; + op2 = SUBREG_REG (operands[2]); + if (!vector_operand (op2, GET_MODE (dst))) + op2 = force_reg (GET_MODE (dst), op2); + } + op1 = SUBREG_REG (op1); + if (!vector_operand (op1, GET_MODE (dst))) + op1 = force_reg (GET_MODE (dst), op1); + emit_insn (gen_rtx_SET (dst, + gen_rtx_fmt_ee (code, GET_MODE (dst), + op1, op2))); + emit_move_insn (operands[0], gen_lowpart (mode, dst)); + return; + default: + break; + } + } + if (!vector_operand (operands[1], mode)) + operands[1] = force_reg (mode, operands[1]); + if (!vector_operand (operands[2], mode)) + operands[2] = force_reg (mode, operands[2]); + ix86_fixup_binary_operands_no_copy (code, mode, operands); + emit_insn (gen_rtx_SET (operands[0], + gen_rtx_fmt_ee (code, mode, operands[1], + operands[2]))); +} + +/* Return TRUE or FALSE depending on whether the binary operator meets the + appropriate constraints. */ + +bool +ix86_binary_operator_ok (enum rtx_code code, machine_mode mode, + rtx operands[3]) +{ + rtx dst = operands[0]; + rtx src1 = operands[1]; + rtx src2 = operands[2]; + + /* Both source operands cannot be in memory. */ + if (MEM_P (src1) && MEM_P (src2)) + return false; + + /* Canonicalize operand order for commutative operators. */ + if (ix86_swap_binary_operands_p (code, mode, operands)) + std::swap (src1, src2); + + /* If the destination is memory, we must have a matching source operand. */ + if (MEM_P (dst) && !rtx_equal_p (dst, src1)) + return false; + + /* Source 1 cannot be a constant. */ + if (CONSTANT_P (src1)) + return false; + + /* Source 1 cannot be a non-matching memory. */ + if (MEM_P (src1) && !rtx_equal_p (dst, src1)) + /* Support "andhi/andsi/anddi" as a zero-extending move. */ + return (code == AND + && (mode == HImode + || mode == SImode + || (TARGET_64BIT && mode == DImode)) + && satisfies_constraint_L (src2)); + + return true; +} + +/* Attempt to expand a unary operator. Make the expansion closer to the + actual machine, then just general_operand, which will allow 2 separate + memory references (one output, one input) in a single insn. */ + +void +ix86_expand_unary_operator (enum rtx_code code, machine_mode mode, + rtx operands[]) +{ + bool matching_memory = false; + rtx src, dst, op, clob; + + dst = operands[0]; + src = operands[1]; + + /* If the destination is memory, and we do not have matching source + operands, do things in registers. */ + if (MEM_P (dst)) + { + if (rtx_equal_p (dst, src)) + matching_memory = true; + else + dst = gen_reg_rtx (mode); + } + + /* When source operand is memory, destination must match. */ + if (MEM_P (src) && !matching_memory) + src = force_reg (mode, src); + + /* Emit the instruction. 
*/ + + op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src)); + + if (code == NOT) + emit_insn (op); + else + { + clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); + emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); + } + + /* Fix up the destination if needed. */ + if (dst != operands[0]) + emit_move_insn (operands[0], dst); +} + +/* Predict just emitted jump instruction to be taken with probability PROB. */ + +static void +predict_jump (int prob) +{ + rtx_insn *insn = get_last_insn (); + gcc_assert (JUMP_P (insn)); + add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob)); +} + +/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and + divisor are within the range [0-255]. */ + +void +ix86_split_idivmod (machine_mode mode, rtx operands[], + bool signed_p) +{ + rtx_code_label *end_label, *qimode_label; + rtx div, mod; + rtx_insn *insn; + rtx scratch, tmp0, tmp1, tmp2; + rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx); + rtx (*gen_zero_extend) (rtx, rtx); + rtx (*gen_test_ccno_1) (rtx, rtx); + + switch (mode) + { + case E_SImode: + if (GET_MODE (operands[0]) == SImode) + { + if (GET_MODE (operands[1]) == SImode) + gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1; + else + gen_divmod4_1 + = signed_p ? gen_divmodsi4_zext_2 : gen_udivmodsi4_zext_2; + gen_zero_extend = gen_zero_extendqisi2; + } + else + { + gen_divmod4_1 + = signed_p ? gen_divmodsi4_zext_1 : gen_udivmodsi4_zext_1; + gen_zero_extend = gen_zero_extendqidi2; + } + gen_test_ccno_1 = gen_testsi_ccno_1; + break; + case E_DImode: + gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1; + gen_test_ccno_1 = gen_testdi_ccno_1; + gen_zero_extend = gen_zero_extendqidi2; + break; + default: + gcc_unreachable (); + } + + end_label = gen_label_rtx (); + qimode_label = gen_label_rtx (); + + scratch = gen_reg_rtx (mode); + + /* Use 8bit unsigned divimod if dividend and divisor are within + the range [0-255]. */ + emit_move_insn (scratch, operands[2]); + scratch = expand_simple_binop (mode, IOR, scratch, operands[3], + scratch, 1, OPTAB_DIRECT); + emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100))); + tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG); + tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx); + tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0, + gen_rtx_LABEL_REF (VOIDmode, qimode_label), + pc_rtx); + insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0)); + predict_jump (REG_BR_PROB_BASE * 50 / 100); + JUMP_LABEL (insn) = qimode_label; + + /* Generate original signed/unsigned divimod. */ + div = gen_divmod4_1 (operands[0], operands[1], + operands[2], operands[3]); + emit_insn (div); + + /* Branch to the end. */ + emit_jump_insn (gen_jump (end_label)); + emit_barrier (); + + /* Generate 8bit unsigned divide. */ + emit_label (qimode_label); + /* Don't use operands[0] for result of 8bit divide since not all + registers support QImode ZERO_EXTRACT. 
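A scalar model of the fast path this function emits, illustrative only; since both operands stay in [0, 255], the signed and unsigned results agree, so only the unsigned divide is checked.

#include <assert.h>
#include <stdint.h>

int main (void)
{
  for (uint32_t a = 0; a < 256; a++)
    for (uint32_t b = 1; b < 256; b++)
      {
        if ((a | b) > 0xff)       // the runtime test emitted above; never
          continue;               // taken in this loop range
        uint8_t q = (uint8_t) a / (uint8_t) b;   // divb leaves the quotient in AL
        uint8_t r = (uint8_t) a % (uint8_t) b;   // and the remainder in AH
        assert (q == a / b && r == a % b);
      }
  return 0;
}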
*/ + tmp0 = lowpart_subreg (HImode, scratch, mode); + tmp1 = lowpart_subreg (HImode, operands[2], mode); + tmp2 = lowpart_subreg (QImode, operands[3], mode); + emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2)); + + if (signed_p) + { + div = gen_rtx_DIV (mode, operands[2], operands[3]); + mod = gen_rtx_MOD (mode, operands[2], operands[3]); + } + else + { + div = gen_rtx_UDIV (mode, operands[2], operands[3]); + mod = gen_rtx_UMOD (mode, operands[2], operands[3]); + } + if (mode == SImode) + { + if (GET_MODE (operands[0]) != SImode) + div = gen_rtx_ZERO_EXTEND (DImode, div); + if (GET_MODE (operands[1]) != SImode) + mod = gen_rtx_ZERO_EXTEND (DImode, mod); + } + + /* Extract remainder from AH. */ + tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), + tmp0, GEN_INT (8), GEN_INT (8)); + if (REG_P (operands[1])) + insn = emit_move_insn (operands[1], tmp1); + else + { + /* Need a new scratch register since the old one has result + of 8bit divide. */ + scratch = gen_reg_rtx (GET_MODE (operands[1])); + emit_move_insn (scratch, tmp1); + insn = emit_move_insn (operands[1], scratch); + } + set_unique_reg_note (insn, REG_EQUAL, mod); + + /* Zero extend quotient from AL. */ + tmp1 = gen_lowpart (QImode, tmp0); + insn = emit_insn (gen_zero_extend (operands[0], tmp1)); + set_unique_reg_note (insn, REG_EQUAL, div); + + emit_label (end_label); +} + +/* Emit x86 binary operand CODE in mode MODE, where the first operand + matches destination. RTX includes clobber of FLAGS_REG. */ + +void +ix86_emit_binop (enum rtx_code code, machine_mode mode, + rtx dst, rtx src) +{ + rtx op, clob; + + op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src)); + clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); + + emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); +} + +/* Return true if regno1 def is nearest to the insn. */ + +static bool +find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2) +{ + rtx_insn *prev = insn; + rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn)); + + if (insn == start) + return false; + while (prev && prev != start) + { + if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev)) + { + prev = PREV_INSN (prev); + continue; + } + if (insn_defines_reg (regno1, INVALID_REGNUM, prev)) + return true; + else if (insn_defines_reg (regno2, INVALID_REGNUM, prev)) + return false; + prev = PREV_INSN (prev); + } + + /* None of the regs is defined in the bb. */ + return false; +} + +/* Split lea instructions into a sequence of instructions + which are executed on ALU to avoid AGU stalls. + It is assumed that it is allowed to clobber flags register + at lea position. */ + +void +ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode) +{ + unsigned int regno0, regno1, regno2; + struct ix86_address parts; + rtx target, tmp; + int ok, adds; + + ok = ix86_decompose_address (operands[1], &parts); + gcc_assert (ok); + + target = gen_lowpart (mode, operands[0]); + + regno0 = true_regnum (target); + regno1 = INVALID_REGNUM; + regno2 = INVALID_REGNUM; + + if (parts.base) + { + parts.base = gen_lowpart (mode, parts.base); + regno1 = true_regnum (parts.base); + } + + if (parts.index) + { + parts.index = gen_lowpart (mode, parts.index); + regno2 = true_regnum (parts.index); + } + + if (parts.disp) + parts.disp = gen_lowpart (mode, parts.disp); + + if (parts.scale > 1) + { + /* Case r1 = r1 + ... */ + if (regno1 == regno0) + { + /* If we have a case r1 = r1 + C * r2 then we + should use multiplication which is very + expensive. 
Assume cost model is wrong if we + have such case here. */ + gcc_assert (regno2 != regno0); + + for (adds = parts.scale; adds > 0; adds--) + ix86_emit_binop (PLUS, mode, target, parts.index); + } + else + { + /* r1 = r2 + r3 * C case. Need to move r3 into r1. */ + if (regno0 != regno2) + emit_insn (gen_rtx_SET (target, parts.index)); + + /* Use shift for scaling. */ + ix86_emit_binop (ASHIFT, mode, target, + GEN_INT (exact_log2 (parts.scale))); + + if (parts.base) + ix86_emit_binop (PLUS, mode, target, parts.base); + + if (parts.disp && parts.disp != const0_rtx) + ix86_emit_binop (PLUS, mode, target, parts.disp); + } + } + else if (!parts.base && !parts.index) + { + gcc_assert(parts.disp); + emit_insn (gen_rtx_SET (target, parts.disp)); + } + else + { + if (!parts.base) + { + if (regno0 != regno2) + emit_insn (gen_rtx_SET (target, parts.index)); + } + else if (!parts.index) + { + if (regno0 != regno1) + emit_insn (gen_rtx_SET (target, parts.base)); + } + else + { + if (regno0 == regno1) + tmp = parts.index; + else if (regno0 == regno2) + tmp = parts.base; + else + { + rtx tmp1; + + /* Find better operand for SET instruction, depending + on which definition is farther from the insn. */ + if (find_nearest_reg_def (insn, regno1, regno2)) + tmp = parts.index, tmp1 = parts.base; + else + tmp = parts.base, tmp1 = parts.index; + + emit_insn (gen_rtx_SET (target, tmp)); + + if (parts.disp && parts.disp != const0_rtx) + ix86_emit_binop (PLUS, mode, target, parts.disp); + + ix86_emit_binop (PLUS, mode, target, tmp1); + return; + } + + ix86_emit_binop (PLUS, mode, target, tmp); + } + + if (parts.disp && parts.disp != const0_rtx) + ix86_emit_binop (PLUS, mode, target, parts.disp); + } +} + +/* Post-reload splitter for converting an SF or DFmode value in an + SSE register into an unsigned SImode. */ + +void +ix86_split_convert_uns_si_sse (rtx operands[]) +{ + machine_mode vecmode; + rtx value, large, zero_or_two31, input, two31, x; + + large = operands[1]; + zero_or_two31 = operands[2]; + input = operands[3]; + two31 = operands[4]; + vecmode = GET_MODE (large); + value = gen_rtx_REG (vecmode, REGNO (operands[0])); + + /* Load up the value into the low element. We must ensure that the other + elements are valid floats -- zero is the easiest such value. */ + if (MEM_P (input)) + { + if (vecmode == V4SFmode) + emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input)); + else + emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input)); + } + else + { + input = gen_rtx_REG (vecmode, REGNO (input)); + emit_move_insn (value, CONST0_RTX (vecmode)); + if (vecmode == V4SFmode) + emit_insn (gen_sse_movss (value, value, input)); + else + emit_insn (gen_sse2_movsd (value, value, input)); + } + + emit_move_insn (large, two31); + emit_move_insn (zero_or_two31, MEM_P (two31) ? 
large : two31); + + x = gen_rtx_fmt_ee (LE, vecmode, large, value); + emit_insn (gen_rtx_SET (large, x)); + + x = gen_rtx_AND (vecmode, zero_or_two31, large); + emit_insn (gen_rtx_SET (zero_or_two31, x)); + + x = gen_rtx_MINUS (vecmode, value, zero_or_two31); + emit_insn (gen_rtx_SET (value, x)); + + large = gen_rtx_REG (V4SImode, REGNO (large)); + emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31))); + + x = gen_rtx_REG (V4SImode, REGNO (value)); + if (vecmode == V4SFmode) + emit_insn (gen_fix_truncv4sfv4si2 (x, value)); + else + emit_insn (gen_sse2_cvttpd2dq (x, value)); + value = x; + + emit_insn (gen_xorv4si3 (value, value, large)); +} + +static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, + machine_mode mode, rtx target, + rtx var, int one_var); + +/* Convert an unsigned DImode value into a DFmode, using only SSE. + Expects the 64-bit DImode to be supplied in a pair of integral + registers. Requires SSE2; will use SSE3 if available. For x86_32, + -mfpmath=sse, !optimize_size only. */ + +void +ix86_expand_convert_uns_didf_sse (rtx target, rtx input) +{ + REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt; + rtx int_xmm, fp_xmm; + rtx biases, exponents; + rtx x; + + int_xmm = gen_reg_rtx (V4SImode); + if (TARGET_INTER_UNIT_MOVES_TO_VEC) + emit_insn (gen_movdi_to_sse (int_xmm, input)); + else if (TARGET_SSE_SPLIT_REGS) + { + emit_clobber (int_xmm); + emit_move_insn (gen_lowpart (DImode, int_xmm), input); + } + else + { + x = gen_reg_rtx (V2DImode); + ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0); + emit_move_insn (int_xmm, gen_lowpart (V4SImode, x)); + } + + x = gen_rtx_CONST_VECTOR (V4SImode, + gen_rtvec (4, GEN_INT (0x43300000UL), + GEN_INT (0x45300000UL), + const0_rtx, const0_rtx)); + exponents = validize_mem (force_const_mem (V4SImode, x)); + + /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */ + emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents)); + + /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm) + yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)). + Similarly (0x45300000UL ## fp_value_hi_xmm) yields + (0x1.0p84 + double(fp_value_hi_xmm)). + Note these exponents differ by 32. */ + + fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm)); + + /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values + in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */ + real_ldexp (&bias_lo_rvt, &dconst1, 52); + real_ldexp (&bias_hi_rvt, &dconst1, 84); + biases = const_double_from_real_value (bias_lo_rvt, DFmode); + x = const_double_from_real_value (bias_hi_rvt, DFmode); + biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x)); + biases = validize_mem (force_const_mem (V2DFmode, biases)); + emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases)); + + /* Add the upper and lower DFmode values together. */ + if (TARGET_SSE3) + emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm)); + else + { + x = copy_to_mode_reg (V2DFmode, fp_xmm); + emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm)); + emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x)); + } + + ix86_expand_vector_extract (false, target, fp_xmm, 0); +} + +/* Not used, but eases macroization of patterns. */ +void +ix86_expand_convert_uns_sixf_sse (rtx, rtx) +{ + gcc_unreachable (); +} + +/* Convert an unsigned SImode value into a DFmode. Only currently used + for SSE, but applicable anywhere. 
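A scalar model of the bias trick used below, with an invented helper name; it assumes two's-complement wraparound on the 32-bit add, which holds on x86.

#include <assert.h>
#include <stdint.h>

static double
uns_si_to_df (uint32_t x)
{
  // Shift the value into signed range (x - 2^31, modulo 2^32), convert it
  // with the ordinary signed int -> double instruction, then add 2^31 back
  // in floating point.  Every step is exact in double precision.
  int32_t biased = (int32_t) (x + 0x80000000u);
  return (double) biased + 2147483648.0;
}

int main (void)
{
  uint32_t tests[] = { 0u, 1u, 0x7fffffffu, 0x80000000u, 0xffffffffu };
  for (int i = 0; i < 5; i++)
    assert (uns_si_to_df (tests[i]) == (double) tests[i]);
  return 0;
}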
*/ + +void +ix86_expand_convert_uns_sidf_sse (rtx target, rtx input) +{ + REAL_VALUE_TYPE TWO31r; + rtx x, fp; + + x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1), + NULL, 1, OPTAB_DIRECT); + + fp = gen_reg_rtx (DFmode); + emit_insn (gen_floatsidf2 (fp, x)); + + real_ldexp (&TWO31r, &dconst1, 31); + x = const_double_from_real_value (TWO31r, DFmode); + + x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT); + if (x != target) + emit_move_insn (target, x); +} + +/* Convert a signed DImode value into a DFmode. Only used for SSE in + 32-bit mode; otherwise we have a direct convert instruction. */ + +void +ix86_expand_convert_sign_didf_sse (rtx target, rtx input) +{ + REAL_VALUE_TYPE TWO32r; + rtx fp_lo, fp_hi, x; + + fp_lo = gen_reg_rtx (DFmode); + fp_hi = gen_reg_rtx (DFmode); + + emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input))); + + real_ldexp (&TWO32r, &dconst1, 32); + x = const_double_from_real_value (TWO32r, DFmode); + fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT); + + ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input)); + + x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target, + 0, OPTAB_DIRECT); + if (x != target) + emit_move_insn (target, x); +} + +/* Convert an unsigned SImode value into a SFmode, using only SSE. + For x86_32, -mfpmath=sse, !optimize_size only. */ +void +ix86_expand_convert_uns_sisf_sse (rtx target, rtx input) +{ + REAL_VALUE_TYPE ONE16r; + rtx fp_hi, fp_lo, int_hi, int_lo, x; + + real_ldexp (&ONE16r, &dconst1, 16); + x = const_double_from_real_value (ONE16r, SFmode); + int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff), + NULL, 0, OPTAB_DIRECT); + int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16), + NULL, 0, OPTAB_DIRECT); + fp_hi = gen_reg_rtx (SFmode); + fp_lo = gen_reg_rtx (SFmode); + emit_insn (gen_floatsisf2 (fp_hi, int_hi)); + emit_insn (gen_floatsisf2 (fp_lo, int_lo)); + fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi, + 0, OPTAB_DIRECT); + fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target, + 0, OPTAB_DIRECT); + if (!rtx_equal_p (target, fp_hi)) + emit_move_insn (target, fp_hi); +} + +/* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert + a vector of unsigned ints VAL to vector of floats TARGET. 
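A scalar model of the 16-bit split used by the scalar SFmode expander above and by the vector expander below; the helper name is invented for this sketch.

#include <assert.h>
#include <stdint.h>

static float
uns_si_to_sf (uint32_t x)
{
  int32_t lo = (int32_t) (x & 0xffff);   // both halves are non-negative, so
  int32_t hi = (int32_t) (x >> 16);      // the signed conversion is safe
  return (float) hi * 65536.0f + (float) lo;
}

int main (void)
{
  uint32_t tests[] = { 0u, 1u, 0xffffu, 0x12340000u, 0xffffffffu };
  for (int i = 0; i < 5; i++)
    assert (uns_si_to_sf (tests[i]) == (float) tests[i]);
  return 0;
}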
*/ + +void +ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val) +{ + rtx tmp[8]; + REAL_VALUE_TYPE TWO16r; + machine_mode intmode = GET_MODE (val); + machine_mode fltmode = GET_MODE (target); + rtx (*cvt) (rtx, rtx); + + if (intmode == V4SImode) + cvt = gen_floatv4siv4sf2; + else + cvt = gen_floatv8siv8sf2; + tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff)); + tmp[0] = force_reg (intmode, tmp[0]); + tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1, + OPTAB_DIRECT); + tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16), + NULL_RTX, 1, OPTAB_DIRECT); + tmp[3] = gen_reg_rtx (fltmode); + emit_insn (cvt (tmp[3], tmp[1])); + tmp[4] = gen_reg_rtx (fltmode); + emit_insn (cvt (tmp[4], tmp[2])); + real_ldexp (&TWO16r, &dconst1, 16); + tmp[5] = const_double_from_real_value (TWO16r, SFmode); + tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5])); + tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1, + OPTAB_DIRECT); + tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1, + OPTAB_DIRECT); + if (tmp[7] != target) + emit_move_insn (target, tmp[7]); +} + +/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc* + pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*. + This is done by doing just signed conversion if < 0x1p31, and otherwise by + subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */ + +rtx +ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp) +{ + REAL_VALUE_TYPE TWO31r; + rtx two31r, tmp[4]; + machine_mode mode = GET_MODE (val); + machine_mode scalarmode = GET_MODE_INNER (mode); + machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode; + rtx (*cmp) (rtx, rtx, rtx, rtx); + int i; + + for (i = 0; i < 3; i++) + tmp[i] = gen_reg_rtx (mode); + real_ldexp (&TWO31r, &dconst1, 31); + two31r = const_double_from_real_value (TWO31r, scalarmode); + two31r = ix86_build_const_vector (mode, 1, two31r); + two31r = force_reg (mode, two31r); + switch (mode) + { + case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break; + case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break; + case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break; + case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break; + default: gcc_unreachable (); + } + tmp[3] = gen_rtx_LE (mode, two31r, val); + emit_insn (cmp (tmp[0], two31r, val, tmp[3])); + tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1], + 0, OPTAB_DIRECT); + if (intmode == V4SImode || TARGET_AVX2) + *xorp = expand_simple_binop (intmode, ASHIFT, + gen_lowpart (intmode, tmp[0]), + GEN_INT (31), NULL_RTX, 0, + OPTAB_DIRECT); + else + { + rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31); + two31 = ix86_build_const_vector (intmode, 1, two31); + *xorp = expand_simple_binop (intmode, AND, + gen_lowpart (intmode, tmp[0]), + two31, NULL_RTX, 0, + OPTAB_DIRECT); + } + return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2], + 0, OPTAB_DIRECT); +} + +/* Generate code for floating point ABS or NEG. 
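A scalar model of the mask operations generated below: ABS clears the sign bit, NEG flips it.  The helper names are invented, and memcpy stands in for the reinterpretation that is free inside an SSE register.

#include <assert.h>
#include <stdint.h>
#include <string.h>

static float
fp_abs (float x)
{
  uint32_t bits;
  memcpy (&bits, &x, sizeof bits);
  bits &= 0x7fffffffu;               // AND with the inverted sign-bit mask
  memcpy (&x, &bits, sizeof bits);
  return x;
}

static float
fp_neg (float x)
{
  uint32_t bits;
  memcpy (&bits, &x, sizeof bits);
  bits ^= 0x80000000u;               // XOR with the sign-bit mask
  memcpy (&x, &bits, sizeof bits);
  return x;
}

int main (void)
{
  assert (fp_abs (-1.5f) == 1.5f && fp_abs (1.5f) == 1.5f);
  assert (fp_neg (2.0f) == -2.0f && fp_neg (-2.0f) == 2.0f);
  return 0;
}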
*/ + +void +ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode, + rtx operands[]) +{ + rtx mask, set, dst, src; + bool use_sse = false; + bool vector_mode = VECTOR_MODE_P (mode); + machine_mode vmode = mode; + + if (vector_mode) + use_sse = true; + else if (mode == TFmode) + use_sse = true; + else if (TARGET_SSE_MATH) + { + use_sse = SSE_FLOAT_MODE_P (mode); + if (mode == SFmode) + vmode = V4SFmode; + else if (mode == DFmode) + vmode = V2DFmode; + } + + /* NEG and ABS performed with SSE use bitwise mask operations. + Create the appropriate mask now. */ + if (use_sse) + mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS); + else + mask = NULL_RTX; + + dst = operands[0]; + src = operands[1]; + + set = gen_rtx_fmt_e (code, mode, src); + set = gen_rtx_SET (dst, set); + + if (mask) + { + rtx use, clob; + rtvec par; + + use = gen_rtx_USE (VOIDmode, mask); + if (vector_mode) + par = gen_rtvec (2, set, use); + else + { + clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); + par = gen_rtvec (3, set, use, clob); + } + emit_insn (gen_rtx_PARALLEL (VOIDmode, par)); + } + else + emit_insn (set); +} + +/* Expand a copysign operation. Special case operand 0 being a constant. */ + +void +ix86_expand_copysign (rtx operands[]) +{ + machine_mode mode, vmode; + rtx dest, op0, op1, mask, nmask; + + dest = operands[0]; + op0 = operands[1]; + op1 = operands[2]; + + mode = GET_MODE (dest); + + if (mode == SFmode) + vmode = V4SFmode; + else if (mode == DFmode) + vmode = V2DFmode; + else + vmode = mode; + + if (CONST_DOUBLE_P (op0)) + { + rtx (*copysign_insn)(rtx, rtx, rtx, rtx); + + if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0))) + op0 = simplify_unary_operation (ABS, mode, op0, mode); + + if (mode == SFmode || mode == DFmode) + { + if (op0 == CONST0_RTX (mode)) + op0 = CONST0_RTX (vmode); + else + { + rtx v = ix86_build_const_vector (vmode, false, op0); + + op0 = force_reg (vmode, v); + } + } + else if (op0 != CONST0_RTX (mode)) + op0 = force_reg (mode, op0); + + mask = ix86_build_signbit_mask (vmode, 0, 0); + + if (mode == SFmode) + copysign_insn = gen_copysignsf3_const; + else if (mode == DFmode) + copysign_insn = gen_copysigndf3_const; + else + copysign_insn = gen_copysigntf3_const; + + emit_insn (copysign_insn (dest, op0, op1, mask)); + } + else + { + rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx); + + nmask = ix86_build_signbit_mask (vmode, 0, 1); + mask = ix86_build_signbit_mask (vmode, 0, 0); + + if (mode == SFmode) + copysign_insn = gen_copysignsf3_var; + else if (mode == DFmode) + copysign_insn = gen_copysigndf3_var; + else + copysign_insn = gen_copysigntf3_var; + + emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask)); + } +} + +/* Deconstruct a copysign operation into bit masks. Operand 0 is known to + be a constant, and so has already been expanded into a vector constant. */ + +void +ix86_split_copysign_const (rtx operands[]) +{ + machine_mode mode, vmode; + rtx dest, op0, mask, x; + + dest = operands[0]; + op0 = operands[1]; + mask = operands[3]; + + mode = GET_MODE (dest); + vmode = GET_MODE (mask); + + dest = lowpart_subreg (vmode, dest, mode); + x = gen_rtx_AND (vmode, dest, mask); + emit_insn (gen_rtx_SET (dest, x)); + + if (op0 != CONST0_RTX (vmode)) + { + x = gen_rtx_IOR (vmode, dest, op0); + emit_insn (gen_rtx_SET (dest, x)); + } +} + +/* Deconstruct a copysign operation into bit masks. Operand 0 is variable, + so we have to do two masks. 
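A scalar model of the two-mask form handled below: keep the magnitude bits of one operand and the sign bit of the other, then OR them together (helper name invented for this sketch).

#include <assert.h>
#include <stdint.h>
#include <string.h>

static double
copysign_masks (double mag, double sgn)
{
  const uint64_t sign = 0x8000000000000000ull;
  uint64_t m, s;
  memcpy (&m, &mag, sizeof m);
  memcpy (&s, &sgn, sizeof s);
  uint64_t bits = (m & ~sign) | (s & sign);   // and with nmask, and with mask, or
  double r;
  memcpy (&r, &bits, sizeof r);
  return r;
}

int main (void)
{
  assert (copysign_masks (3.0, -1.0) == -3.0);
  assert (copysign_masks (-3.0, 1.0) == 3.0);
  return 0;
}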
*/ + +void +ix86_split_copysign_var (rtx operands[]) +{ + machine_mode mode, vmode; + rtx dest, scratch, op0, op1, mask, nmask, x; + + dest = operands[0]; + scratch = operands[1]; + op0 = operands[2]; + op1 = operands[3]; + nmask = operands[4]; + mask = operands[5]; + + mode = GET_MODE (dest); + vmode = GET_MODE (mask); + + if (rtx_equal_p (op0, op1)) + { + /* Shouldn't happen often (it's useless, obviously), but when it does + we'd generate incorrect code if we continue below. */ + emit_move_insn (dest, op0); + return; + } + + if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */ + { + gcc_assert (REGNO (op1) == REGNO (scratch)); + + x = gen_rtx_AND (vmode, scratch, mask); + emit_insn (gen_rtx_SET (scratch, x)); + + dest = mask; + op0 = lowpart_subreg (vmode, op0, mode); + x = gen_rtx_NOT (vmode, dest); + x = gen_rtx_AND (vmode, x, op0); + emit_insn (gen_rtx_SET (dest, x)); + } + else + { + if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */ + { + x = gen_rtx_AND (vmode, scratch, mask); + } + else /* alternative 2,4 */ + { + gcc_assert (REGNO (mask) == REGNO (scratch)); + op1 = lowpart_subreg (vmode, op1, mode); + x = gen_rtx_AND (vmode, scratch, op1); + } + emit_insn (gen_rtx_SET (scratch, x)); + + if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */ + { + dest = lowpart_subreg (vmode, op0, mode); + x = gen_rtx_AND (vmode, dest, nmask); + } + else /* alternative 3,4 */ + { + gcc_assert (REGNO (nmask) == REGNO (dest)); + dest = nmask; + op0 = lowpart_subreg (vmode, op0, mode); + x = gen_rtx_AND (vmode, dest, op0); + } + emit_insn (gen_rtx_SET (dest, x)); + } + + x = gen_rtx_IOR (vmode, dest, scratch); + emit_insn (gen_rtx_SET (dest, x)); +} + +/* Expand an xorsign operation. */ + +void +ix86_expand_xorsign (rtx operands[]) +{ + rtx (*xorsign_insn)(rtx, rtx, rtx, rtx); + machine_mode mode, vmode; + rtx dest, op0, op1, mask; + + dest = operands[0]; + op0 = operands[1]; + op1 = operands[2]; + + mode = GET_MODE (dest); + + if (mode == SFmode) + { + xorsign_insn = gen_xorsignsf3_1; + vmode = V4SFmode; + } + else if (mode == DFmode) + { + xorsign_insn = gen_xorsigndf3_1; + vmode = V2DFmode; + } + else + gcc_unreachable (); + + mask = ix86_build_signbit_mask (vmode, 0, 0); + + emit_insn (xorsign_insn (dest, op0, op1, mask)); +} + +/* Deconstruct an xorsign operation into bit masks. */ + +void +ix86_split_xorsign (rtx operands[]) +{ + machine_mode mode, vmode; + rtx dest, op0, mask, x; + + dest = operands[0]; + op0 = operands[1]; + mask = operands[3]; + + mode = GET_MODE (dest); + vmode = GET_MODE (mask); + + dest = lowpart_subreg (vmode, dest, mode); + x = gen_rtx_AND (vmode, dest, mask); + emit_insn (gen_rtx_SET (dest, x)); + + op0 = lowpart_subreg (vmode, op0, mode); + x = gen_rtx_XOR (vmode, dest, op0); + emit_insn (gen_rtx_SET (dest, x)); +} + +static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1); + +void +ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label) +{ + machine_mode mode = GET_MODE (op0); + rtx tmp; + + /* Handle special case - vector comparsion with boolean result, transform + it using ptest instruction. */ + if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + { + rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG); + machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode; + + gcc_assert (code == EQ || code == NE); + /* Generate XOR since we can't check that one operand is zero vector. 
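The same idea written with intrinsics, purely as an illustration (invented helper name, needs SSE4.1, e.g. -msse4.1): XOR the operands and let ptest set ZF when every bit of the difference is zero.

#include <assert.h>
#include <smmintrin.h>

static int
vectors_equal (__m128i a, __m128i b)
{
  __m128i diff = _mm_xor_si128 (a, b);    // pxor
  return _mm_testz_si128 (diff, diff);    // ptest: ZF set iff diff is all zero
}

int main (void)
{
  __m128i x = _mm_set1_epi8 (42);
  __m128i y = _mm_set1_epi8 (42);
  __m128i z = _mm_set1_epi8 (7);
  assert (vectors_equal (x, y));
  assert (!vectors_equal (x, z));
  return 0;
}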
*/ + tmp = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1))); + tmp = gen_lowpart (p_mode, tmp); + emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG), + gen_rtx_UNSPEC (CCmode, + gen_rtvec (2, tmp, tmp), + UNSPEC_PTEST))); + tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx); + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, + gen_rtx_LABEL_REF (VOIDmode, label), + pc_rtx); + emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); + return; + } + + switch (mode) + { + case E_SFmode: + case E_DFmode: + case E_XFmode: + case E_QImode: + case E_HImode: + case E_SImode: + simple: + tmp = ix86_expand_compare (code, op0, op1); + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, + gen_rtx_LABEL_REF (VOIDmode, label), + pc_rtx); + emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); + return; + + case E_DImode: + if (TARGET_64BIT) + goto simple; + /* For 32-bit target DI comparison may be performed on + SSE registers. To allow this we should avoid split + to SI mode which is achieved by doing xor in DI mode + and then comparing with zero (which is recognized by + STV pass). We don't compare using xor when optimizing + for size. */ + if (!optimize_insn_for_size_p () + && TARGET_STV + && (code == EQ || code == NE)) + { + op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1)); + op1 = const0_rtx; + } + /* FALLTHRU */ + case E_TImode: + /* Expand DImode branch into multiple compare+branch. */ + { + rtx lo[2], hi[2]; + rtx_code_label *label2; + enum rtx_code code1, code2, code3; + machine_mode submode; + + if (CONSTANT_P (op0) && !CONSTANT_P (op1)) + { + std::swap (op0, op1); + code = swap_condition (code); + } + + split_double_mode (mode, &op0, 1, lo+0, hi+0); + split_double_mode (mode, &op1, 1, lo+1, hi+1); + + submode = mode == DImode ? SImode : DImode; + + /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to + avoid two branches. This costs one extra insn, so disable when + optimizing for size. */ + + if ((code == EQ || code == NE) + && (!optimize_insn_for_size_p () + || hi[1] == const0_rtx || lo[1] == const0_rtx)) + { + rtx xor0, xor1; + + xor1 = hi[0]; + if (hi[1] != const0_rtx) + xor1 = expand_binop (submode, xor_optab, xor1, hi[1], + NULL_RTX, 0, OPTAB_WIDEN); + + xor0 = lo[0]; + if (lo[1] != const0_rtx) + xor0 = expand_binop (submode, xor_optab, xor0, lo[1], + NULL_RTX, 0, OPTAB_WIDEN); + + tmp = expand_binop (submode, ior_optab, xor1, xor0, + NULL_RTX, 0, OPTAB_WIDEN); + + ix86_expand_branch (code, tmp, const0_rtx, label); + return; + } + + /* Otherwise, if we are doing less-than or greater-or-equal-than, + op1 is a constant and the low word is zero, then we can just + examine the high word. Similarly for low word -1 and + less-or-equal-than or greater-than. */ + + if (CONST_INT_P (hi[1])) + switch (code) + { + case LT: case LTU: case GE: case GEU: + if (lo[1] == const0_rtx) + { + ix86_expand_branch (code, hi[0], hi[1], label); + return; + } + break; + case LE: case LEU: case GT: case GTU: + if (lo[1] == constm1_rtx) + { + ix86_expand_branch (code, hi[0], hi[1], label); + return; + } + break; + default: + break; + } + + /* Emulate comparisons that do not depend on Zero flag with + double-word subtraction. Note that only Overflow, Sign + and Carry flags are valid, so swap arguments and condition + of comparisons that would otherwise test Zero flag. 
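A scalar model of the resulting cmp/sbb pair for the unsigned less-than case, with invented names: the borrow out of the low-word comparison feeds the high-word subtract-with-borrow, and the final borrow is the answer.

#include <assert.h>
#include <stdint.h>

static int
ltu_doubleword (uint32_t alo, uint32_t ahi, uint32_t blo, uint32_t bhi)
{
  unsigned borrow = alo < blo;                       // cmp lo0, lo1
  return (uint64_t) ahi < (uint64_t) bhi + borrow;   // sbb hi0, hi1; test CF
}

int main (void)
{
  uint64_t tests[] = { 0, 1, 0xffffffffull, 0x100000000ull,
                       0x1ffffffffull, 0xffffffffffffffffull };
  for (int i = 0; i < 6; i++)
    for (int j = 0; j < 6; j++)
      {
        uint64_t a = tests[i], b = tests[j];
        assert (ltu_doubleword ((uint32_t) a, (uint32_t) (a >> 32),
                                (uint32_t) b, (uint32_t) (b >> 32))
                == (a < b));
      }
  return 0;
}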
*/ + + switch (code) + { + case LE: case LEU: case GT: case GTU: + std::swap (lo[0], lo[1]); + std::swap (hi[0], hi[1]); + code = swap_condition (code); + /* FALLTHRU */ + + case LT: case LTU: case GE: case GEU: + { + rtx (*cmp_insn) (rtx, rtx); + rtx (*sbb_insn) (rtx, rtx, rtx); + bool uns = (code == LTU || code == GEU); + + if (TARGET_64BIT) + { + cmp_insn = gen_cmpdi_1; + sbb_insn + = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz; + } + else + { + cmp_insn = gen_cmpsi_1; + sbb_insn + = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz; + } + + if (!nonimmediate_operand (lo[0], submode)) + lo[0] = force_reg (submode, lo[0]); + if (!x86_64_general_operand (lo[1], submode)) + lo[1] = force_reg (submode, lo[1]); + + if (!register_operand (hi[0], submode)) + hi[0] = force_reg (submode, hi[0]); + if ((uns && !nonimmediate_operand (hi[1], submode)) + || (!uns && !x86_64_general_operand (hi[1], submode))) + hi[1] = force_reg (submode, hi[1]); + + emit_insn (cmp_insn (lo[0], lo[1])); + emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1])); + + tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG); + + ix86_expand_branch (code, tmp, const0_rtx, label); + return; + } + + default: + break; + } + + /* Otherwise, we need two or three jumps. */ + + label2 = gen_label_rtx (); + + code1 = code; + code2 = swap_condition (code); + code3 = unsigned_condition (code); + + switch (code) + { + case LT: case GT: case LTU: case GTU: + break; + + case LE: code1 = LT; code2 = GT; break; + case GE: code1 = GT; code2 = LT; break; + case LEU: code1 = LTU; code2 = GTU; break; + case GEU: code1 = GTU; code2 = LTU; break; + + case EQ: code1 = UNKNOWN; code2 = NE; break; + case NE: code2 = UNKNOWN; break; + + default: + gcc_unreachable (); + } + + /* + * a < b => + * if (hi(a) < hi(b)) goto true; + * if (hi(a) > hi(b)) goto false; + * if (lo(a) < lo(b)) goto true; + * false: + */ + + if (code1 != UNKNOWN) + ix86_expand_branch (code1, hi[0], hi[1], label); + if (code2 != UNKNOWN) + ix86_expand_branch (code2, hi[0], hi[1], label2); + + ix86_expand_branch (code3, lo[0], lo[1], label); + + if (code2 != UNKNOWN) + emit_label (label2); + return; + } + + default: + gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC); + goto simple; + } +} + +/* Figure out whether to use unordered fp comparisons. */ + +static bool +ix86_unordered_fp_compare (enum rtx_code code) +{ + if (!TARGET_IEEE_FP) + return false; + + switch (code) + { + case GT: + case GE: + case LT: + case LE: + return false; + + case EQ: + case NE: + + case LTGT: + case UNORDERED: + case ORDERED: + case UNLT: + case UNLE: + case UNGT: + case UNGE: + case UNEQ: + return true; + + default: + gcc_unreachable (); + } +} + +/* Return a comparison we can do and that it is equivalent to + swap_condition (code) apart possibly from orderedness. + But, never change orderedness if TARGET_IEEE_FP, returning + UNKNOWN in that case if necessary. */ + +static enum rtx_code +ix86_fp_swap_condition (enum rtx_code code) +{ + switch (code) + { + case GT: /* GTU - CF=0 & ZF=0 */ + return TARGET_IEEE_FP ? UNKNOWN : UNLT; + case GE: /* GEU - CF=0 */ + return TARGET_IEEE_FP ? UNKNOWN : UNLE; + case UNLT: /* LTU - CF=1 */ + return TARGET_IEEE_FP ? UNKNOWN : GT; + case UNLE: /* LEU - CF=1 | ZF=1 */ + return TARGET_IEEE_FP ? UNKNOWN : GE; + default: + return swap_condition (code); + } +} + +/* Return cost of comparison CODE using the best strategy for performance. + All following functions do use number of instructions as a cost metrics. 
+ In future this should be tweaked to compute bytes for optimize_size and + take into account performance of various instructions on various CPUs. */ + +static int +ix86_fp_comparison_cost (enum rtx_code code) +{ + int arith_cost; + + /* The cost of code using bit-twiddling on %ah. */ + switch (code) + { + case UNLE: + case UNLT: + case LTGT: + case GT: + case GE: + case UNORDERED: + case ORDERED: + case UNEQ: + arith_cost = 4; + break; + case LT: + case NE: + case EQ: + case UNGE: + arith_cost = TARGET_IEEE_FP ? 5 : 4; + break; + case LE: + case UNGT: + arith_cost = TARGET_IEEE_FP ? 6 : 4; + break; + default: + gcc_unreachable (); + } + + switch (ix86_fp_comparison_strategy (code)) + { + case IX86_FPCMP_COMI: + return arith_cost > 4 ? 3 : 2; + case IX86_FPCMP_SAHF: + return arith_cost > 4 ? 4 : 3; + default: + return arith_cost; + } +} + +/* Swap, force into registers, or otherwise massage the two operands + to a fp comparison. The operands are updated in place; the new + comparison code is returned. */ + +static enum rtx_code +ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1) +{ + bool unordered_compare = ix86_unordered_fp_compare (code); + rtx op0 = *pop0, op1 = *pop1; + machine_mode op_mode = GET_MODE (op0); + bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode); + + /* All of the unordered compare instructions only work on registers. + The same is true of the fcomi compare instructions. The XFmode + compare instructions require registers except when comparing + against zero or when converting operand 1 from fixed point to + floating point. */ + + if (!is_sse + && (unordered_compare + || (op_mode == XFmode + && ! (standard_80387_constant_p (op0) == 1 + || standard_80387_constant_p (op1) == 1) + && GET_CODE (op1) != FLOAT) + || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI)) + { + op0 = force_reg (op_mode, op0); + op1 = force_reg (op_mode, op1); + } + else + { + /* %%% We only allow op1 in memory; op0 must be st(0). So swap + things around if they appear profitable, otherwise force op0 + into a register. */ + + if (standard_80387_constant_p (op0) == 0 + || (MEM_P (op0) + && ! (standard_80387_constant_p (op1) == 0 + || MEM_P (op1)))) + { + enum rtx_code new_code = ix86_fp_swap_condition (code); + if (new_code != UNKNOWN) + { + std::swap (op0, op1); + code = new_code; + } + } + + if (!REG_P (op0)) + op0 = force_reg (op_mode, op0); + + if (CONSTANT_P (op1)) + { + int tmp = standard_80387_constant_p (op1); + if (tmp == 0) + op1 = validize_mem (force_const_mem (op_mode, op1)); + else if (tmp == 1) + { + if (TARGET_CMOVE) + op1 = force_reg (op_mode, op1); + } + else + op1 = force_reg (op_mode, op1); + } + } + + /* Try to rearrange the comparison to make it cheaper. */ + if (ix86_fp_comparison_cost (code) + > ix86_fp_comparison_cost (swap_condition (code)) + && (REG_P (op1) || can_create_pseudo_p ())) + { + std::swap (op0, op1); + code = swap_condition (code); + if (!REG_P (op0)) + op0 = force_reg (op_mode, op0); + } + + *pop0 = op0; + *pop1 = op1; + return code; +} + +/* Generate insn patterns to do a floating point compare of OPERANDS. 
*/ + +static rtx +ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1) +{ + bool unordered_compare = ix86_unordered_fp_compare (code); + machine_mode cmp_mode; + rtx tmp, scratch; + + code = ix86_prepare_fp_compare_args (code, &op0, &op1); + + tmp = gen_rtx_COMPARE (CCFPmode, op0, op1); + if (unordered_compare) + tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP); + + /* Do fcomi/sahf based test when profitable. */ + switch (ix86_fp_comparison_strategy (code)) + { + case IX86_FPCMP_COMI: + cmp_mode = CCFPmode; + emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp)); + break; + + case IX86_FPCMP_SAHF: + cmp_mode = CCFPmode; + tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW); + scratch = gen_reg_rtx (HImode); + emit_insn (gen_rtx_SET (scratch, tmp)); + emit_insn (gen_x86_sahf_1 (scratch)); + break; + + case IX86_FPCMP_ARITH: + cmp_mode = CCNOmode; + tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW); + scratch = gen_reg_rtx (HImode); + emit_insn (gen_rtx_SET (scratch, tmp)); + + /* In the unordered case, we have to check C2 for NaN's, which + doesn't happen to work out to anything nice combination-wise. + So do some bit twiddling on the value we've got in AH to come + up with an appropriate set of condition codes. */ + + switch (code) + { + case GT: + case UNGT: + if (code == GT || !TARGET_IEEE_FP) + { + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45))); + code = EQ; + } + else + { + emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); + emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx)); + emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44))); + cmp_mode = CCmode; + code = GEU; + } + break; + case LT: + case UNLT: + if (code == LT && TARGET_IEEE_FP) + { + emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); + emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx)); + cmp_mode = CCmode; + code = EQ; + } + else + { + emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx)); + code = NE; + } + break; + case GE: + case UNGE: + if (code == GE || !TARGET_IEEE_FP) + { + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05))); + code = EQ; + } + else + { + emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); + emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx)); + code = NE; + } + break; + case LE: + case UNLE: + if (code == LE && TARGET_IEEE_FP) + { + emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); + emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx)); + emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40))); + cmp_mode = CCmode; + code = LTU; + } + else + { + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45))); + code = NE; + } + break; + case EQ: + case UNEQ: + if (code == EQ && TARGET_IEEE_FP) + { + emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); + emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40))); + cmp_mode = CCmode; + code = EQ; + } + else + { + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40))); + code = NE; + } + break; + case NE: + case LTGT: + if (code == NE && TARGET_IEEE_FP) + { + emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); + emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, + GEN_INT (0x40))); + code = NE; + } + else + { + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40))); + code = EQ; + } + break; + + case UNORDERED: + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04))); + code = NE; + break; + case ORDERED: + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT 
(0x04))); + code = EQ; + break; + + default: + gcc_unreachable (); + } + break; + + default: + gcc_unreachable(); + } + + /* Return the test that should be put into the flags user, i.e. + the bcc, scc, or cmov instruction. */ + return gen_rtx_fmt_ee (code, VOIDmode, + gen_rtx_REG (cmp_mode, FLAGS_REG), + const0_rtx); +} + +/* Generate insn patterns to do an integer compare of OPERANDS. */ + +static rtx +ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1) +{ + machine_mode cmpmode; + rtx tmp, flags; + + cmpmode = SELECT_CC_MODE (code, op0, op1); + flags = gen_rtx_REG (cmpmode, FLAGS_REG); + + /* This is very simple, but making the interface the same as in the + FP case makes the rest of the code easier. */ + tmp = gen_rtx_COMPARE (cmpmode, op0, op1); + emit_insn (gen_rtx_SET (flags, tmp)); + + /* Return the test that should be put into the flags user, i.e. + the bcc, scc, or cmov instruction. */ + return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx); +} + +static rtx +ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1) +{ + rtx ret; + + if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC) + ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1); + + else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0))) + { + gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0))); + ret = ix86_expand_fp_compare (code, op0, op1); + } + else + ret = ix86_expand_int_compare (code, op0, op1); + + return ret; +} + +void +ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1) +{ + rtx ret; + + gcc_assert (GET_MODE (dest) == QImode); + + ret = ix86_expand_compare (code, op0, op1); + PUT_MODE (ret, QImode); + emit_insn (gen_rtx_SET (dest, ret)); +} + +/* Expand comparison setting or clearing carry flag. Return true when + successful and set pop for the operation. */ +static bool +ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop) +{ + machine_mode mode + = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1); + + /* Do not handle double-mode compares that go through special path. */ + if (mode == (TARGET_64BIT ? TImode : DImode)) + return false; + + if (SCALAR_FLOAT_MODE_P (mode)) + { + rtx compare_op; + rtx_insn *compare_seq; + + gcc_assert (!DECIMAL_FLOAT_MODE_P (mode)); + + /* Shortcut: following common codes never translate + into carry flag compares. */ + if (code == EQ || code == NE || code == UNEQ || code == LTGT + || code == ORDERED || code == UNORDERED) + return false; + + /* These comparisons require zero flag; swap operands so they won't. */ + if ((code == GT || code == UNLE || code == LE || code == UNGT) + && !TARGET_IEEE_FP) + { + std::swap (op0, op1); + code = swap_condition (code); + } + + /* Try to expand the comparison and verify that we end up with + carry flag based comparison. This fails to be true only when + we decide to expand comparison using arithmetic that is not + too common scenario. */ + start_sequence (); + compare_op = ix86_expand_fp_compare (code, op0, op1); + compare_seq = get_insns (); + end_sequence (); + + if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode) + code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op)); + else + code = GET_CODE (compare_op); + + if (code != LTU && code != GEU) + return false; + + emit_insn (compare_seq); + *pop = compare_op; + return true; + } + + if (!INTEGRAL_MODE_P (mode)) + return false; + + switch (code) + { + case LTU: + case GEU: + break; + + /* Convert a==0 into (unsigned)a<1. */ + case EQ: + case NE: + if (op1 != const0_rtx) + return false; + op1 = const1_rtx; + code = (code == EQ ? 
LTU : GEU); + break; + + /* Convert a>b into b=b-1. */ + case GTU: + case LEU: + if (CONST_INT_P (op1)) + { + op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0)); + /* Bail out on overflow. We still can swap operands but that + would force loading of the constant into register. */ + if (op1 == const0_rtx + || !x86_64_immediate_operand (op1, GET_MODE (op1))) + return false; + code = (code == GTU ? GEU : LTU); + } + else + { + std::swap (op0, op1); + code = (code == GTU ? LTU : GEU); + } + break; + + /* Convert a>=0 into (unsigned)a<0x80000000. */ + case LT: + case GE: + if (mode == DImode || op1 != const0_rtx) + return false; + op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode); + code = (code == LT ? GEU : LTU); + break; + case LE: + case GT: + if (mode == DImode || op1 != constm1_rtx) + return false; + op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode); + code = (code == LE ? GEU : LTU); + break; + + default: + return false; + } + /* Swapping operands may cause constant to appear as first operand. */ + if (!nonimmediate_operand (op0, VOIDmode)) + { + if (!can_create_pseudo_p ()) + return false; + op0 = force_reg (mode, op0); + } + *pop = ix86_expand_compare (code, op0, op1); + gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU); + return true; +} + +/* Expand conditional increment or decrement using adb/sbb instructions. + The default case using setcc followed by the conditional move can be + done by generic code. */ +bool +ix86_expand_int_addcc (rtx operands[]) +{ + enum rtx_code code = GET_CODE (operands[1]); + rtx flags; + rtx (*insn)(rtx, rtx, rtx, rtx, rtx); + rtx compare_op; + rtx val = const0_rtx; + bool fpcmp = false; + machine_mode mode; + rtx op0 = XEXP (operands[1], 0); + rtx op1 = XEXP (operands[1], 1); + + if (operands[3] != const1_rtx + && operands[3] != constm1_rtx) + return false; + if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op)) + return false; + code = GET_CODE (compare_op); + + flags = XEXP (compare_op, 0); + + if (GET_MODE (flags) == CCFPmode) + { + fpcmp = true; + code = ix86_fp_compare_code_to_integer (code); + } + + if (code != LTU) + { + val = constm1_rtx; + if (fpcmp) + PUT_CODE (compare_op, + reverse_condition_maybe_unordered + (GET_CODE (compare_op))); + else + PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op))); + } + + mode = GET_MODE (operands[0]); + + /* Construct either adc or sbb insn. 
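The rewrites done by ix86_expand_carry_flag_compare above all reduce the condition to a plain unsigned comparison, which is what the adc/sbb sequences built here consume from the carry flag; spelled out for 32-bit scalars (sketch only, helper name invented).

#include <assert.h>
#include <stdint.h>

static void
check_rewrites (int32_t a)
{
  uint32_t ua = (uint32_t) a;

  assert ((a == 0)  == (ua < 1u));              // EQ 0   ->  LTU 1
  assert ((a != 0)  == (ua >= 1u));             // NE 0   ->  GEU 1
  assert ((a >= 0)  == (ua < 0x80000000u));     // GE 0   ->  LTU 2^31
  assert ((a < 0)   == (ua >= 0x80000000u));    // LT 0   ->  GEU 2^31
  assert ((a > -1)  == (ua < 0x80000000u));     // GT -1  ->  LTU 2^31
  assert ((a <= -1) == (ua >= 0x80000000u));    // LE -1  ->  GEU 2^31
}

int main (void)
{
  int32_t tests[] = { 0, 1, -1, 42, -42, INT32_MAX, INT32_MIN };
  for (int i = 0; i < 7; i++)
    check_rewrites (tests[i]);
  return 0;
}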
*/ + if ((code == LTU) == (operands[3] == constm1_rtx)) + { + switch (mode) + { + case E_QImode: + insn = gen_subqi3_carry; + break; + case E_HImode: + insn = gen_subhi3_carry; + break; + case E_SImode: + insn = gen_subsi3_carry; + break; + case E_DImode: + insn = gen_subdi3_carry; + break; + default: + gcc_unreachable (); + } + } + else + { + switch (mode) + { + case E_QImode: + insn = gen_addqi3_carry; + break; + case E_HImode: + insn = gen_addhi3_carry; + break; + case E_SImode: + insn = gen_addsi3_carry; + break; + case E_DImode: + insn = gen_adddi3_carry; + break; + default: + gcc_unreachable (); + } + } + emit_insn (insn (operands[0], operands[2], val, flags, compare_op)); + + return true; +} + +bool +ix86_expand_int_movcc (rtx operands[]) +{ + enum rtx_code code = GET_CODE (operands[1]), compare_code; + rtx_insn *compare_seq; + rtx compare_op; + machine_mode mode = GET_MODE (operands[0]); + bool sign_bit_compare_p = false; + rtx op0 = XEXP (operands[1], 0); + rtx op1 = XEXP (operands[1], 1); + + if (GET_MODE (op0) == TImode + || (GET_MODE (op0) == DImode + && !TARGET_64BIT)) + return false; + + start_sequence (); + compare_op = ix86_expand_compare (code, op0, op1); + compare_seq = get_insns (); + end_sequence (); + + compare_code = GET_CODE (compare_op); + + if ((op1 == const0_rtx && (code == GE || code == LT)) + || (op1 == constm1_rtx && (code == GT || code == LE))) + sign_bit_compare_p = true; + + /* Don't attempt mode expansion here -- if we had to expand 5 or 6 + HImode insns, we'd be swallowed in word prefix ops. */ + + if ((mode != HImode || TARGET_FAST_PREFIX) + && (mode != (TARGET_64BIT ? TImode : DImode)) + && CONST_INT_P (operands[2]) + && CONST_INT_P (operands[3])) + { + rtx out = operands[0]; + HOST_WIDE_INT ct = INTVAL (operands[2]); + HOST_WIDE_INT cf = INTVAL (operands[3]); + HOST_WIDE_INT diff; + + diff = ct - cf; + /* Sign bit compares are better done using shifts than we do by using + sbb. */ + if (sign_bit_compare_p + || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op)) + { + /* Detect overlap between destination and compare sources. */ + rtx tmp = out; + + if (!sign_bit_compare_p) + { + rtx flags; + bool fpcmp = false; + + compare_code = GET_CODE (compare_op); + + flags = XEXP (compare_op, 0); + + if (GET_MODE (flags) == CCFPmode) + { + fpcmp = true; + compare_code + = ix86_fp_compare_code_to_integer (compare_code); + } + + /* To simplify rest of code, restrict to the GEU case. */ + if (compare_code == LTU) + { + std::swap (ct, cf); + compare_code = reverse_condition (compare_code); + code = reverse_condition (code); + } + else + { + if (fpcmp) + PUT_CODE (compare_op, + reverse_condition_maybe_unordered + (GET_CODE (compare_op))); + else + PUT_CODE (compare_op, + reverse_condition (GET_CODE (compare_op))); + } + diff = ct - cf; + + if (reg_overlap_mentioned_p (out, op0) + || reg_overlap_mentioned_p (out, op1)) + tmp = gen_reg_rtx (mode); + + if (mode == DImode) + emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op)); + else + emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp), + flags, compare_op)); + } + else + { + if (code == GT || code == GE) + code = reverse_condition (code); + else + { + std::swap (ct, cf); + diff = ct - cf; + } + tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1); + } + + if (diff == 1) + { + /* + * cmpl op0,op1 + * sbbl dest,dest + * [addl dest, ct] + * + * Size 5 - 8. 
+ */ + if (ct) + tmp = expand_simple_binop (mode, PLUS, + tmp, GEN_INT (ct), + copy_rtx (tmp), 1, OPTAB_DIRECT); + } + else if (cf == -1) + { + /* + * cmpl op0,op1 + * sbbl dest,dest + * orl $ct, dest + * + * Size 8. + */ + tmp = expand_simple_binop (mode, IOR, + tmp, GEN_INT (ct), + copy_rtx (tmp), 1, OPTAB_DIRECT); + } + else if (diff == -1 && ct) + { + /* + * cmpl op0,op1 + * sbbl dest,dest + * notl dest + * [addl dest, cf] + * + * Size 8 - 11. + */ + tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1); + if (cf) + tmp = expand_simple_binop (mode, PLUS, + copy_rtx (tmp), GEN_INT (cf), + copy_rtx (tmp), 1, OPTAB_DIRECT); + } + else + { + /* + * cmpl op0,op1 + * sbbl dest,dest + * [notl dest] + * andl cf - ct, dest + * [addl dest, ct] + * + * Size 8 - 11. + */ + + if (cf == 0) + { + cf = ct; + ct = 0; + tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1); + } + + tmp = expand_simple_binop (mode, AND, + copy_rtx (tmp), + gen_int_mode (cf - ct, mode), + copy_rtx (tmp), 1, OPTAB_DIRECT); + if (ct) + tmp = expand_simple_binop (mode, PLUS, + copy_rtx (tmp), GEN_INT (ct), + copy_rtx (tmp), 1, OPTAB_DIRECT); + } + + if (!rtx_equal_p (tmp, out)) + emit_move_insn (copy_rtx (out), copy_rtx (tmp)); + + return true; + } + + if (diff < 0) + { + machine_mode cmp_mode = GET_MODE (op0); + enum rtx_code new_code; + + if (SCALAR_FLOAT_MODE_P (cmp_mode)) + { + gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode)); + + /* We may be reversing unordered compare to normal compare, that + is not valid in general (we may convert non-trapping condition + to trapping one), however on i386 we currently emit all + comparisons unordered. */ + new_code = reverse_condition_maybe_unordered (code); + } + else + new_code = ix86_reverse_condition (code, cmp_mode); + if (new_code != UNKNOWN) + { + std::swap (ct, cf); + diff = -diff; + code = new_code; + } + } + + compare_code = UNKNOWN; + if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT + && CONST_INT_P (op1)) + { + if (op1 == const0_rtx + && (code == LT || code == GE)) + compare_code = code; + else if (op1 == constm1_rtx) + { + if (code == LE) + compare_code = LT; + else if (code == GT) + compare_code = GE; + } + } + + /* Optimize dest = (op0 < 0) ? -1 : cf. */ + if (compare_code != UNKNOWN + && GET_MODE (op0) == GET_MODE (out) + && (cf == -1 || ct == -1)) + { + /* If lea code below could be used, only optimize + if it results in a 2 insn sequence. */ + + if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8 + || diff == 3 || diff == 5 || diff == 9) + || (compare_code == LT && ct == -1) + || (compare_code == GE && cf == -1)) + { + /* + * notl op1 (if necessary) + * sarl $31, op1 + * orl cf, op1 + */ + if (ct != -1) + { + cf = ct; + ct = -1; + code = reverse_condition (code); + } + + out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1); + + out = expand_simple_binop (mode, IOR, + out, GEN_INT (cf), + out, 1, OPTAB_DIRECT); + if (out != operands[0]) + emit_move_insn (operands[0], out); + + return true; + } + } + + + if ((diff == 1 || diff == 2 || diff == 4 || diff == 8 + || diff == 3 || diff == 5 || diff == 9) + && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL) + && (mode != DImode + || x86_64_immediate_operand (GEN_INT (cf), VOIDmode))) + { + /* + * xorl dest,dest + * cmpl op1,op2 + * setcc dest + * lea cf(dest*(ct-cf)),dest + * + * Size 14. + * + * This also catches the degenerate setcc-only case. 
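A C model of this setcc+lea sequence, with invented names; the scaling folds into a single lea (or disappears entirely) when ct - cf is 1, 2, 3, 4, 5, 8 or 9, which is what the diff test just above checks.

#include <assert.h>

static int
select_lea (int op1, int op2, int ct, int cf)
{
  int flag = (op1 < op2);           // xorl dest,dest; cmpl op1,op2; setcc dest
  return cf + flag * (ct - cf);     // lea cf(dest*(ct-cf)),dest
}

int main (void)
{
  assert (select_lea (1, 2, 10, 20) == 10);   // condition true  -> ct
  assert (select_lea (3, 2, 10, 20) == 20);   // condition false -> cf
  return 0;
}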
+ */ + + rtx tmp; + int nops; + + out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1); + + nops = 0; + /* On x86_64 the lea instruction operates on Pmode, so we need + to get arithmetics done in proper mode to match. */ + if (diff == 1) + tmp = copy_rtx (out); + else + { + rtx out1; + out1 = copy_rtx (out); + tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1)); + nops++; + if (diff & 1) + { + tmp = gen_rtx_PLUS (mode, tmp, out1); + nops++; + } + } + if (cf != 0) + { + tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf)); + nops++; + } + if (!rtx_equal_p (tmp, out)) + { + if (nops == 1) + out = force_operand (tmp, copy_rtx (out)); + else + emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp))); + } + if (!rtx_equal_p (out, operands[0])) + emit_move_insn (operands[0], copy_rtx (out)); + + return true; + } + + /* + * General case: Jumpful: + * xorl dest,dest cmpl op1, op2 + * cmpl op1, op2 movl ct, dest + * setcc dest jcc 1f + * decl dest movl cf, dest + * andl (cf-ct),dest 1: + * addl ct,dest + * + * Size 20. Size 14. + * + * This is reasonably steep, but branch mispredict costs are + * high on modern cpus, so consider failing only if optimizing + * for space. + */ + + if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL)) + && BRANCH_COST (optimize_insn_for_speed_p (), + false) >= 2) + { + if (cf == 0) + { + machine_mode cmp_mode = GET_MODE (op0); + enum rtx_code new_code; + + if (SCALAR_FLOAT_MODE_P (cmp_mode)) + { + gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode)); + + /* We may be reversing unordered compare to normal compare, + that is not valid in general (we may convert non-trapping + condition to trapping one), however on i386 we currently + emit all comparisons unordered. */ + new_code = reverse_condition_maybe_unordered (code); + } + else + { + new_code = ix86_reverse_condition (code, cmp_mode); + if (compare_code != UNKNOWN && new_code != UNKNOWN) + compare_code = reverse_condition (compare_code); + } + + if (new_code != UNKNOWN) + { + cf = ct; + ct = 0; + code = new_code; + } + } + + if (compare_code != UNKNOWN) + { + /* notl op1 (if needed) + sarl $31, op1 + andl (cf-ct), op1 + addl ct, op1 + + For x < 0 (resp. x <= -1) there will be no notl, + so if possible swap the constants to get rid of the + complement. + True/false will be -1/0 while code below (store flag + followed by decrement) is 0/-1, so the constants need + to be exchanged once more. */ + + if (compare_code == GE || !cf) + { + code = reverse_condition (code); + compare_code = LT; + } + else + std::swap (ct, cf); + + out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1); + } + else + { + out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1); + + out = expand_simple_binop (mode, PLUS, copy_rtx (out), + constm1_rtx, + copy_rtx (out), 1, OPTAB_DIRECT); + } + + out = expand_simple_binop (mode, AND, copy_rtx (out), + gen_int_mode (cf - ct, mode), + copy_rtx (out), 1, OPTAB_DIRECT); + if (ct) + out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct), + copy_rtx (out), 1, OPTAB_DIRECT); + if (!rtx_equal_p (out, operands[0])) + emit_move_insn (operands[0], copy_rtx (out)); + + return true; + } + } + + if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL)) + { + /* Try a few things more with specific constants and a variable. 
*/ + + optab op; + rtx var, orig_out, out, tmp; + + if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2) + return false; + + /* If one of the two operands is an interesting constant, load a + constant with the above and mask it in with a logical operation. */ + + if (CONST_INT_P (operands[2])) + { + var = operands[3]; + if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx) + operands[3] = constm1_rtx, op = and_optab; + else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx) + operands[3] = const0_rtx, op = ior_optab; + else + return false; + } + else if (CONST_INT_P (operands[3])) + { + var = operands[2]; + if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx) + operands[2] = constm1_rtx, op = and_optab; + else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx) + operands[2] = const0_rtx, op = ior_optab; + else + return false; + } + else + return false; + + orig_out = operands[0]; + tmp = gen_reg_rtx (mode); + operands[0] = tmp; + + /* Recurse to get the constant loaded. */ + if (!ix86_expand_int_movcc (operands)) + return false; + + /* Mask in the interesting variable. */ + out = expand_binop (mode, op, var, tmp, orig_out, 0, + OPTAB_WIDEN); + if (!rtx_equal_p (out, orig_out)) + emit_move_insn (copy_rtx (orig_out), copy_rtx (out)); + + return true; + } + + /* + * For comparison with above, + * + * movl cf,dest + * movl ct,tmp + * cmpl op1,op2 + * cmovcc tmp,dest + * + * Size 15. + */ + + if (! nonimmediate_operand (operands[2], mode)) + operands[2] = force_reg (mode, operands[2]); + if (! nonimmediate_operand (operands[3], mode)) + operands[3] = force_reg (mode, operands[3]); + + if (! register_operand (operands[2], VOIDmode) + && (mode == QImode + || ! register_operand (operands[3], VOIDmode))) + operands[2] = force_reg (mode, operands[2]); + + if (mode == QImode + && ! register_operand (operands[3], VOIDmode)) + operands[3] = force_reg (mode, operands[3]); + + emit_insn (compare_seq); + emit_insn (gen_rtx_SET (operands[0], + gen_rtx_IF_THEN_ELSE (mode, + compare_op, operands[2], + operands[3]))); + return true; +} + +/* Detect conditional moves that exactly match min/max operational + semantics. Note that this is IEEE safe, as long as we don't + interchange the operands. + + Returns FALSE if this conditional move doesn't match a MIN/MAX, + and TRUE if the operation is successful and instructions are emitted. */ + +static bool +ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0, + rtx cmp_op1, rtx if_true, rtx if_false) +{ + machine_mode mode; + bool is_min; + rtx tmp; + + if (code == LT) + ; + else if (code == UNGE) + std::swap (if_true, if_false); + else + return false; + + if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false)) + is_min = true; + else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false)) + is_min = false; + else + return false; + + mode = GET_MODE (dest); + + /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here, + but MODE may be a vector mode and thus not appropriate. */ + if (!flag_finite_math_only || flag_signed_zeros) + { + int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX; + rtvec v; + + if_true = force_reg (mode, if_true); + v = gen_rtvec (2, if_true, if_false); + tmp = gen_rtx_UNSPEC (mode, v, u); + } + else + { + code = is_min ? 
SMIN : SMAX; + if (MEM_P (if_true) && MEM_P (if_false)) + if_true = force_reg (mode, if_true); + tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false); + } + + emit_insn (gen_rtx_SET (dest, tmp)); + return true; +} + +/* Expand an SSE comparison. Return the register with the result. */ + +static rtx +ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1, + rtx op_true, rtx op_false) +{ + machine_mode mode = GET_MODE (dest); + machine_mode cmp_ops_mode = GET_MODE (cmp_op0); + + /* In general case result of comparison can differ from operands' type. */ + machine_mode cmp_mode; + + /* In AVX512F the result of comparison is an integer mask. */ + bool maskcmp = false; + rtx x; + + if (GET_MODE_SIZE (cmp_ops_mode) == 64) + { + unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode); + cmp_mode = int_mode_for_size (nbits, 0).require (); + maskcmp = true; + } + else + cmp_mode = cmp_ops_mode; + + cmp_op0 = force_reg (cmp_ops_mode, cmp_op0); + + int (*op1_predicate)(rtx, machine_mode) + = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand; + + if (!op1_predicate (cmp_op1, cmp_ops_mode)) + cmp_op1 = force_reg (cmp_ops_mode, cmp_op1); + + if (optimize + || (maskcmp && cmp_mode != mode) + || (op_true && reg_overlap_mentioned_p (dest, op_true)) + || (op_false && reg_overlap_mentioned_p (dest, op_false))) + dest = gen_reg_rtx (maskcmp ? cmp_mode : mode); + + /* Compare patterns for int modes are unspec in AVX512F only. */ + if (maskcmp && (code == GT || code == EQ)) + { + rtx (*gen)(rtx, rtx, rtx); + + switch (cmp_ops_mode) + { + case E_V64QImode: + gcc_assert (TARGET_AVX512BW); + gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1; + break; + case E_V32HImode: + gcc_assert (TARGET_AVX512BW); + gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1; + break; + case E_V16SImode: + gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1; + break; + case E_V8DImode: + gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1; + break; + default: + gen = NULL; + } + + if (gen) + { + emit_insn (gen (dest, cmp_op0, cmp_op1)); + return dest; + } + } + x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1); + + if (cmp_mode != mode && !maskcmp) + { + x = force_reg (cmp_ops_mode, x); + convert_move (dest, x, false); + } + else + emit_insn (gen_rtx_SET (dest, x)); + + return dest; +} + +/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical + operations. This is used for both scalar and vector conditional moves. */ + +void +ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) +{ + machine_mode mode = GET_MODE (dest); + machine_mode cmpmode = GET_MODE (cmp); + + /* In AVX512F the result of comparison is an integer mask. */ + bool maskcmp = (mode != cmpmode && TARGET_AVX512F); + + rtx t2, t3, x; + + /* If we have an integer mask and FP value then we need + to cast mask to FP mode. 
*/ + if (mode != cmpmode && VECTOR_MODE_P (cmpmode)) + { + cmp = force_reg (cmpmode, cmp); + cmp = gen_rtx_SUBREG (mode, cmp, 0); + } + + if (maskcmp) + { + rtx (*gen) (rtx, rtx) = NULL; + if ((op_true == CONST0_RTX (mode) + && vector_all_ones_operand (op_false, mode)) + || (op_false == CONST0_RTX (mode) + && vector_all_ones_operand (op_true, mode))) + switch (mode) + { + case E_V64QImode: + if (TARGET_AVX512BW) + gen = gen_avx512bw_cvtmask2bv64qi; + break; + case E_V32QImode: + if (TARGET_AVX512VL && TARGET_AVX512BW) + gen = gen_avx512vl_cvtmask2bv32qi; + break; + case E_V16QImode: + if (TARGET_AVX512VL && TARGET_AVX512BW) + gen = gen_avx512vl_cvtmask2bv16qi; + break; + case E_V32HImode: + if (TARGET_AVX512BW) + gen = gen_avx512bw_cvtmask2wv32hi; + break; + case E_V16HImode: + if (TARGET_AVX512VL && TARGET_AVX512BW) + gen = gen_avx512vl_cvtmask2wv16hi; + break; + case E_V8HImode: + if (TARGET_AVX512VL && TARGET_AVX512BW) + gen = gen_avx512vl_cvtmask2wv8hi; + break; + case E_V16SImode: + if (TARGET_AVX512DQ) + gen = gen_avx512f_cvtmask2dv16si; + break; + case E_V8SImode: + if (TARGET_AVX512VL && TARGET_AVX512DQ) + gen = gen_avx512vl_cvtmask2dv8si; + break; + case E_V4SImode: + if (TARGET_AVX512VL && TARGET_AVX512DQ) + gen = gen_avx512vl_cvtmask2dv4si; + break; + case E_V8DImode: + if (TARGET_AVX512DQ) + gen = gen_avx512f_cvtmask2qv8di; + break; + case E_V4DImode: + if (TARGET_AVX512VL && TARGET_AVX512DQ) + gen = gen_avx512vl_cvtmask2qv4di; + break; + case E_V2DImode: + if (TARGET_AVX512VL && TARGET_AVX512DQ) + gen = gen_avx512vl_cvtmask2qv2di; + break; + default: + break; + } + if (gen && SCALAR_INT_MODE_P (cmpmode)) + { + cmp = force_reg (cmpmode, cmp); + if (op_true == CONST0_RTX (mode)) + { + rtx (*gen_not) (rtx, rtx); + switch (cmpmode) + { + case E_QImode: gen_not = gen_knotqi; break; + case E_HImode: gen_not = gen_knothi; break; + case E_SImode: gen_not = gen_knotsi; break; + case E_DImode: gen_not = gen_knotdi; break; + default: gcc_unreachable (); + } + rtx n = gen_reg_rtx (cmpmode); + emit_insn (gen_not (n, cmp)); + cmp = n; + } + emit_insn (gen (dest, cmp)); + return; + } + } + else if (vector_all_ones_operand (op_true, mode) + && op_false == CONST0_RTX (mode)) + { + emit_insn (gen_rtx_SET (dest, cmp)); + return; + } + else if (op_false == CONST0_RTX (mode)) + { + op_true = force_reg (mode, op_true); + x = gen_rtx_AND (mode, cmp, op_true); + emit_insn (gen_rtx_SET (dest, x)); + return; + } + else if (op_true == CONST0_RTX (mode)) + { + op_false = force_reg (mode, op_false); + x = gen_rtx_NOT (mode, cmp); + x = gen_rtx_AND (mode, x, op_false); + emit_insn (gen_rtx_SET (dest, x)); + return; + } + else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)) + { + op_false = force_reg (mode, op_false); + x = gen_rtx_IOR (mode, cmp, op_false); + emit_insn (gen_rtx_SET (dest, x)); + return; + } + else if (TARGET_XOP) + { + op_true = force_reg (mode, op_true); + + if (!nonimmediate_operand (op_false, mode)) + op_false = force_reg (mode, op_false); + + emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp, + op_true, + op_false))); + return; + } + + rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; + rtx d = dest; + + if (!vector_operand (op_true, mode)) + op_true = force_reg (mode, op_true); + + op_false = force_reg (mode, op_false); + + switch (mode) + { + case E_V4SFmode: + if (TARGET_SSE4_1) + gen = gen_sse4_1_blendvps; + break; + case E_V2DFmode: + if (TARGET_SSE4_1) + gen = gen_sse4_1_blendvpd; + break; + case E_SFmode: + if (TARGET_SSE4_1) + { + gen = gen_sse4_1_blendvss; + 
op_true = force_reg (mode, op_true); + } + break; + case E_DFmode: + if (TARGET_SSE4_1) + { + gen = gen_sse4_1_blendvsd; + op_true = force_reg (mode, op_true); + } + break; + case E_V16QImode: + case E_V8HImode: + case E_V4SImode: + case E_V2DImode: + if (TARGET_SSE4_1) + { + gen = gen_sse4_1_pblendvb; + if (mode != V16QImode) + d = gen_reg_rtx (V16QImode); + op_false = gen_lowpart (V16QImode, op_false); + op_true = gen_lowpart (V16QImode, op_true); + cmp = gen_lowpart (V16QImode, cmp); + } + break; + case E_V8SFmode: + if (TARGET_AVX) + gen = gen_avx_blendvps256; + break; + case E_V4DFmode: + if (TARGET_AVX) + gen = gen_avx_blendvpd256; + break; + case E_V32QImode: + case E_V16HImode: + case E_V8SImode: + case E_V4DImode: + if (TARGET_AVX2) + { + gen = gen_avx2_pblendvb; + if (mode != V32QImode) + d = gen_reg_rtx (V32QImode); + op_false = gen_lowpart (V32QImode, op_false); + op_true = gen_lowpart (V32QImode, op_true); + cmp = gen_lowpart (V32QImode, cmp); + } + break; + + case E_V64QImode: + gen = gen_avx512bw_blendmv64qi; + break; + case E_V32HImode: + gen = gen_avx512bw_blendmv32hi; + break; + case E_V16SImode: + gen = gen_avx512f_blendmv16si; + break; + case E_V8DImode: + gen = gen_avx512f_blendmv8di; + break; + case E_V8DFmode: + gen = gen_avx512f_blendmv8df; + break; + case E_V16SFmode: + gen = gen_avx512f_blendmv16sf; + break; + + default: + break; + } + + if (gen != NULL) + { + emit_insn (gen (d, op_false, op_true, cmp)); + if (d != dest) + emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d)); + } + else + { + op_true = force_reg (mode, op_true); + + t2 = gen_reg_rtx (mode); + if (optimize) + t3 = gen_reg_rtx (mode); + else + t3 = dest; + + x = gen_rtx_AND (mode, op_true, cmp); + emit_insn (gen_rtx_SET (t2, x)); + + x = gen_rtx_NOT (mode, cmp); + x = gen_rtx_AND (mode, x, op_false); + emit_insn (gen_rtx_SET (t3, x)); + + x = gen_rtx_IOR (mode, t3, t2); + emit_insn (gen_rtx_SET (dest, x)); + } +} + +/* Swap, force into registers, or otherwise massage the two operands + to an sse comparison with a mask result. Thus we differ a bit from + ix86_prepare_fp_compare_args which expects to produce a flags result. + + The DEST operand exists to help determine whether to commute commutative + operators. The POP0/POP1 operands are updated in place. The new + comparison code is returned, or UNKNOWN if not implementable. */ + +static enum rtx_code +ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code, + rtx *pop0, rtx *pop1) +{ + switch (code) + { + case LTGT: + case UNEQ: + /* AVX supports all the needed comparisons. */ + if (TARGET_AVX) + break; + /* We have no LTGT as an operator. We could implement it with + NE & ORDERED, but this requires an extra temporary. It's + not clear that it's worth it. */ + return UNKNOWN; + + case LT: + case LE: + case UNGT: + case UNGE: + /* These are supported directly. */ + break; + + case EQ: + case NE: + case UNORDERED: + case ORDERED: + /* AVX has 3 operand comparisons, no need to swap anything. */ + if (TARGET_AVX) + break; + /* For commutative operators, try to canonicalize the destination + operand to be first in the comparison - this helps reload to + avoid extra moves. */ + if (!dest || !rtx_equal_p (dest, *pop1)) + break; + /* FALLTHRU */ + + case GE: + case GT: + case UNLE: + case UNLT: + /* These are not supported directly before AVX, and furthermore + ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the + comparison operands to transform into something that is + supported. 
*/ + std::swap (*pop0, *pop1); + code = swap_condition (code); + break; + + default: + gcc_unreachable (); + } + + return code; +} + +/* Expand a floating-point conditional move. Return true if successful. */ + +bool +ix86_expand_fp_movcc (rtx operands[]) +{ + machine_mode mode = GET_MODE (operands[0]); + enum rtx_code code = GET_CODE (operands[1]); + rtx tmp, compare_op; + rtx op0 = XEXP (operands[1], 0); + rtx op1 = XEXP (operands[1], 1); + + if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode)) + { + machine_mode cmode; + + /* Since we've no cmove for sse registers, don't force bad register + allocation just to gain access to it. Deny movcc when the + comparison mode doesn't match the move mode. */ + cmode = GET_MODE (op0); + if (cmode == VOIDmode) + cmode = GET_MODE (op1); + if (cmode != mode) + return false; + + code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1); + if (code == UNKNOWN) + return false; + + if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1, + operands[2], operands[3])) + return true; + + tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1, + operands[2], operands[3]); + ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]); + return true; + } + + if (GET_MODE (op0) == TImode + || (GET_MODE (op0) == DImode + && !TARGET_64BIT)) + return false; + + /* The floating point conditional move instructions don't directly + support conditions resulting from a signed integer comparison. */ + + compare_op = ix86_expand_compare (code, op0, op1); + if (!fcmov_comparison_operator (compare_op, VOIDmode)) + { + tmp = gen_reg_rtx (QImode); + ix86_expand_setcc (tmp, code, op0, op1); + + compare_op = ix86_expand_compare (NE, tmp, const0_rtx); + } + + emit_insn (gen_rtx_SET (operands[0], + gen_rtx_IF_THEN_ELSE (mode, compare_op, + operands[2], operands[3]))); + + return true; +} + +/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */ + +static int +ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code) +{ + switch (code) + { + case EQ: + return 0; + case LT: + case LTU: + return 1; + case LE: + case LEU: + return 2; + case NE: + return 4; + case GE: + case GEU: + return 5; + case GT: + case GTU: + return 6; + default: + gcc_unreachable (); + } +} + +/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */ + +static int +ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code) +{ + switch (code) + { + case EQ: + return 0x00; + case NE: + return 0x04; + case GT: + return 0x0e; + case LE: + return 0x02; + case GE: + return 0x0d; + case LT: + return 0x01; + case UNLE: + return 0x0a; + case UNLT: + return 0x09; + case UNGE: + return 0x05; + case UNGT: + return 0x06; + case UNEQ: + return 0x18; + case LTGT: + return 0x0c; + case ORDERED: + return 0x07; + case UNORDERED: + return 0x03; + default: + gcc_unreachable (); + } +} + +/* Return immediate value to be used in UNSPEC_PCMP + for comparison CODE in MODE. */ + +static int +ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode) +{ + if (FLOAT_MODE_P (mode)) + return ix86_fp_cmp_code_to_pcmp_immediate (code); + return ix86_int_cmp_code_to_pcmp_immediate (code); +} + +/* Expand AVX-512 vector comparison. 
*/ + +bool +ix86_expand_mask_vec_cmp (rtx operands[]) +{ + machine_mode mask_mode = GET_MODE (operands[0]); + machine_mode cmp_mode = GET_MODE (operands[2]); + enum rtx_code code = GET_CODE (operands[1]); + rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode)); + int unspec_code; + rtx unspec; + + switch (code) + { + case LEU: + case GTU: + case GEU: + case LTU: + unspec_code = UNSPEC_UNSIGNED_PCMP; + break; + + default: + unspec_code = UNSPEC_PCMP; + } + + unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2], + operands[3], imm), + unspec_code); + emit_insn (gen_rtx_SET (operands[0], unspec)); + + return true; +} + +/* Expand fp vector comparison. */ + +bool +ix86_expand_fp_vec_cmp (rtx operands[]) +{ + enum rtx_code code = GET_CODE (operands[1]); + rtx cmp; + + code = ix86_prepare_sse_fp_compare_args (operands[0], code, + &operands[2], &operands[3]); + if (code == UNKNOWN) + { + rtx temp; + switch (GET_CODE (operands[1])) + { + case LTGT: + temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2], + operands[3], NULL, NULL); + cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2], + operands[3], NULL, NULL); + code = AND; + break; + case UNEQ: + temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2], + operands[3], NULL, NULL); + cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2], + operands[3], NULL, NULL); + code = IOR; + break; + default: + gcc_unreachable (); + } + cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1, + OPTAB_DIRECT); + } + else + cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3], + operands[1], operands[2]); + + if (operands[0] != cmp) + emit_move_insn (operands[0], cmp); + + return true; +} + +static rtx +ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1, + rtx op_true, rtx op_false, bool *negate) +{ + machine_mode data_mode = GET_MODE (dest); + machine_mode mode = GET_MODE (cop0); + rtx x; + + *negate = false; + + /* XOP supports all of the comparisons on all 128-bit vector int types. */ + if (TARGET_XOP + && (mode == V16QImode || mode == V8HImode + || mode == V4SImode || mode == V2DImode)) + ; + else + { + /* Canonicalize the comparison to EQ, GT, GTU. */ + switch (code) + { + case EQ: + case GT: + case GTU: + break; + + case NE: + case LE: + case LEU: + code = reverse_condition (code); + *negate = true; + break; + + case GE: + case GEU: + code = reverse_condition (code); + *negate = true; + /* FALLTHRU */ + + case LT: + case LTU: + std::swap (cop0, cop1); + code = swap_condition (code); + break; + + default: + gcc_unreachable (); + } + + /* Only SSE4.1/SSE4.2 supports V2DImode. */ + if (mode == V2DImode) + { + switch (code) + { + case EQ: + /* SSE4.1 supports EQ. */ + if (!TARGET_SSE4_1) + return NULL; + break; + + case GT: + case GTU: + /* SSE4.2 supports GT/GTU. */ + if (!TARGET_SSE4_2) + return NULL; + break; + + default: + gcc_unreachable (); + } + } + + rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode); + rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode); + if (*negate) + std::swap (optrue, opfalse); + + /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when + not using integer masks into min (x, y) == x ? -1 : 0 (i.e. + min (x, y) == x). While we add one instruction (the minimum), + we remove the need for two instructions in the negation, as the + result is done this way. + When using masks, do it for SI/DImode element types, as it is shorter + than the two subtractions. 
*/ + if ((code != EQ + && GET_MODE_SIZE (mode) != 64 + && vector_all_ones_operand (opfalse, data_mode) + && optrue == CONST0_RTX (data_mode)) + || (code == GTU + && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4 + /* Don't do it if not using integer masks and we'd end up with + the right values in the registers though. */ + && (GET_MODE_SIZE (mode) == 64 + || !vector_all_ones_operand (optrue, data_mode) + || opfalse != CONST0_RTX (data_mode)))) + { + rtx (*gen) (rtx, rtx, rtx) = NULL; + + switch (mode) + { + case E_V16SImode: + gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3; + break; + case E_V8DImode: + gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3; + cop0 = force_reg (mode, cop0); + cop1 = force_reg (mode, cop1); + break; + case E_V32QImode: + if (TARGET_AVX2) + gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3; + break; + case E_V16HImode: + if (TARGET_AVX2) + gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3; + break; + case E_V8SImode: + if (TARGET_AVX2) + gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3; + break; + case E_V4DImode: + if (TARGET_AVX512VL) + { + gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3; + cop0 = force_reg (mode, cop0); + cop1 = force_reg (mode, cop1); + } + break; + case E_V16QImode: + if (code == GTU && TARGET_SSE2) + gen = gen_uminv16qi3; + else if (code == GT && TARGET_SSE4_1) + gen = gen_sminv16qi3; + break; + case E_V8HImode: + if (code == GTU && TARGET_SSE4_1) + gen = gen_uminv8hi3; + else if (code == GT && TARGET_SSE2) + gen = gen_sminv8hi3; + break; + case E_V4SImode: + if (TARGET_SSE4_1) + gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3; + break; + case E_V2DImode: + if (TARGET_AVX512VL) + { + gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3; + cop0 = force_reg (mode, cop0); + cop1 = force_reg (mode, cop1); + } + break; + default: + break; + } + + if (gen) + { + rtx tem = gen_reg_rtx (mode); + if (!vector_operand (cop0, mode)) + cop0 = force_reg (mode, cop0); + if (!vector_operand (cop1, mode)) + cop1 = force_reg (mode, cop1); + *negate = !*negate; + emit_insn (gen (tem, cop0, cop1)); + cop1 = tem; + code = EQ; + } + } + + /* Unsigned parallel compare is not supported by the hardware. + Play some tricks to turn this into a signed comparison + against 0. */ + if (code == GTU) + { + cop0 = force_reg (mode, cop0); + + switch (mode) + { + case E_V16SImode: + case E_V8DImode: + case E_V8SImode: + case E_V4DImode: + case E_V4SImode: + case E_V2DImode: + { + rtx t1, t2, mask; + rtx (*gen_sub3) (rtx, rtx, rtx); + + switch (mode) + { + case E_V16SImode: gen_sub3 = gen_subv16si3; break; + case E_V8DImode: gen_sub3 = gen_subv8di3; break; + case E_V8SImode: gen_sub3 = gen_subv8si3; break; + case E_V4DImode: gen_sub3 = gen_subv4di3; break; + case E_V4SImode: gen_sub3 = gen_subv4si3; break; + case E_V2DImode: gen_sub3 = gen_subv2di3; break; + default: + gcc_unreachable (); + } + /* Subtract (-(INT MAX) - 1) from both operands to make + them signed. */ + mask = ix86_build_signbit_mask (mode, true, false); + t1 = gen_reg_rtx (mode); + emit_insn (gen_sub3 (t1, cop0, mask)); + + t2 = gen_reg_rtx (mode); + emit_insn (gen_sub3 (t2, cop1, mask)); + + cop0 = t1; + cop1 = t2; + code = GT; + } + break; + + case E_V64QImode: + case E_V32HImode: + case E_V32QImode: + case E_V16HImode: + case E_V16QImode: + case E_V8HImode: + /* Perform a parallel unsigned saturating subtraction. 
*/ + x = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0, + cop1))); + + cop0 = x; + cop1 = CONST0_RTX (mode); + code = EQ; + *negate = !*negate; + break; + + default: + gcc_unreachable (); + } + } + } + + if (*negate) + std::swap (op_true, op_false); + + /* Allow the comparison to be done in one mode, but the movcc to + happen in another mode. */ + if (data_mode == mode) + { + x = ix86_expand_sse_cmp (dest, code, cop0, cop1, + op_true, op_false); + } + else + { + gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode)); + x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1, + op_true, op_false); + if (GET_MODE (x) == mode) + x = gen_lowpart (data_mode, x); + } + + return x; +} + +/* Expand integer vector comparison. */ + +bool +ix86_expand_int_vec_cmp (rtx operands[]) +{ + rtx_code code = GET_CODE (operands[1]); + bool negate = false; + rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2], + operands[3], NULL, NULL, &negate); + + if (!cmp) + return false; + + if (negate) + cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp, + CONST0_RTX (GET_MODE (cmp)), + NULL, NULL, &negate); + + gcc_assert (!negate); + + if (operands[0] != cmp) + emit_move_insn (operands[0], cmp); + + return true; +} + +/* Expand a floating-point vector conditional move; a vcond operation + rather than a movcc operation. */ + +bool +ix86_expand_fp_vcond (rtx operands[]) +{ + enum rtx_code code = GET_CODE (operands[3]); + rtx cmp; + + code = ix86_prepare_sse_fp_compare_args (operands[0], code, + &operands[4], &operands[5]); + if (code == UNKNOWN) + { + rtx temp; + switch (GET_CODE (operands[3])) + { + case LTGT: + temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4], + operands[5], operands[0], operands[0]); + cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4], + operands[5], operands[1], operands[2]); + code = AND; + break; + case UNEQ: + temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4], + operands[5], operands[0], operands[0]); + cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4], + operands[5], operands[1], operands[2]); + code = IOR; + break; + default: + gcc_unreachable (); + } + cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1, + OPTAB_DIRECT); + ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]); + return true; + } + + if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4], + operands[5], operands[1], operands[2])) + return true; + + cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5], + operands[1], operands[2]); + ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]); + return true; +} + +/* Expand a signed/unsigned integral vector conditional move. */ + +bool +ix86_expand_int_vcond (rtx operands[]) +{ + machine_mode data_mode = GET_MODE (operands[0]); + machine_mode mode = GET_MODE (operands[4]); + enum rtx_code code = GET_CODE (operands[3]); + bool negate = false; + rtx x, cop0, cop1; + + cop0 = operands[4]; + cop1 = operands[5]; + + /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31 + and x < 0 ? 1 : 0 into (unsigned) x >> 31. 
*/ + if ((code == LT || code == GE) + && data_mode == mode + && cop1 == CONST0_RTX (mode) + && operands[1 + (code == LT)] == CONST0_RTX (data_mode) + && GET_MODE_UNIT_SIZE (data_mode) > 1 + && GET_MODE_UNIT_SIZE (data_mode) <= 8 + && (GET_MODE_SIZE (data_mode) == 16 + || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32))) + { + rtx negop = operands[2 - (code == LT)]; + int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1; + if (negop == CONST1_RTX (data_mode)) + { + rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift), + operands[0], 1, OPTAB_DIRECT); + if (res != operands[0]) + emit_move_insn (operands[0], res); + return true; + } + else if (GET_MODE_INNER (data_mode) != DImode + && vector_all_ones_operand (negop, data_mode)) + { + rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift), + operands[0], 0, OPTAB_DIRECT); + if (res != operands[0]) + emit_move_insn (operands[0], res); + return true; + } + } + + if (!nonimmediate_operand (cop1, mode)) + cop1 = force_reg (mode, cop1); + if (!general_operand (operands[1], data_mode)) + operands[1] = force_reg (data_mode, operands[1]); + if (!general_operand (operands[2], data_mode)) + operands[2] = force_reg (data_mode, operands[2]); + + x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1, + operands[1], operands[2], &negate); + + if (!x) + return false; + + ix86_expand_sse_movcc (operands[0], x, operands[1+negate], + operands[2-negate]); + return true; +} + +static bool +ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1, + struct expand_vec_perm_d *d) +{ + /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const + expander, so args are either in d, or in op0, op1 etc. */ + machine_mode mode = GET_MODE (d ? d->op0 : op0); + machine_mode maskmode = mode; + rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; + + switch (mode) + { + case E_V8HImode: + if (TARGET_AVX512VL && TARGET_AVX512BW) + gen = gen_avx512vl_vpermt2varv8hi3; + break; + case E_V16HImode: + if (TARGET_AVX512VL && TARGET_AVX512BW) + gen = gen_avx512vl_vpermt2varv16hi3; + break; + case E_V64QImode: + if (TARGET_AVX512VBMI) + gen = gen_avx512bw_vpermt2varv64qi3; + break; + case E_V32HImode: + if (TARGET_AVX512BW) + gen = gen_avx512bw_vpermt2varv32hi3; + break; + case E_V4SImode: + if (TARGET_AVX512VL) + gen = gen_avx512vl_vpermt2varv4si3; + break; + case E_V8SImode: + if (TARGET_AVX512VL) + gen = gen_avx512vl_vpermt2varv8si3; + break; + case E_V16SImode: + if (TARGET_AVX512F) + gen = gen_avx512f_vpermt2varv16si3; + break; + case E_V4SFmode: + if (TARGET_AVX512VL) + { + gen = gen_avx512vl_vpermt2varv4sf3; + maskmode = V4SImode; + } + break; + case E_V8SFmode: + if (TARGET_AVX512VL) + { + gen = gen_avx512vl_vpermt2varv8sf3; + maskmode = V8SImode; + } + break; + case E_V16SFmode: + if (TARGET_AVX512F) + { + gen = gen_avx512f_vpermt2varv16sf3; + maskmode = V16SImode; + } + break; + case E_V2DImode: + if (TARGET_AVX512VL) + gen = gen_avx512vl_vpermt2varv2di3; + break; + case E_V4DImode: + if (TARGET_AVX512VL) + gen = gen_avx512vl_vpermt2varv4di3; + break; + case E_V8DImode: + if (TARGET_AVX512F) + gen = gen_avx512f_vpermt2varv8di3; + break; + case E_V2DFmode: + if (TARGET_AVX512VL) + { + gen = gen_avx512vl_vpermt2varv2df3; + maskmode = V2DImode; + } + break; + case E_V4DFmode: + if (TARGET_AVX512VL) + { + gen = gen_avx512vl_vpermt2varv4df3; + maskmode = V4DImode; + } + break; + case E_V8DFmode: + if (TARGET_AVX512F) + { + gen = gen_avx512f_vpermt2varv8df3; + maskmode = V8DImode; + } + break; + default: + break; + } + + if (gen == 
NULL) + return false; + + /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const + expander, so args are either in d, or in op0, op1 etc. */ + if (d) + { + rtx vec[64]; + target = d->target; + op0 = d->op0; + op1 = d->op1; + for (int i = 0; i < d->nelt; ++i) + vec[i] = GEN_INT (d->perm[i]); + mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); + } + + emit_insn (gen (target, force_reg (maskmode, mask), op0, op1)); + return true; +} + +/* Expand a variable vector permutation. */ + +void +ix86_expand_vec_perm (rtx operands[]) +{ + rtx target = operands[0]; + rtx op0 = operands[1]; + rtx op1 = operands[2]; + rtx mask = operands[3]; + rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32]; + machine_mode mode = GET_MODE (op0); + machine_mode maskmode = GET_MODE (mask); + int w, e, i; + bool one_operand_shuffle = rtx_equal_p (op0, op1); + + /* Number of elements in the vector. */ + w = GET_MODE_NUNITS (mode); + e = GET_MODE_UNIT_SIZE (mode); + gcc_assert (w <= 64); + + if (TARGET_AVX512F && one_operand_shuffle) + { + rtx (*gen) (rtx, rtx, rtx) = NULL; + switch (mode) + { + case E_V16SImode: + gen =gen_avx512f_permvarv16si; + break; + case E_V16SFmode: + gen = gen_avx512f_permvarv16sf; + break; + case E_V8DImode: + gen = gen_avx512f_permvarv8di; + break; + case E_V8DFmode: + gen = gen_avx512f_permvarv8df; + break; + default: + break; + } + if (gen != NULL) + { + emit_insn (gen (target, op0, mask)); + return; + } + } + + if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL)) + return; + + if (TARGET_AVX2) + { + if (mode == V4DImode || mode == V4DFmode || mode == V16HImode) + { + /* Unfortunately, the VPERMQ and VPERMPD instructions only support + an constant shuffle operand. With a tiny bit of effort we can + use VPERMD instead. A re-interpretation stall for V4DFmode is + unfortunate but there's no avoiding it. + Similarly for V16HImode we don't have instructions for variable + shuffling, while for V32QImode we can use after preparing suitable + masks vpshufb; vpshufb; vpermq; vpor. */ + + if (mode == V16HImode) + { + maskmode = mode = V32QImode; + w = 32; + e = 1; + } + else + { + maskmode = mode = V8SImode; + w = 8; + e = 4; + } + t1 = gen_reg_rtx (maskmode); + + /* Replicate the low bits of the V4DImode mask into V8SImode: + mask = { A B C D } + t1 = { A A B B C C D D }. */ + for (i = 0; i < w / 2; ++i) + vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2); + vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); + vt = force_reg (maskmode, vt); + mask = gen_lowpart (maskmode, mask); + if (maskmode == V8SImode) + emit_insn (gen_avx2_permvarv8si (t1, mask, vt)); + else + emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt)); + + /* Multiply the shuffle indicies by two. */ + t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1, + OPTAB_DIRECT); + + /* Add one to the odd shuffle indicies: + t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */ + for (i = 0; i < w / 2; ++i) + { + vec[i * 2] = const0_rtx; + vec[i * 2 + 1] = const1_rtx; + } + vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); + vt = validize_mem (force_const_mem (maskmode, vt)); + t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1, + OPTAB_DIRECT); + + /* Continue as if V8SImode (resp. V32QImode) was used initially. */ + operands[3] = mask = t1; + target = gen_reg_rtx (mode); + op0 = gen_lowpart (mode, op0); + op1 = gen_lowpart (mode, op1); + } + + switch (mode) + { + case E_V8SImode: + /* The VPERMD and VPERMPS instructions already properly ignore + the high bits of the shuffle elements. 
No need for us to + perform an AND ourselves. */ + if (one_operand_shuffle) + { + emit_insn (gen_avx2_permvarv8si (target, op0, mask)); + if (target != operands[0]) + emit_move_insn (operands[0], + gen_lowpart (GET_MODE (operands[0]), target)); + } + else + { + t1 = gen_reg_rtx (V8SImode); + t2 = gen_reg_rtx (V8SImode); + emit_insn (gen_avx2_permvarv8si (t1, op0, mask)); + emit_insn (gen_avx2_permvarv8si (t2, op1, mask)); + goto merge_two; + } + return; + + case E_V8SFmode: + mask = gen_lowpart (V8SImode, mask); + if (one_operand_shuffle) + emit_insn (gen_avx2_permvarv8sf (target, op0, mask)); + else + { + t1 = gen_reg_rtx (V8SFmode); + t2 = gen_reg_rtx (V8SFmode); + emit_insn (gen_avx2_permvarv8sf (t1, op0, mask)); + emit_insn (gen_avx2_permvarv8sf (t2, op1, mask)); + goto merge_two; + } + return; + + case E_V4SImode: + /* By combining the two 128-bit input vectors into one 256-bit + input vector, we can use VPERMD and VPERMPS for the full + two-operand shuffle. */ + t1 = gen_reg_rtx (V8SImode); + t2 = gen_reg_rtx (V8SImode); + emit_insn (gen_avx_vec_concatv8si (t1, op0, op1)); + emit_insn (gen_avx_vec_concatv8si (t2, mask, mask)); + emit_insn (gen_avx2_permvarv8si (t1, t1, t2)); + emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx)); + return; + + case E_V4SFmode: + t1 = gen_reg_rtx (V8SFmode); + t2 = gen_reg_rtx (V8SImode); + mask = gen_lowpart (V4SImode, mask); + emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1)); + emit_insn (gen_avx_vec_concatv8si (t2, mask, mask)); + emit_insn (gen_avx2_permvarv8sf (t1, t1, t2)); + emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx)); + return; + + case E_V32QImode: + t1 = gen_reg_rtx (V32QImode); + t2 = gen_reg_rtx (V32QImode); + t3 = gen_reg_rtx (V32QImode); + vt2 = GEN_INT (-128); + vt = gen_const_vec_duplicate (V32QImode, vt2); + vt = force_reg (V32QImode, vt); + for (i = 0; i < 32; i++) + vec[i] = i < 16 ? vt2 : const0_rtx; + vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec)); + vt2 = force_reg (V32QImode, vt2); + /* From mask create two adjusted masks, which contain the same + bits as mask in the low 7 bits of each vector element. + The first mask will have the most significant bit clear + if it requests element from the same 128-bit lane + and MSB set if it requests element from the other 128-bit lane. + The second mask will have the opposite values of the MSB, + and additionally will have its 128-bit lanes swapped. + E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have + t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and + t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ... + stands for other 12 bytes. */ + /* The bit whether element is from the same lane or the other + lane is bit 4, so shift it up by 3 to the MSB position. */ + t5 = gen_reg_rtx (V4DImode); + emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask), + GEN_INT (3))); + /* Clear MSB bits from the mask just in case it had them set. */ + emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask)); + /* After this t1 will have MSB set for elements from other lane. */ + emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2)); + /* Clear bits other than MSB. */ + emit_insn (gen_andv32qi3 (t1, t1, vt)); + /* Or in the lower bits from mask into t3. */ + emit_insn (gen_iorv32qi3 (t3, t1, t2)); + /* And invert MSB bits in t1, so MSB is set for elements from the same + lane. */ + emit_insn (gen_xorv32qi3 (t1, t1, vt)); + /* Swap 128-bit lanes in t3. 
*/ + t6 = gen_reg_rtx (V4DImode); + emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3), + const2_rtx, GEN_INT (3), + const0_rtx, const1_rtx)); + /* And or in the lower bits from mask into t1. */ + emit_insn (gen_iorv32qi3 (t1, t1, t2)); + if (one_operand_shuffle) + { + /* Each of these shuffles will put 0s in places where + element from the other 128-bit lane is needed, otherwise + will shuffle in the requested value. */ + emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, + gen_lowpart (V32QImode, t6))); + emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1)); + /* For t3 the 128-bit lanes are swapped again. */ + t7 = gen_reg_rtx (V4DImode); + emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3), + const2_rtx, GEN_INT (3), + const0_rtx, const1_rtx)); + /* And oring both together leads to the result. */ + emit_insn (gen_iorv32qi3 (target, t1, + gen_lowpart (V32QImode, t7))); + if (target != operands[0]) + emit_move_insn (operands[0], + gen_lowpart (GET_MODE (operands[0]), target)); + return; + } + + t4 = gen_reg_rtx (V32QImode); + /* Similarly to the above one_operand_shuffle code, + just for repeated twice for each operand. merge_two: + code will merge the two results together. */ + emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, + gen_lowpart (V32QImode, t6))); + emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, + gen_lowpart (V32QImode, t6))); + emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1)); + emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1)); + t7 = gen_reg_rtx (V4DImode); + emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4), + const2_rtx, GEN_INT (3), + const0_rtx, const1_rtx)); + t8 = gen_reg_rtx (V4DImode); + emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3), + const2_rtx, GEN_INT (3), + const0_rtx, const1_rtx)); + emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7))); + emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8))); + t1 = t4; + t2 = t3; + goto merge_two; + + default: + gcc_assert (GET_MODE_SIZE (mode) <= 16); + break; + } + } + + if (TARGET_XOP) + { + /* The XOP VPPERM insn supports three inputs. By ignoring the + one_operand_shuffle special case, we avoid creating another + set of constant vectors in memory. */ + one_operand_shuffle = false; + + /* mask = mask & {2*w-1, ...} */ + vt = GEN_INT (2*w - 1); + } + else + { + /* mask = mask & {w-1, ...} */ + vt = GEN_INT (w - 1); + } + + vt = gen_const_vec_duplicate (maskmode, vt); + mask = expand_simple_binop (maskmode, AND, mask, vt, + NULL_RTX, 0, OPTAB_DIRECT); + + /* For non-QImode operations, convert the word permutation control + into a byte permutation control. */ + if (mode != V16QImode) + { + mask = expand_simple_binop (maskmode, ASHIFT, mask, + GEN_INT (exact_log2 (e)), + NULL_RTX, 0, OPTAB_DIRECT); + + /* Convert mask to vector of chars. */ + mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask)); + + /* Replicate each of the input bytes into byte positions: + (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8} + (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12} + (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. 
*/ + for (i = 0; i < 16; ++i) + vec[i] = GEN_INT (i/e * e); + vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); + vt = validize_mem (force_const_mem (V16QImode, vt)); + if (TARGET_XOP) + emit_insn (gen_xop_pperm (mask, mask, mask, vt)); + else + emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt)); + + /* Convert it into the byte positions by doing + mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */ + for (i = 0; i < 16; ++i) + vec[i] = GEN_INT (i % e); + vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); + vt = validize_mem (force_const_mem (V16QImode, vt)); + emit_insn (gen_addv16qi3 (mask, mask, vt)); + } + + /* The actual shuffle operations all operate on V16QImode. */ + op0 = gen_lowpart (V16QImode, op0); + op1 = gen_lowpart (V16QImode, op1); + + if (TARGET_XOP) + { + if (GET_MODE (target) != V16QImode) + target = gen_reg_rtx (V16QImode); + emit_insn (gen_xop_pperm (target, op0, op1, mask)); + if (target != operands[0]) + emit_move_insn (operands[0], + gen_lowpart (GET_MODE (operands[0]), target)); + } + else if (one_operand_shuffle) + { + if (GET_MODE (target) != V16QImode) + target = gen_reg_rtx (V16QImode); + emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask)); + if (target != operands[0]) + emit_move_insn (operands[0], + gen_lowpart (GET_MODE (operands[0]), target)); + } + else + { + rtx xops[6]; + bool ok; + + /* Shuffle the two input vectors independently. */ + t1 = gen_reg_rtx (V16QImode); + t2 = gen_reg_rtx (V16QImode); + emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask)); + emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask)); + + merge_two: + /* Then merge them together. The key is whether any given control + element contained a bit set that indicates the second word. */ + mask = operands[3]; + vt = GEN_INT (w); + if (maskmode == V2DImode && !TARGET_SSE4_1) + { + /* Without SSE4.1, we don't have V2DImode EQ. Perform one + more shuffle to convert the V2DI input mask into a V4SI + input mask. At which point the masking that expand_int_vcond + will work as desired. */ + rtx t3 = gen_reg_rtx (V4SImode); + emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask), + const0_rtx, const0_rtx, + const2_rtx, const2_rtx)); + mask = t3; + maskmode = V4SImode; + e = w = 4; + } + + vt = gen_const_vec_duplicate (maskmode, vt); + vt = force_reg (maskmode, vt); + mask = expand_simple_binop (maskmode, AND, mask, vt, + NULL_RTX, 0, OPTAB_DIRECT); + + if (GET_MODE (target) != mode) + target = gen_reg_rtx (mode); + xops[0] = target; + xops[1] = gen_lowpart (mode, t2); + xops[2] = gen_lowpart (mode, t1); + xops[3] = gen_rtx_EQ (maskmode, mask, vt); + xops[4] = mask; + xops[5] = vt; + ok = ix86_expand_int_vcond (xops); + gcc_assert (ok); + if (target != operands[0]) + emit_move_insn (operands[0], + gen_lowpart (GET_MODE (operands[0]), target)); + } +} + +/* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is + true if we should do zero extension, else sign extension. HIGH_P is + true if we want the N/2 high elements, else the low elements. */ + +void +ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) +{ + machine_mode imode = GET_MODE (src); + rtx tmp; + + if (TARGET_SSE4_1) + { + rtx (*unpack)(rtx, rtx); + rtx (*extract)(rtx, rtx) = NULL; + machine_mode halfmode = BLKmode; + + switch (imode) + { + case E_V64QImode: + if (unsigned_p) + unpack = gen_avx512bw_zero_extendv32qiv32hi2; + else + unpack = gen_avx512bw_sign_extendv32qiv32hi2; + halfmode = V32QImode; + extract + = high_p ? 
gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi; + break; + case E_V32QImode: + if (unsigned_p) + unpack = gen_avx2_zero_extendv16qiv16hi2; + else + unpack = gen_avx2_sign_extendv16qiv16hi2; + halfmode = V16QImode; + extract + = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi; + break; + case E_V32HImode: + if (unsigned_p) + unpack = gen_avx512f_zero_extendv16hiv16si2; + else + unpack = gen_avx512f_sign_extendv16hiv16si2; + halfmode = V16HImode; + extract + = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi; + break; + case E_V16HImode: + if (unsigned_p) + unpack = gen_avx2_zero_extendv8hiv8si2; + else + unpack = gen_avx2_sign_extendv8hiv8si2; + halfmode = V8HImode; + extract + = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi; + break; + case E_V16SImode: + if (unsigned_p) + unpack = gen_avx512f_zero_extendv8siv8di2; + else + unpack = gen_avx512f_sign_extendv8siv8di2; + halfmode = V8SImode; + extract + = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si; + break; + case E_V8SImode: + if (unsigned_p) + unpack = gen_avx2_zero_extendv4siv4di2; + else + unpack = gen_avx2_sign_extendv4siv4di2; + halfmode = V4SImode; + extract + = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si; + break; + case E_V16QImode: + if (unsigned_p) + unpack = gen_sse4_1_zero_extendv8qiv8hi2; + else + unpack = gen_sse4_1_sign_extendv8qiv8hi2; + break; + case E_V8HImode: + if (unsigned_p) + unpack = gen_sse4_1_zero_extendv4hiv4si2; + else + unpack = gen_sse4_1_sign_extendv4hiv4si2; + break; + case E_V4SImode: + if (unsigned_p) + unpack = gen_sse4_1_zero_extendv2siv2di2; + else + unpack = gen_sse4_1_sign_extendv2siv2di2; + break; + default: + gcc_unreachable (); + } + + if (GET_MODE_SIZE (imode) >= 32) + { + tmp = gen_reg_rtx (halfmode); + emit_insn (extract (tmp, src)); + } + else if (high_p) + { + /* Shift higher 8 bytes to lower 8 bytes. */ + tmp = gen_reg_rtx (V1TImode); + emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src), + GEN_INT (64))); + tmp = gen_lowpart (imode, tmp); + } + else + tmp = src; + + emit_insn (unpack (dest, tmp)); + } + else + { + rtx (*unpack)(rtx, rtx, rtx); + + switch (imode) + { + case E_V16QImode: + if (high_p) + unpack = gen_vec_interleave_highv16qi; + else + unpack = gen_vec_interleave_lowv16qi; + break; + case E_V8HImode: + if (high_p) + unpack = gen_vec_interleave_highv8hi; + else + unpack = gen_vec_interleave_lowv8hi; + break; + case E_V4SImode: + if (high_p) + unpack = gen_vec_interleave_highv4si; + else + unpack = gen_vec_interleave_lowv4si; + break; + default: + gcc_unreachable (); + } + + if (unsigned_p) + tmp = force_reg (imode, CONST0_RTX (imode)); + else + tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode), + src, pc_rtx, pc_rtx); + + rtx tmp2 = gen_reg_rtx (imode); + emit_insn (unpack (tmp2, src, tmp)); + emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2)); + } +} + +/* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode, + but works for floating pointer parameters and nonoffsetable memories. + For pushes, it returns just stack offsets; the values will be saved + in the right order. Maximally three parts are generated. */ + +static int +ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode) +{ + int size; + + if (!TARGET_64BIT) + size = mode==XFmode ? 
3 : GET_MODE_SIZE (mode) / 4; + else + size = (GET_MODE_SIZE (mode) + 4) / 8; + + gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand))); + gcc_assert (size >= 2 && size <= 4); + + /* Optimize constant pool reference to immediates. This is used by fp + moves, that force all constants to memory to allow combining. */ + if (MEM_P (operand) && MEM_READONLY_P (operand)) + operand = avoid_constant_pool_reference (operand); + + if (MEM_P (operand) && !offsettable_memref_p (operand)) + { + /* The only non-offsetable memories we handle are pushes. */ + int ok = push_operand (operand, VOIDmode); + + gcc_assert (ok); + + operand = copy_rtx (operand); + PUT_MODE (operand, word_mode); + parts[0] = parts[1] = parts[2] = parts[3] = operand; + return size; + } + + if (GET_CODE (operand) == CONST_VECTOR) + { + scalar_int_mode imode = int_mode_for_mode (mode).require (); + /* Caution: if we looked through a constant pool memory above, + the operand may actually have a different mode now. That's + ok, since we want to pun this all the way back to an integer. */ + operand = simplify_subreg (imode, operand, GET_MODE (operand), 0); + gcc_assert (operand != NULL); + mode = imode; + } + + if (!TARGET_64BIT) + { + if (mode == DImode) + split_double_mode (mode, &operand, 1, &parts[0], &parts[1]); + else + { + int i; + + if (REG_P (operand)) + { + gcc_assert (reload_completed); + for (i = 0; i < size; i++) + parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i); + } + else if (offsettable_memref_p (operand)) + { + operand = adjust_address (operand, SImode, 0); + parts[0] = operand; + for (i = 1; i < size; i++) + parts[i] = adjust_address (operand, SImode, 4 * i); + } + else if (CONST_DOUBLE_P (operand)) + { + const REAL_VALUE_TYPE *r; + long l[4]; + + r = CONST_DOUBLE_REAL_VALUE (operand); + switch (mode) + { + case E_TFmode: + real_to_target (l, r, mode); + parts[3] = gen_int_mode (l[3], SImode); + parts[2] = gen_int_mode (l[2], SImode); + break; + case E_XFmode: + /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since + long double may not be 80-bit. */ + real_to_target (l, r, mode); + parts[2] = gen_int_mode (l[2], SImode); + break; + case E_DFmode: + REAL_VALUE_TO_TARGET_DOUBLE (*r, l); + break; + default: + gcc_unreachable (); + } + parts[1] = gen_int_mode (l[1], SImode); + parts[0] = gen_int_mode (l[0], SImode); + } + else + gcc_unreachable (); + } + } + else + { + if (mode == TImode) + split_double_mode (mode, &operand, 1, &parts[0], &parts[1]); + if (mode == XFmode || mode == TFmode) + { + machine_mode upper_mode = mode==XFmode ? SImode : DImode; + if (REG_P (operand)) + { + gcc_assert (reload_completed); + parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0); + parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1); + } + else if (offsettable_memref_p (operand)) + { + operand = adjust_address (operand, DImode, 0); + parts[0] = operand; + parts[1] = adjust_address (operand, upper_mode, 8); + } + else if (CONST_DOUBLE_P (operand)) + { + long l[4]; + + real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode); + + /* real_to_target puts 32-bit pieces in each long. 
*/ + parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff)) + | ((l[1] & HOST_WIDE_INT_C (0xffffffff)) + << 32), DImode); + + if (upper_mode == SImode) + parts[1] = gen_int_mode (l[2], SImode); + else + parts[1] + = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff)) + | ((l[3] & HOST_WIDE_INT_C (0xffffffff)) + << 32), DImode); + } + else + gcc_unreachable (); + } + } + + return size; +} + +/* Emit insns to perform a move or push of DI, DF, XF, and TF values. + Return false when normal moves are needed; true when all required + insns have been emitted. Operands 2-4 contain the input values + int the correct order; operands 5-7 contain the output values. */ + +void +ix86_split_long_move (rtx operands[]) +{ + rtx part[2][4]; + int nparts, i, j; + int push = 0; + int collisions = 0; + machine_mode mode = GET_MODE (operands[0]); + bool collisionparts[4]; + + /* The DFmode expanders may ask us to move double. + For 64bit target this is single move. By hiding the fact + here we simplify i386.md splitters. */ + if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8) + { + /* Optimize constant pool reference to immediates. This is used by + fp moves, that force all constants to memory to allow combining. */ + + if (MEM_P (operands[1]) + && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF + && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0))) + operands[1] = get_pool_constant (XEXP (operands[1], 0)); + if (push_operand (operands[0], VOIDmode)) + { + operands[0] = copy_rtx (operands[0]); + PUT_MODE (operands[0], word_mode); + } + else + operands[0] = gen_lowpart (DImode, operands[0]); + operands[1] = gen_lowpart (DImode, operands[1]); + emit_move_insn (operands[0], operands[1]); + return; + } + + /* The only non-offsettable memory we handle is push. */ + if (push_operand (operands[0], VOIDmode)) + push = 1; + else + gcc_assert (!MEM_P (operands[0]) + || offsettable_memref_p (operands[0])); + + nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0])); + ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0])); + + /* When emitting push, take care for source operands on the stack. */ + if (push && MEM_P (operands[1]) + && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1])) + { + rtx src_base = XEXP (part[1][nparts - 1], 0); + + /* Compensate for the stack decrement by 4. */ + if (!TARGET_64BIT && nparts == 3 + && mode == XFmode && TARGET_128BIT_LONG_DOUBLE) + src_base = plus_constant (Pmode, src_base, 4); + + /* src_base refers to the stack pointer and is + automatically decreased by emitted push. */ + for (i = 0; i < nparts; i++) + part[1][i] = change_address (part[1][i], + GET_MODE (part[1][i]), src_base); + } + + /* We need to do copy in the right order in case an address register + of the source overlaps the destination. */ + if (REG_P (part[0][0]) && MEM_P (part[1][0])) + { + rtx tmp; + + for (i = 0; i < nparts; i++) + { + collisionparts[i] + = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0)); + if (collisionparts[i]) + collisions++; + } + + /* Collision in the middle part can be handled by reordering. 
*/ + if (collisions == 1 && nparts == 3 && collisionparts [1]) + { + std::swap (part[0][1], part[0][2]); + std::swap (part[1][1], part[1][2]); + } + else if (collisions == 1 + && nparts == 4 + && (collisionparts [1] || collisionparts [2])) + { + if (collisionparts [1]) + { + std::swap (part[0][1], part[0][2]); + std::swap (part[1][1], part[1][2]); + } + else + { + std::swap (part[0][2], part[0][3]); + std::swap (part[1][2], part[1][3]); + } + } + + /* If there are more collisions, we can't handle it by reordering. + Do an lea to the last part and use only one colliding move. */ + else if (collisions > 1) + { + rtx base, addr; + + collisions = 1; + + base = part[0][nparts - 1]; + + /* Handle the case when the last part isn't valid for lea. + Happens in 64-bit mode storing the 12-byte XFmode. */ + if (GET_MODE (base) != Pmode) + base = gen_rtx_REG (Pmode, REGNO (base)); + + addr = XEXP (part[1][0], 0); + if (TARGET_TLS_DIRECT_SEG_REFS) + { + struct ix86_address parts; + int ok = ix86_decompose_address (addr, &parts); + gcc_assert (ok); + /* It is not valid to use %gs: or %fs: in lea. */ + gcc_assert (parts.seg == ADDR_SPACE_GENERIC); + } + emit_insn (gen_rtx_SET (base, addr)); + part[1][0] = replace_equiv_address (part[1][0], base); + for (i = 1; i < nparts; i++) + { + tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i); + part[1][i] = replace_equiv_address (part[1][i], tmp); + } + } + } + + if (push) + { + if (!TARGET_64BIT) + { + if (nparts == 3) + { + if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode) + emit_insn (ix86_gen_add3 (stack_pointer_rtx, + stack_pointer_rtx, GEN_INT (-4))); + emit_move_insn (part[0][2], part[1][2]); + } + else if (nparts == 4) + { + emit_move_insn (part[0][3], part[1][3]); + emit_move_insn (part[0][2], part[1][2]); + } + } + else + { + /* In 64bit mode we don't have 32bit push available. In case this is + register, it is OK - we will just use larger counterpart. We also + retype memory - these comes from attempt to avoid REX prefix on + moving of second half of TFmode value. */ + if (GET_MODE (part[1][1]) == SImode) + { + switch (GET_CODE (part[1][1])) + { + case MEM: + part[1][1] = adjust_address (part[1][1], DImode, 0); + break; + + case REG: + part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1])); + break; + + default: + gcc_unreachable (); + } + + if (GET_MODE (part[1][0]) == SImode) + part[1][0] = part[1][1]; + } + } + emit_move_insn (part[0][1], part[1][1]); + emit_move_insn (part[0][0], part[1][0]); + return; + } + + /* Choose correct order to not overwrite the source before it is copied. */ + if ((REG_P (part[0][0]) + && REG_P (part[1][1]) + && (REGNO (part[0][0]) == REGNO (part[1][1]) + || (nparts == 3 + && REGNO (part[0][0]) == REGNO (part[1][2])) + || (nparts == 4 + && REGNO (part[0][0]) == REGNO (part[1][3])))) + || (collisions > 0 + && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0)))) + { + for (i = 0, j = nparts - 1; i < nparts; i++, j--) + { + operands[2 + i] = part[0][j]; + operands[6 + i] = part[1][j]; + } + } + else + { + for (i = 0; i < nparts; i++) + { + operands[2 + i] = part[0][i]; + operands[6 + i] = part[1][i]; + } + } + + /* If optimizing for size, attempt to locally unCSE nonzero constants. 
*/ + if (optimize_insn_for_size_p ()) + { + for (j = 0; j < nparts - 1; j++) + if (CONST_INT_P (operands[6 + j]) + && operands[6 + j] != const0_rtx + && REG_P (operands[2 + j])) + for (i = j; i < nparts - 1; i++) + if (CONST_INT_P (operands[7 + i]) + && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j])) + operands[7 + i] = operands[2 + j]; + } + + for (i = 0; i < nparts; i++) + emit_move_insn (operands[2 + i], operands[6 + i]); + + return; +} + +/* Helper function of ix86_split_ashl used to generate an SImode/DImode + left shift by a constant, either using a single shift or + a sequence of add instructions. */ + +static void +ix86_expand_ashl_const (rtx operand, int count, machine_mode mode) +{ + rtx (*insn)(rtx, rtx, rtx); + + if (count == 1 + || (count * ix86_cost->add <= ix86_cost->shift_const + && !optimize_insn_for_size_p ())) + { + insn = mode == DImode ? gen_addsi3 : gen_adddi3; + while (count-- > 0) + emit_insn (insn (operand, operand, operand)); + } + else + { + insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3; + emit_insn (insn (operand, operand, GEN_INT (count))); + } +} + +void +ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode) +{ + rtx (*gen_ashl3)(rtx, rtx, rtx); + rtx (*gen_shld)(rtx, rtx, rtx); + int half_width = GET_MODE_BITSIZE (mode) >> 1; + + rtx low[2], high[2]; + int count; + + if (CONST_INT_P (operands[2])) + { + split_double_mode (mode, operands, 2, low, high); + count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); + + if (count >= half_width) + { + emit_move_insn (high[0], low[1]); + emit_move_insn (low[0], const0_rtx); + + if (count > half_width) + ix86_expand_ashl_const (high[0], count - half_width, mode); + } + else + { + gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld; + + if (!rtx_equal_p (operands[0], operands[1])) + emit_move_insn (operands[0], operands[1]); + + emit_insn (gen_shld (high[0], low[0], GEN_INT (count))); + ix86_expand_ashl_const (low[0], count, mode); + } + return; + } + + split_double_mode (mode, operands, 1, low, high); + + gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3; + + if (operands[1] == const1_rtx) + { + /* Assuming we've chosen a QImode capable registers, then 1 << N + can be done with two 32/64-bit shifts, no branches, no cmoves. */ + if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0])) + { + rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG); + + ix86_expand_clear (low[0]); + ix86_expand_clear (high[0]); + emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width))); + + d = gen_lowpart (QImode, low[0]); + d = gen_rtx_STRICT_LOW_PART (VOIDmode, d); + s = gen_rtx_EQ (QImode, flags, const0_rtx); + emit_insn (gen_rtx_SET (d, s)); + + d = gen_lowpart (QImode, high[0]); + d = gen_rtx_STRICT_LOW_PART (VOIDmode, d); + s = gen_rtx_NE (QImode, flags, const0_rtx); + emit_insn (gen_rtx_SET (d, s)); + } + + /* Otherwise, we can get the same results by manually performing + a bit extract operation on bit 5/6, and then performing the two + shifts. The two methods of getting 0/1 into low/high are exactly + the same size. Avoiding the shift in the bit extract case helps + pentium4 a bit; no one else seems to care much either way. 
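+	 (Illustration for the DImode case, where the halves are 32 bits
+	 wide: bit 5 of the shift count selects whether the single set bit
+	 of 1 << N belongs in the low or the high half; the extracted bit
+	 goes to the high half, its complement to the low half, and both
+	 halves are then shifted left, effectively by N modulo 32.)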
*/ + else + { + machine_mode half_mode; + rtx (*gen_lshr3)(rtx, rtx, rtx); + rtx (*gen_and3)(rtx, rtx, rtx); + rtx (*gen_xor3)(rtx, rtx, rtx); + HOST_WIDE_INT bits; + rtx x; + + if (mode == DImode) + { + half_mode = SImode; + gen_lshr3 = gen_lshrsi3; + gen_and3 = gen_andsi3; + gen_xor3 = gen_xorsi3; + bits = 5; + } + else + { + half_mode = DImode; + gen_lshr3 = gen_lshrdi3; + gen_and3 = gen_anddi3; + gen_xor3 = gen_xordi3; + bits = 6; + } + + if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ()) + x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]); + else + x = gen_lowpart (half_mode, operands[2]); + emit_insn (gen_rtx_SET (high[0], x)); + + emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits))); + emit_insn (gen_and3 (high[0], high[0], const1_rtx)); + emit_move_insn (low[0], high[0]); + emit_insn (gen_xor3 (low[0], low[0], const1_rtx)); + } + + emit_insn (gen_ashl3 (low[0], low[0], operands[2])); + emit_insn (gen_ashl3 (high[0], high[0], operands[2])); + return; + } + + if (operands[1] == constm1_rtx) + { + /* For -1 << N, we can avoid the shld instruction, because we + know that we're shifting 0...31/63 ones into a -1. */ + emit_move_insn (low[0], constm1_rtx); + if (optimize_insn_for_size_p ()) + emit_move_insn (high[0], low[0]); + else + emit_move_insn (high[0], constm1_rtx); + } + else + { + gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld; + + if (!rtx_equal_p (operands[0], operands[1])) + emit_move_insn (operands[0], operands[1]); + + split_double_mode (mode, operands, 1, low, high); + emit_insn (gen_shld (high[0], low[0], operands[2])); + } + + emit_insn (gen_ashl3 (low[0], low[0], operands[2])); + + if (TARGET_CMOVE && scratch) + { + rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx) + = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1; + + ix86_expand_clear (scratch); + emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch)); + } + else + { + rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx) + = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2; + + emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2])); + } +} + +void +ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode) +{ + rtx (*gen_ashr3)(rtx, rtx, rtx) + = mode == DImode ? gen_ashrsi3 : gen_ashrdi3; + rtx (*gen_shrd)(rtx, rtx, rtx); + int half_width = GET_MODE_BITSIZE (mode) >> 1; + + rtx low[2], high[2]; + int count; + + if (CONST_INT_P (operands[2])) + { + split_double_mode (mode, operands, 2, low, high); + count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); + + if (count == GET_MODE_BITSIZE (mode) - 1) + { + emit_move_insn (high[0], high[1]); + emit_insn (gen_ashr3 (high[0], high[0], + GEN_INT (half_width - 1))); + emit_move_insn (low[0], high[0]); + + } + else if (count >= half_width) + { + emit_move_insn (low[0], high[1]); + emit_move_insn (high[0], low[0]); + emit_insn (gen_ashr3 (high[0], high[0], + GEN_INT (half_width - 1))); + + if (count > half_width) + emit_insn (gen_ashr3 (low[0], low[0], + GEN_INT (count - half_width))); + } + else + { + gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; + + if (!rtx_equal_p (operands[0], operands[1])) + emit_move_insn (operands[0], operands[1]); + + emit_insn (gen_shrd (low[0], high[0], GEN_INT (count))); + emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count))); + } + } + else + { + gen_shrd = mode == DImode ? 
gen_x86_shrd : gen_x86_64_shrd; + + if (!rtx_equal_p (operands[0], operands[1])) + emit_move_insn (operands[0], operands[1]); + + split_double_mode (mode, operands, 1, low, high); + + emit_insn (gen_shrd (low[0], high[0], operands[2])); + emit_insn (gen_ashr3 (high[0], high[0], operands[2])); + + if (TARGET_CMOVE && scratch) + { + rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx) + = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1; + + emit_move_insn (scratch, high[0]); + emit_insn (gen_ashr3 (scratch, scratch, + GEN_INT (half_width - 1))); + emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2], + scratch)); + } + else + { + rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx) + = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3; + + emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2])); + } + } +} + +void +ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode) +{ + rtx (*gen_lshr3)(rtx, rtx, rtx) + = mode == DImode ? gen_lshrsi3 : gen_lshrdi3; + rtx (*gen_shrd)(rtx, rtx, rtx); + int half_width = GET_MODE_BITSIZE (mode) >> 1; + + rtx low[2], high[2]; + int count; + + if (CONST_INT_P (operands[2])) + { + split_double_mode (mode, operands, 2, low, high); + count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); + + if (count >= half_width) + { + emit_move_insn (low[0], high[1]); + ix86_expand_clear (high[0]); + + if (count > half_width) + emit_insn (gen_lshr3 (low[0], low[0], + GEN_INT (count - half_width))); + } + else + { + gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; + + if (!rtx_equal_p (operands[0], operands[1])) + emit_move_insn (operands[0], operands[1]); + + emit_insn (gen_shrd (low[0], high[0], GEN_INT (count))); + emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count))); + } + } + else + { + gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; + + if (!rtx_equal_p (operands[0], operands[1])) + emit_move_insn (operands[0], operands[1]); + + split_double_mode (mode, operands, 1, low, high); + + emit_insn (gen_shrd (low[0], high[0], operands[2])); + emit_insn (gen_lshr3 (high[0], high[0], operands[2])); + + if (TARGET_CMOVE && scratch) + { + rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx) + = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1; + + ix86_expand_clear (scratch); + emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2], + scratch)); + } + else + { + rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx) + = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2; + + emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2])); + } + } +} + +/* Return mode for the memcpy/memset loop counter. Prefer SImode over + DImode for constant loop counts. */ + +static machine_mode +counter_mode (rtx count_exp) +{ + if (GET_MODE (count_exp) != VOIDmode) + return GET_MODE (count_exp); + if (!CONST_INT_P (count_exp)) + return Pmode; + if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff)) + return DImode; + return SImode; +} + +/* When ISSETMEM is FALSE, output simple loop to move memory pointer to SRCPTR + to DESTPTR via chunks of MODE unrolled UNROLL times, overall size is COUNT + specified in bytes. When ISSETMEM is TRUE, output the equivalent loop to set + memory by VALUE (supposed to be in MODE). + + The size is rounded down to whole number of chunk size moved at once. + SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. 
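+
+   As an illustration (hypothetical parameters): with MODE == SImode and
+   UNROLL == 4 each iteration of the emitted loop moves 16 bytes, roughly
+
+	size = count & ~15;
+	for (iter = 0; iter < size; iter += 16)
+	  move 16 bytes at offset ITER;
+
+   and any remainder below 16 bytes is left to the epilogue code.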
*/ + + +static void +expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, rtx value, + rtx count, machine_mode mode, int unroll, + int expected_size, bool issetmem) +{ + rtx_code_label *out_label, *top_label; + rtx iter, tmp; + machine_mode iter_mode = counter_mode (count); + int piece_size_n = GET_MODE_SIZE (mode) * unroll; + rtx piece_size = GEN_INT (piece_size_n); + rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1)); + rtx size; + int i; + + top_label = gen_label_rtx (); + out_label = gen_label_rtx (); + iter = gen_reg_rtx (iter_mode); + + size = expand_simple_binop (iter_mode, AND, count, piece_size_mask, + NULL, 1, OPTAB_DIRECT); + /* Those two should combine. */ + if (piece_size == const1_rtx) + { + emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode, + true, out_label); + predict_jump (REG_BR_PROB_BASE * 10 / 100); + } + emit_move_insn (iter, const0_rtx); + + emit_label (top_label); + + tmp = convert_modes (Pmode, iter_mode, iter, true); + + /* This assert could be relaxed - in this case we'll need to compute + smallest power of two, containing in PIECE_SIZE_N and pass it to + offset_address. */ + gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0); + destmem = offset_address (destmem, tmp, piece_size_n); + destmem = adjust_address (destmem, mode, 0); + + if (!issetmem) + { + srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n); + srcmem = adjust_address (srcmem, mode, 0); + + /* When unrolling for chips that reorder memory reads and writes, + we can save registers by using single temporary. + Also using 4 temporaries is overkill in 32bit mode. */ + if (!TARGET_64BIT && 0) + { + for (i = 0; i < unroll; i++) + { + if (i) + { + destmem = adjust_address (copy_rtx (destmem), mode, + GET_MODE_SIZE (mode)); + srcmem = adjust_address (copy_rtx (srcmem), mode, + GET_MODE_SIZE (mode)); + } + emit_move_insn (destmem, srcmem); + } + } + else + { + rtx tmpreg[4]; + gcc_assert (unroll <= 4); + for (i = 0; i < unroll; i++) + { + tmpreg[i] = gen_reg_rtx (mode); + if (i) + srcmem = adjust_address (copy_rtx (srcmem), mode, + GET_MODE_SIZE (mode)); + emit_move_insn (tmpreg[i], srcmem); + } + for (i = 0; i < unroll; i++) + { + if (i) + destmem = adjust_address (copy_rtx (destmem), mode, + GET_MODE_SIZE (mode)); + emit_move_insn (destmem, tmpreg[i]); + } + } + } + else + for (i = 0; i < unroll; i++) + { + if (i) + destmem = adjust_address (copy_rtx (destmem), mode, + GET_MODE_SIZE (mode)); + emit_move_insn (destmem, value); + } + + tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter, + true, OPTAB_LIB_WIDEN); + if (tmp != iter) + emit_move_insn (iter, tmp); + + emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode, + true, top_label); + if (expected_size != -1) + { + expected_size /= GET_MODE_SIZE (mode) * unroll; + if (expected_size == 0) + predict_jump (0); + else if (expected_size > REG_BR_PROB_BASE) + predict_jump (REG_BR_PROB_BASE - 1); + else + predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) + / expected_size); + } + else + predict_jump (REG_BR_PROB_BASE * 80 / 100); + iter = ix86_zero_extend_to_Pmode (iter); + tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr, + true, OPTAB_LIB_WIDEN); + if (tmp != destptr) + emit_move_insn (destptr, tmp); + if (!issetmem) + { + tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr, + true, OPTAB_LIB_WIDEN); + if (tmp != srcptr) + emit_move_insn (srcptr, tmp); + } + emit_label (out_label); +} + +/* Divide COUNTREG by 
SCALE. */ +static rtx +scale_counter (rtx countreg, int scale) +{ + rtx sc; + + if (scale == 1) + return countreg; + if (CONST_INT_P (countreg)) + return GEN_INT (INTVAL (countreg) / scale); + gcc_assert (REG_P (countreg)); + + sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg, + GEN_INT (exact_log2 (scale)), + NULL, 1, OPTAB_DIRECT); + return sc; +} + +/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument. + When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored. + When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored. + For setmem case, VALUE is a promoted to a wider size ORIG_VALUE. + ORIG_VALUE is the original value passed to memset to fill the memory with. + Other arguments have same meaning as for previous function. */ + +static void +expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, rtx value, rtx orig_value, + rtx count, + machine_mode mode, bool issetmem) +{ + rtx destexp; + rtx srcexp; + rtx countreg; + HOST_WIDE_INT rounded_count; + + /* If possible, it is shorter to use rep movs. + TODO: Maybe it is better to move this logic to decide_alg. */ + if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3) + && (!issetmem || orig_value == const0_rtx)) + mode = SImode; + + if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode) + destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0); + + countreg = ix86_zero_extend_to_Pmode (scale_counter (count, + GET_MODE_SIZE (mode))); + if (mode != QImode) + { + destexp = gen_rtx_ASHIFT (Pmode, countreg, + GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); + destexp = gen_rtx_PLUS (Pmode, destexp, destptr); + } + else + destexp = gen_rtx_PLUS (Pmode, destptr, countreg); + if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count)) + { + rounded_count + = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode)); + destmem = shallow_copy_rtx (destmem); + set_mem_size (destmem, rounded_count); + } + else if (MEM_SIZE_KNOWN_P (destmem)) + clear_mem_size (destmem); + + if (issetmem) + { + value = force_reg (mode, gen_lowpart (mode, value)); + emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp)); + } + else + { + if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode) + srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0); + if (mode != QImode) + { + srcexp = gen_rtx_ASHIFT (Pmode, countreg, + GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); + srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr); + } + else + srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg); + if (CONST_INT_P (count)) + { + rounded_count + = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode)); + srcmem = shallow_copy_rtx (srcmem); + set_mem_size (srcmem, rounded_count); + } + else + { + if (MEM_SIZE_KNOWN_P (srcmem)) + clear_mem_size (srcmem); + } + emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg, + destexp, srcexp)); + } +} + +/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to + DESTMEM. + SRC is passed by pointer to be updated on return. + Return value is updated DST. */ +static rtx +emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr, + HOST_WIDE_INT size_to_move) +{ + rtx dst = destmem, src = *srcmem, adjust, tempreg; + enum insn_code code; + machine_mode move_mode; + int piece_size, i; + + /* Find the widest mode in which we could perform moves. 
+ Start with the biggest power of 2 less than SIZE_TO_MOVE and half + it until move of such size is supported. */ + piece_size = 1 << floor_log2 (size_to_move); + while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode) + || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing) + { + gcc_assert (piece_size > 1); + piece_size >>= 1; + } + + /* Find the corresponding vector mode with the same size as MOVE_MODE. + MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */ + if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode)) + { + int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode); + if (!mode_for_vector (word_mode, nunits).exists (&move_mode) + || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing) + { + move_mode = word_mode; + piece_size = GET_MODE_SIZE (move_mode); + code = optab_handler (mov_optab, move_mode); + } + } + gcc_assert (code != CODE_FOR_nothing); + + dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0); + src = adjust_automodify_address_nv (src, move_mode, srcptr, 0); + + /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */ + gcc_assert (size_to_move % piece_size == 0); + adjust = GEN_INT (piece_size); + for (i = 0; i < size_to_move; i += piece_size) + { + /* We move from memory to memory, so we'll need to do it via + a temporary register. */ + tempreg = gen_reg_rtx (move_mode); + emit_insn (GEN_FCN (code) (tempreg, src)); + emit_insn (GEN_FCN (code) (dst, tempreg)); + + emit_move_insn (destptr, + gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust)); + emit_move_insn (srcptr, + gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust)); + + dst = adjust_automodify_address_nv (dst, move_mode, destptr, + piece_size); + src = adjust_automodify_address_nv (src, move_mode, srcptr, + piece_size); + } + + /* Update DST and SRC rtx. */ + *srcmem = src; + return dst; +} + +/* Helper function for the string operations below. Dest VARIABLE whether + it is aligned to VALUE bytes. If true, jump to the label. */ + +static rtx_code_label * +ix86_expand_aligntest (rtx variable, int value, bool epilogue) +{ + rtx_code_label *label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (GET_MODE (variable)); + if (GET_MODE (variable) == DImode) + emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value))); + else + emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value))); + emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable), + 1, label); + if (epilogue) + predict_jump (REG_BR_PROB_BASE * 50 / 100); + else + predict_jump (REG_BR_PROB_BASE * 90 / 100); + return label; +} + + +/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */ + +static void +expand_movmem_epilogue (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, rtx count, int max_size) +{ + rtx src, dest; + if (CONST_INT_P (count)) + { + HOST_WIDE_INT countval = INTVAL (count); + HOST_WIDE_INT epilogue_size = countval % max_size; + int i; + + /* For now MAX_SIZE should be a power of 2. This assert could be + relaxed, but it'll require a bit more complicated epilogue + expanding. 
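+	 Illustration (hypothetical sizes): with MAX_SIZE == 16 and a
+	 constant COUNT leaving a remainder of 11, the loop below emits an
+	 8-byte, a 2-byte and a 1-byte move, since 11 == 8 + 2 + 1.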
*/ + gcc_assert ((max_size & (max_size - 1)) == 0); + for (i = max_size; i >= 1; i >>= 1) + { + if (epilogue_size & i) + destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i); + } + return; + } + if (max_size > 8) + { + count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1), + count, 1, OPTAB_DIRECT); + expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL, + count, QImode, 1, 4, false); + return; + } + + /* When there are stringops, we can cheaply increase dest and src pointers. + Otherwise we save code size by maintaining offset (zero is readily + available from preceding rep operation) and using x86 addressing modes. + */ + if (TARGET_SINGLE_STRINGOP) + { + if (max_size > 4) + { + rtx_code_label *label = ix86_expand_aligntest (count, 4, true); + src = change_address (srcmem, SImode, srcptr); + dest = change_address (destmem, SImode, destptr); + emit_insn (gen_strmov (destptr, dest, srcptr, src)); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 2) + { + rtx_code_label *label = ix86_expand_aligntest (count, 2, true); + src = change_address (srcmem, HImode, srcptr); + dest = change_address (destmem, HImode, destptr); + emit_insn (gen_strmov (destptr, dest, srcptr, src)); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 1) + { + rtx_code_label *label = ix86_expand_aligntest (count, 1, true); + src = change_address (srcmem, QImode, srcptr); + dest = change_address (destmem, QImode, destptr); + emit_insn (gen_strmov (destptr, dest, srcptr, src)); + emit_label (label); + LABEL_NUSES (label) = 1; + } + } + else + { + rtx offset = force_reg (Pmode, const0_rtx); + rtx tmp; + + if (max_size > 4) + { + rtx_code_label *label = ix86_expand_aligntest (count, 4, true); + src = change_address (srcmem, SImode, srcptr); + dest = change_address (destmem, SImode, destptr); + emit_move_insn (dest, src); + tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL, + true, OPTAB_LIB_WIDEN); + if (tmp != offset) + emit_move_insn (offset, tmp); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 2) + { + rtx_code_label *label = ix86_expand_aligntest (count, 2, true); + tmp = gen_rtx_PLUS (Pmode, srcptr, offset); + src = change_address (srcmem, HImode, tmp); + tmp = gen_rtx_PLUS (Pmode, destptr, offset); + dest = change_address (destmem, HImode, tmp); + emit_move_insn (dest, src); + tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp, + true, OPTAB_LIB_WIDEN); + if (tmp != offset) + emit_move_insn (offset, tmp); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 1) + { + rtx_code_label *label = ix86_expand_aligntest (count, 1, true); + tmp = gen_rtx_PLUS (Pmode, srcptr, offset); + src = change_address (srcmem, QImode, tmp); + tmp = gen_rtx_PLUS (Pmode, destptr, offset); + dest = change_address (destmem, QImode, tmp); + emit_move_insn (dest, src); + emit_label (label); + LABEL_NUSES (label) = 1; + } + } +} + +/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM + with value PROMOTED_VAL. + SRC is passed by pointer to be updated on return. + Return value is updated DST. */ +static rtx +emit_memset (rtx destmem, rtx destptr, rtx promoted_val, + HOST_WIDE_INT size_to_move) +{ + rtx dst = destmem, adjust; + enum insn_code code; + machine_mode move_mode; + int piece_size, i; + + /* Find the widest mode in which we could perform moves. + Start with the biggest power of 2 less than SIZE_TO_MOVE and half + it until move of such size is supported. 
*/ + move_mode = GET_MODE (promoted_val); + if (move_mode == VOIDmode) + move_mode = QImode; + if (size_to_move < GET_MODE_SIZE (move_mode)) + { + unsigned int move_bits = size_to_move * BITS_PER_UNIT; + move_mode = int_mode_for_size (move_bits, 0).require (); + promoted_val = gen_lowpart (move_mode, promoted_val); + } + piece_size = GET_MODE_SIZE (move_mode); + code = optab_handler (mov_optab, move_mode); + gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX); + + dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0); + + /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */ + gcc_assert (size_to_move % piece_size == 0); + adjust = GEN_INT (piece_size); + for (i = 0; i < size_to_move; i += piece_size) + { + if (piece_size <= GET_MODE_SIZE (word_mode)) + { + emit_insn (gen_strset (destptr, dst, promoted_val)); + dst = adjust_automodify_address_nv (dst, move_mode, destptr, + piece_size); + continue; + } + + emit_insn (GEN_FCN (code) (dst, promoted_val)); + + emit_move_insn (destptr, + gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust)); + + dst = adjust_automodify_address_nv (dst, move_mode, destptr, + piece_size); + } + + /* Update DST rtx. */ + return dst; +} +/* Output code to set at most count & (max_size - 1) bytes starting by DEST. */ +static void +expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value, + rtx count, int max_size) +{ + count = expand_simple_binop (counter_mode (count), AND, count, + GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT); + expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL, + gen_lowpart (QImode, value), count, QImode, + 1, max_size / 2, true); +} + +/* Output code to set at most count & (max_size - 1) bytes starting by DEST. */ +static void +expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value, + rtx count, int max_size) +{ + rtx dest; + + if (CONST_INT_P (count)) + { + HOST_WIDE_INT countval = INTVAL (count); + HOST_WIDE_INT epilogue_size = countval % max_size; + int i; + + /* For now MAX_SIZE should be a power of 2. This assert could be + relaxed, but it'll require a bit more complicated epilogue + expanding. 
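+	 Illustration (hypothetical sizes): with MAX_SIZE == 32 and a
+	 remainder of 20, the loop emits a 16-byte and a 4-byte store
+	 (20 == 16 + 4), using VEC_VALUE for the 16-byte piece when a
+	 vector value is available and VALUE for the 4-byte piece.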
*/ + gcc_assert ((max_size & (max_size - 1)) == 0); + for (i = max_size; i >= 1; i >>= 1) + { + if (epilogue_size & i) + { + if (vec_value && i > GET_MODE_SIZE (GET_MODE (value))) + destmem = emit_memset (destmem, destptr, vec_value, i); + else + destmem = emit_memset (destmem, destptr, value, i); + } + } + return; + } + if (max_size > 32) + { + expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size); + return; + } + if (max_size > 16) + { + rtx_code_label *label = ix86_expand_aligntest (count, 16, true); + if (TARGET_64BIT) + { + dest = change_address (destmem, DImode, destptr); + emit_insn (gen_strset (destptr, dest, value)); + dest = adjust_automodify_address_nv (dest, DImode, destptr, 8); + emit_insn (gen_strset (destptr, dest, value)); + } + else + { + dest = change_address (destmem, SImode, destptr); + emit_insn (gen_strset (destptr, dest, value)); + dest = adjust_automodify_address_nv (dest, SImode, destptr, 4); + emit_insn (gen_strset (destptr, dest, value)); + dest = adjust_automodify_address_nv (dest, SImode, destptr, 8); + emit_insn (gen_strset (destptr, dest, value)); + dest = adjust_automodify_address_nv (dest, SImode, destptr, 12); + emit_insn (gen_strset (destptr, dest, value)); + } + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 8) + { + rtx_code_label *label = ix86_expand_aligntest (count, 8, true); + if (TARGET_64BIT) + { + dest = change_address (destmem, DImode, destptr); + emit_insn (gen_strset (destptr, dest, value)); + } + else + { + dest = change_address (destmem, SImode, destptr); + emit_insn (gen_strset (destptr, dest, value)); + dest = adjust_automodify_address_nv (dest, SImode, destptr, 4); + emit_insn (gen_strset (destptr, dest, value)); + } + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 4) + { + rtx_code_label *label = ix86_expand_aligntest (count, 4, true); + dest = change_address (destmem, SImode, destptr); + emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 2) + { + rtx_code_label *label = ix86_expand_aligntest (count, 2, true); + dest = change_address (destmem, HImode, destptr); + emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 1) + { + rtx_code_label *label = ix86_expand_aligntest (count, 1, true); + dest = change_address (destmem, QImode, destptr); + emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value))); + emit_label (label); + LABEL_NUSES (label) = 1; + } +} + +/* Adjust COUNTER by the VALUE. */ +static void +ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value) +{ + rtx (*gen_add)(rtx, rtx, rtx) + = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3; + + emit_insn (gen_add (countreg, countreg, GEN_INT (-value))); +} + +/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to + DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN. + Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are + ignored. + Return value is updated DESTMEM. 
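+   Illustration (hypothetical alignments): with ALIGN == 1 and
+   DESIRED_ALIGNMENT == 16 the loop emits guarded 1-, 2-, 4- and 8-byte
+   copies (or stores), so at most 15 bytes are processed before DESTPTR
+   becomes 16-byte aligned.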
*/ + +static rtx +expand_set_or_movmem_prologue (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, rtx value, + rtx vec_value, rtx count, int align, + int desired_alignment, bool issetmem) +{ + int i; + for (i = 1; i < desired_alignment; i <<= 1) + { + if (align <= i) + { + rtx_code_label *label = ix86_expand_aligntest (destptr, i, false); + if (issetmem) + { + if (vec_value && i > GET_MODE_SIZE (GET_MODE (value))) + destmem = emit_memset (destmem, destptr, vec_value, i); + else + destmem = emit_memset (destmem, destptr, value, i); + } + else + destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i); + ix86_adjust_counter (count, i); + emit_label (label); + LABEL_NUSES (label) = 1; + set_mem_align (destmem, i * 2 * BITS_PER_UNIT); + } + } + return destmem; +} + +/* Test if COUNT&SIZE is nonzero and if so, expand movme + or setmem sequence that is valid for SIZE..2*SIZE-1 bytes + and jump to DONE_LABEL. */ +static void +expand_small_movmem_or_setmem (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, + rtx value, rtx vec_value, + rtx count, int size, + rtx done_label, bool issetmem) +{ + rtx_code_label *label = ix86_expand_aligntest (count, size, false); + machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk (); + rtx modesize; + int n; + + /* If we do not have vector value to copy, we must reduce size. */ + if (issetmem) + { + if (!vec_value) + { + if (GET_MODE (value) == VOIDmode && size > 8) + mode = Pmode; + else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value))) + mode = GET_MODE (value); + } + else + mode = GET_MODE (vec_value), value = vec_value; + } + else + { + /* Choose appropriate vector mode. */ + if (size >= 32) + mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode; + else if (size >= 16) + mode = TARGET_SSE ? V16QImode : DImode; + srcmem = change_address (srcmem, mode, srcptr); + } + destmem = change_address (destmem, mode, destptr); + modesize = GEN_INT (GET_MODE_SIZE (mode)); + gcc_assert (GET_MODE_SIZE (mode) <= size); + for (n = 0; n * GET_MODE_SIZE (mode) < size; n++) + { + if (issetmem) + emit_move_insn (destmem, gen_lowpart (mode, value)); + else + { + emit_move_insn (destmem, srcmem); + srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode)); + } + destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode)); + } + + destmem = offset_address (destmem, count, 1); + destmem = offset_address (destmem, GEN_INT (-2 * size), + GET_MODE_SIZE (mode)); + if (!issetmem) + { + srcmem = offset_address (srcmem, count, 1); + srcmem = offset_address (srcmem, GEN_INT (-2 * size), + GET_MODE_SIZE (mode)); + } + for (n = 0; n * GET_MODE_SIZE (mode) < size; n++) + { + if (issetmem) + emit_move_insn (destmem, gen_lowpart (mode, value)); + else + { + emit_move_insn (destmem, srcmem); + srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode)); + } + destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode)); + } + emit_jump_insn (gen_jump (done_label)); + emit_barrier (); + + emit_label (label); + LABEL_NUSES (label) = 1; +} + +/* Handle small memcpy (up to SIZE that is supposed to be small power of 2. + and get ready for the main memcpy loop by copying iniital DESIRED_ALIGN-ALIGN + bytes and last SIZE bytes adjusitng DESTPTR/SRCPTR/COUNT in a way we can + proceed with an loop copying SIZE bytes at once. Do moves in MODE. + DONE_LABEL is a label after the whole copying sequence. The label is created + on demand if *DONE_LABEL is NULL. + MIN_SIZE is minimal size of block copied. 
This value gets adjusted for new + bounds after the initial copies. + + DESTMEM/SRCMEM are memory expressions pointing to the copies block, + DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicate whether + we will dispatch to a library call for large blocks. + + In pseudocode we do: + + if (COUNT < SIZE) + { + Assume that SIZE is 4. Bigger sizes are handled analogously + if (COUNT & 4) + { + copy 4 bytes from SRCPTR to DESTPTR + copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4 + goto done_label + } + if (!COUNT) + goto done_label; + copy 1 byte from SRCPTR to DESTPTR + if (COUNT & 2) + { + copy 2 bytes from SRCPTR to DESTPTR + copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2 + } + } + else + { + copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR + copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE + + OLD_DESPTR = DESTPTR; + Align DESTPTR up to DESIRED_ALIGN + SRCPTR += DESTPTR - OLD_DESTPTR + COUNT -= DEST_PTR - OLD_DESTPTR + if (DYNAMIC_CHECK) + Round COUNT down to multiple of SIZE + << optional caller supplied zero size guard is here >> + << optional caller supplied dynamic check is here >> + << caller supplied main copy loop is here >> + } + done_label: + */ +static void +expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem, + rtx *destptr, rtx *srcptr, + machine_mode mode, + rtx value, rtx vec_value, + rtx *count, + rtx_code_label **done_label, + int size, + int desired_align, + int align, + unsigned HOST_WIDE_INT *min_size, + bool dynamic_check, + bool issetmem) +{ + rtx_code_label *loop_label = NULL, *label; + int n; + rtx modesize; + int prolog_size = 0; + rtx mode_value; + + /* Chose proper value to copy. */ + if (issetmem && VECTOR_MODE_P (mode)) + mode_value = vec_value; + else + mode_value = value; + gcc_assert (GET_MODE_SIZE (mode) <= size); + + /* See if block is big or small, handle small blocks. */ + if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size) + { + int size2 = size; + loop_label = gen_label_rtx (); + + if (!*done_label) + *done_label = gen_label_rtx (); + + emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count), + 1, loop_label); + size2 >>= 1; + + /* Handle sizes > 3. */ + for (;size2 > 2; size2 >>= 1) + expand_small_movmem_or_setmem (destmem, srcmem, + *destptr, *srcptr, + value, vec_value, + *count, + size2, *done_label, issetmem); + /* Nothing to copy? Jump to DONE_LABEL if so */ + emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count), + 1, *done_label); + + /* Do a byte copy. */ + destmem = change_address (destmem, QImode, *destptr); + if (issetmem) + emit_move_insn (destmem, gen_lowpart (QImode, value)); + else + { + srcmem = change_address (srcmem, QImode, *srcptr); + emit_move_insn (destmem, srcmem); + } + + /* Handle sizes 2 and 3. 
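+	 That is, a single 2-byte move at DESTPTR + COUNT - 2 (guarded by
+	 the COUNT & 2 test below) finishes the tail; when COUNT is 2 it
+	 simply overlaps the byte already copied above.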
*/ + label = ix86_expand_aligntest (*count, 2, false); + destmem = change_address (destmem, HImode, *destptr); + destmem = offset_address (destmem, *count, 1); + destmem = offset_address (destmem, GEN_INT (-2), 2); + if (issetmem) + emit_move_insn (destmem, gen_lowpart (HImode, value)); + else + { + srcmem = change_address (srcmem, HImode, *srcptr); + srcmem = offset_address (srcmem, *count, 1); + srcmem = offset_address (srcmem, GEN_INT (-2), 2); + emit_move_insn (destmem, srcmem); + } + + emit_label (label); + LABEL_NUSES (label) = 1; + emit_jump_insn (gen_jump (*done_label)); + emit_barrier (); + } + else + gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size + || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size); + + /* Start memcpy for COUNT >= SIZE. */ + if (loop_label) + { + emit_label (loop_label); + LABEL_NUSES (loop_label) = 1; + } + + /* Copy first desired_align bytes. */ + if (!issetmem) + srcmem = change_address (srcmem, mode, *srcptr); + destmem = change_address (destmem, mode, *destptr); + modesize = GEN_INT (GET_MODE_SIZE (mode)); + for (n = 0; prolog_size < desired_align - align; n++) + { + if (issetmem) + emit_move_insn (destmem, mode_value); + else + { + emit_move_insn (destmem, srcmem); + srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode)); + } + destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode)); + prolog_size += GET_MODE_SIZE (mode); + } + + + /* Copy last SIZE bytes. */ + destmem = offset_address (destmem, *count, 1); + destmem = offset_address (destmem, + GEN_INT (-size - prolog_size), + 1); + if (issetmem) + emit_move_insn (destmem, mode_value); + else + { + srcmem = offset_address (srcmem, *count, 1); + srcmem = offset_address (srcmem, + GEN_INT (-size - prolog_size), + 1); + emit_move_insn (destmem, srcmem); + } + for (n = 1; n * GET_MODE_SIZE (mode) < size; n++) + { + destmem = offset_address (destmem, modesize, 1); + if (issetmem) + emit_move_insn (destmem, mode_value); + else + { + srcmem = offset_address (srcmem, modesize, 1); + emit_move_insn (destmem, srcmem); + } + } + + /* Align destination. */ + if (desired_align > 1 && desired_align > align) + { + rtx saveddest = *destptr; + + gcc_assert (desired_align <= size); + /* Align destptr up, place it to new register. */ + *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr, + GEN_INT (prolog_size), + NULL_RTX, 1, OPTAB_DIRECT); + if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest)) + REG_POINTER (*destptr) = 1; + *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr, + GEN_INT (-desired_align), + *destptr, 1, OPTAB_DIRECT); + /* See how many bytes we skipped. */ + saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest, + *destptr, + saveddest, 1, OPTAB_DIRECT); + /* Adjust srcptr and count. */ + if (!issetmem) + *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr, + saveddest, *srcptr, 1, OPTAB_DIRECT); + *count = expand_simple_binop (GET_MODE (*count), PLUS, *count, + saveddest, *count, 1, OPTAB_DIRECT); + /* We copied at most size + prolog_size. */ + if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size)) + *min_size + = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size); + else + *min_size = 0; + + /* Our loops always round down the block size, but for dispatch to + library we need precise value. 
*/ + if (dynamic_check) + *count = expand_simple_binop (GET_MODE (*count), AND, *count, + GEN_INT (-size), *count, 1, OPTAB_DIRECT); + } + else + { + gcc_assert (prolog_size == 0); + /* Decrease count, so we won't end up copying last word twice. */ + if (!CONST_INT_P (*count)) + *count = expand_simple_binop (GET_MODE (*count), PLUS, *count, + constm1_rtx, *count, 1, OPTAB_DIRECT); + else + *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1, + (unsigned HOST_WIDE_INT)size)); + if (*min_size) + *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size); + } +} + + +/* This function is like the previous one, except here we know how many bytes + need to be copied. That allows us to update alignment not only of DST, which + is returned, but also of SRC, which is passed as a pointer for that + reason. */ +static rtx +expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg, + rtx srcreg, rtx value, rtx vec_value, + int desired_align, int align_bytes, + bool issetmem) +{ + rtx src = NULL; + rtx orig_dst = dst; + rtx orig_src = NULL; + int piece_size = 1; + int copied_bytes = 0; + + if (!issetmem) + { + gcc_assert (srcp != NULL); + src = *srcp; + orig_src = src; + } + + for (piece_size = 1; + piece_size <= desired_align && copied_bytes < align_bytes; + piece_size <<= 1) + { + if (align_bytes & piece_size) + { + if (issetmem) + { + if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value))) + dst = emit_memset (dst, destreg, vec_value, piece_size); + else + dst = emit_memset (dst, destreg, value, piece_size); + } + else + dst = emit_memmov (dst, &src, destreg, srcreg, piece_size); + copied_bytes += piece_size; + } + } + if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT) + set_mem_align (dst, desired_align * BITS_PER_UNIT); + if (MEM_SIZE_KNOWN_P (orig_dst)) + set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes); + + if (!issetmem) + { + int src_align_bytes = get_mem_align_offset (src, desired_align + * BITS_PER_UNIT); + if (src_align_bytes >= 0) + src_align_bytes = desired_align - src_align_bytes; + if (src_align_bytes >= 0) + { + unsigned int src_align; + for (src_align = desired_align; src_align >= 2; src_align >>= 1) + { + if ((src_align_bytes & (src_align - 1)) + == (align_bytes & (src_align - 1))) + break; + } + if (src_align > (unsigned int) desired_align) + src_align = desired_align; + if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT) + set_mem_align (src, src_align * BITS_PER_UNIT); + } + if (MEM_SIZE_KNOWN_P (orig_src)) + set_mem_size (src, MEM_SIZE (orig_src) - align_bytes); + *srcp = src; + } + + return dst; +} + +/* Return true if ALG can be used in current context. + Assume we expand memset if MEMSET is true. */ +static bool +alg_usable_p (enum stringop_alg alg, bool memset, bool have_as) +{ + if (alg == no_stringop) + return false; + if (alg == vector_loop) + return TARGET_SSE || TARGET_AVX; + /* Algorithms using the rep prefix want at least edi and ecx; + additionally, memset wants eax and memcpy wants esi. Don't + consider such algorithms if the user has appropriated those + registers for their own purposes, or if we have a non-default + address space, since some string insns cannot override the segment. */ + if (alg == rep_prefix_1_byte + || alg == rep_prefix_4_byte + || alg == rep_prefix_8_byte) + { + if (have_as) + return false; + if (fixed_regs[CX_REG] + || fixed_regs[DI_REG] + || (memset ? 
fixed_regs[AX_REG] : fixed_regs[SI_REG])) + return false; + } + return true; +} + +/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */ +static enum stringop_alg +decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, + unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size, + bool memset, bool zero_memset, bool have_as, + int *dynamic_check, bool *noalign, bool recur) +{ + const struct stringop_algs *algs; + bool optimize_for_speed; + int max = 0; + const struct processor_costs *cost; + int i; + bool any_alg_usable_p = false; + + *noalign = false; + *dynamic_check = -1; + + /* Even if the string operation call is cold, we still might spend a lot + of time processing large blocks. */ + if (optimize_function_for_size_p (cfun) + || (optimize_insn_for_size_p () + && (max_size < 256 + || (expected_size != -1 && expected_size < 256)))) + optimize_for_speed = false; + else + optimize_for_speed = true; + + cost = optimize_for_speed ? ix86_cost : &ix86_size_cost; + if (memset) + algs = &cost->memset[TARGET_64BIT != 0]; + else + algs = &cost->memcpy[TARGET_64BIT != 0]; + + /* See maximal size for user defined algorithm. */ + for (i = 0; i < MAX_STRINGOP_ALGS; i++) + { + enum stringop_alg candidate = algs->size[i].alg; + bool usable = alg_usable_p (candidate, memset, have_as); + any_alg_usable_p |= usable; + + if (candidate != libcall && candidate && usable) + max = algs->size[i].max; + } + + /* If expected size is not known but max size is small enough + so inline version is a win, set expected size into + the range. */ + if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1) + && expected_size == -1) + expected_size = min_size / 2 + max_size / 2; + + /* If user specified the algorithm, honor it if possible. */ + if (ix86_stringop_alg != no_stringop + && alg_usable_p (ix86_stringop_alg, memset, have_as)) + return ix86_stringop_alg; + /* rep; movq or rep; movl is the smallest variant. */ + else if (!optimize_for_speed) + { + *noalign = true; + if (!count || (count & 3) || (memset && !zero_memset)) + return alg_usable_p (rep_prefix_1_byte, memset, have_as) + ? rep_prefix_1_byte : loop_1_byte; + else + return alg_usable_p (rep_prefix_4_byte, memset, have_as) + ? rep_prefix_4_byte : loop; + } + /* Very tiny blocks are best handled via the loop, REP is expensive to + setup. */ + else if (expected_size != -1 && expected_size < 4) + return loop_1_byte; + else if (expected_size != -1) + { + enum stringop_alg alg = libcall; + bool alg_noalign = false; + for (i = 0; i < MAX_STRINGOP_ALGS; i++) + { + /* We get here if the algorithms that were not libcall-based + were rep-prefix based and we are unable to use rep prefixes + based on global register usage. Break out of the loop and + use the heuristic below. */ + if (algs->size[i].max == 0) + break; + if (algs->size[i].max >= expected_size || algs->size[i].max == -1) + { + enum stringop_alg candidate = algs->size[i].alg; + + if (candidate != libcall + && alg_usable_p (candidate, memset, have_as)) + { + alg = candidate; + alg_noalign = algs->size[i].noalign; + } + /* Honor TARGET_INLINE_ALL_STRINGOPS by picking + last non-libcall inline algorithm. */ + if (TARGET_INLINE_ALL_STRINGOPS) + { + /* When the current size is best to be copied by a libcall, + but we are still forced to inline, run the heuristic below + that will pick code for medium sized blocks. 
*/ + if (alg != libcall) + { + *noalign = alg_noalign; + return alg; + } + else if (!any_alg_usable_p) + break; + } + else if (alg_usable_p (candidate, memset, have_as)) + { + *noalign = algs->size[i].noalign; + return candidate; + } + } + } + } + /* When asked to inline the call anyway, try to pick meaningful choice. + We look for maximal size of block that is faster to copy by hand and + take blocks of at most of that size guessing that average size will + be roughly half of the block. + + If this turns out to be bad, we might simply specify the preferred + choice in ix86_costs. */ + if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY) + && (algs->unknown_size == libcall + || !alg_usable_p (algs->unknown_size, memset, have_as))) + { + enum stringop_alg alg; + HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2; + + /* If there aren't any usable algorithms or if recursing already, + then recursing on smaller sizes or same size isn't going to + find anything. Just return the simple byte-at-a-time copy loop. */ + if (!any_alg_usable_p || recur) + { + /* Pick something reasonable. */ + if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur) + *dynamic_check = 128; + return loop_1_byte; + } + alg = decide_alg (count, new_expected_size, min_size, max_size, memset, + zero_memset, have_as, dynamic_check, noalign, true); + gcc_assert (*dynamic_check == -1); + if (TARGET_INLINE_STRINGOPS_DYNAMICALLY) + *dynamic_check = max; + else + gcc_assert (alg != libcall); + return alg; + } + return (alg_usable_p (algs->unknown_size, memset, have_as) + ? algs->unknown_size : libcall); +} + +/* Decide on alignment. We know that the operand is already aligned to ALIGN + (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */ +static int +decide_alignment (int align, + enum stringop_alg alg, + int expected_size, + machine_mode move_mode) +{ + int desired_align = 0; + + gcc_assert (alg != no_stringop); + + if (alg == libcall) + return 0; + if (move_mode == VOIDmode) + return 0; + + desired_align = GET_MODE_SIZE (move_mode); + /* PentiumPro has special logic triggering for 8 byte aligned blocks. + copying whole cacheline at once. */ + if (TARGET_PENTIUMPRO + && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte)) + desired_align = 8; + + if (optimize_size) + desired_align = 1; + if (desired_align < align) + desired_align = align; + if (expected_size != -1 && expected_size < 4) + desired_align = align; + + return desired_align; +} + + +/* Helper function for memcpy. For QImode value 0xXY produce + 0xXYXYXYXY of wide specified by MODE. This is essentially + a * 0x10101010, but we can do slightly better than + synth_mult by unwinding the sequence by hand on CPUs with + slow multiply. */ +static rtx +promote_duplicated_reg (machine_mode mode, rtx val) +{ + machine_mode valmode = GET_MODE (val); + rtx tmp; + int nops = mode == DImode ? 3 : 2; + + gcc_assert (mode == SImode || mode == DImode || val == const0_rtx); + if (val == const0_rtx) + return copy_to_mode_reg (mode, CONST0_RTX (mode)); + if (CONST_INT_P (val)) + { + HOST_WIDE_INT v = INTVAL (val) & 255; + + v |= v << 8; + v |= v << 16; + if (mode == DImode) + v |= (v << 16) << 16; + return copy_to_mode_reg (mode, gen_int_mode (v, mode)); + } + + if (valmode == VOIDmode) + valmode = QImode; + if (valmode != QImode) + val = gen_lowpart (QImode, val); + if (mode == QImode) + return val; + if (!TARGET_PARTIAL_REG_STALL) + nops--; + if (ix86_cost->mult_init[mode == DImode ? 
3 : 2] + + ix86_cost->mult_bit * (mode == DImode ? 8 : 4) + <= (ix86_cost->shift_const + ix86_cost->add) * nops + + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0))) + { + rtx reg = convert_modes (mode, QImode, val, true); + tmp = promote_duplicated_reg (mode, const1_rtx); + return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1, + OPTAB_DIRECT); + } + else + { + rtx reg = convert_modes (mode, QImode, val, true); + + if (!TARGET_PARTIAL_REG_STALL) + if (mode == SImode) + emit_insn (gen_insvsi_1 (reg, reg)); + else + emit_insn (gen_insvdi_1 (reg, reg)); + else + { + tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8), + NULL, 1, OPTAB_DIRECT); + reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, + OPTAB_DIRECT); + } + tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16), + NULL, 1, OPTAB_DIRECT); + reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); + if (mode == SImode) + return reg; + tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32), + NULL, 1, OPTAB_DIRECT); + reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); + return reg; + } +} + +/* Duplicate value VAL using promote_duplicated_reg into maximal size that will + be needed by main loop copying SIZE_NEEDED chunks and prologue getting + alignment from ALIGN to DESIRED_ALIGN. */ +static rtx +promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, + int align) +{ + rtx promoted_val; + + if (TARGET_64BIT + && (size_needed > 4 || (desired_align > align && desired_align > 4))) + promoted_val = promote_duplicated_reg (DImode, val); + else if (size_needed > 2 || (desired_align > align && desired_align > 2)) + promoted_val = promote_duplicated_reg (SImode, val); + else if (size_needed > 1 || (desired_align > align && desired_align > 1)) + promoted_val = promote_duplicated_reg (HImode, val); + else + promoted_val = val; + + return promoted_val; +} + +/* Copy the address to a Pmode register. This is used for x32 to + truncate DImode TLS address to a SImode register. */ + +static rtx +ix86_copy_addr_to_reg (rtx addr) +{ + rtx reg; + if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode) + { + reg = copy_addr_to_reg (addr); + REG_POINTER (reg) = 1; + return reg; + } + else + { + gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode); + reg = copy_to_mode_reg (DImode, addr); + REG_POINTER (reg) = 1; + return gen_rtx_SUBREG (SImode, reg, 0); + } +} + +/* Expand string move (memcpy) ot store (memset) operation. Use i386 string + operations when profitable. The code depends upon architecture, block size + and alignment, but always has one of the following overall structures: + + Aligned move sequence: + + 1) Prologue guard: Conditional that jumps up to epilogues for small + blocks that can be handled by epilogue alone. This is faster + but also needed for correctness, since prologue assume the block + is larger than the desired alignment. + + Optional dynamic check for size and libcall for large + blocks is emitted here too, with -minline-stringops-dynamically. + + 2) Prologue: copy first few bytes in order to get destination + aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less + than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be + copied. We emit either a jump tree on power of two sized + blocks, or a byte loop. + + 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks + with specified algorithm. 
+
+   4) Epilogue: code copying tail of the block that is too small to be
+      handled by main body (or up to size guarded by prologue guard).
+
+  Misaligned move sequence:
+
+   1) Misaligned move prologue/epilogue containing:
+      a) Prologue handling small memory blocks and jumping to done_label
+	 (skipped if blocks are known to be large enough)
+      b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
+         needed by single possibly misaligned move
+	 (skipped if alignment is not needed)
+      c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
+
+   2) Zero size guard dispatching to done_label, if needed
+
+   3) Dispatch to library call, if needed.
+
+   4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
+      with specified algorithm.  */
+bool
+ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
+			   rtx align_exp, rtx expected_align_exp,
+			   rtx expected_size_exp, rtx min_size_exp,
+			   rtx max_size_exp, rtx probable_max_size_exp,
+			   bool issetmem)
+{
+  rtx destreg;
+  rtx srcreg = NULL;
+  rtx_code_label *label = NULL;
+  rtx tmp;
+  rtx_code_label *jump_around_label = NULL;
+  HOST_WIDE_INT align = 1;
+  unsigned HOST_WIDE_INT count = 0;
+  HOST_WIDE_INT expected_size = -1;
+  int size_needed = 0, epilogue_size_needed;
+  int desired_align = 0, align_bytes = 0;
+  enum stringop_alg alg;
+  rtx promoted_val = NULL;
+  rtx vec_promoted_val = NULL;
+  bool force_loopy_epilogue = false;
+  int dynamic_check;
+  bool need_zero_guard = false;
+  bool noalign;
+  machine_mode move_mode = VOIDmode;
+  machine_mode wider_mode;
+  int unroll_factor = 1;
+  /* TODO: Once value ranges are available, fill in proper data.  */
+  unsigned HOST_WIDE_INT min_size = 0;
+  unsigned HOST_WIDE_INT max_size = -1;
+  unsigned HOST_WIDE_INT probable_max_size = -1;
+  bool misaligned_prologue_used = false;
+  bool have_as;
+
+  if (CONST_INT_P (align_exp))
+    align = INTVAL (align_exp);
+  /* i386 can do misaligned access at a reasonably increased cost.  */
+  if (CONST_INT_P (expected_align_exp)
+      && INTVAL (expected_align_exp) > align)
+    align = INTVAL (expected_align_exp);
+  /* ALIGN is the minimum of destination and source alignment, but we care here
+     just about destination alignment.  */
+  else if (!issetmem
+	   && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
+    align = MEM_ALIGN (dst) / BITS_PER_UNIT;
+
+  if (CONST_INT_P (count_exp))
+    {
+      min_size = max_size = probable_max_size = count = expected_size
+	= INTVAL (count_exp);
+      /* When COUNT is 0, there is nothing to do.  */
+      if (!count)
+	return true;
+    }
+  else
+    {
+      if (min_size_exp)
+	min_size = INTVAL (min_size_exp);
+      if (max_size_exp)
+	max_size = INTVAL (max_size_exp);
+      if (probable_max_size_exp)
+	probable_max_size = INTVAL (probable_max_size_exp);
+      if (CONST_INT_P (expected_size_exp))
+	expected_size = INTVAL (expected_size_exp);
+    }
+
+  /* Make sure we don't need to care about overflow later on.  */
+  if (count > (HOST_WIDE_INT_1U << 30))
+    return false;
+
+  have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
+  if (!issetmem)
+    have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
+
+  /* Step 0: Decide on preferred algorithm, desired alignment and
+     size of chunks to be copied by main loop.
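+     For instance (illustrative): on a 64-bit target the unrolled_loop
+     strategy keeps MOVE_MODE at word_mode (DImode) and uses an unroll
+     factor of 4, so SIZE_NEEDED computed below is 32 bytes per iteration
+     of the main loop.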
*/ + alg = decide_alg (count, expected_size, min_size, probable_max_size, + issetmem, + issetmem && val_exp == const0_rtx, have_as, + &dynamic_check, &noalign, false); + + if (dump_file) + fprintf (dump_file, "Selected stringop expansion strategy: %s\n", + stringop_alg_names[alg]); + + if (alg == libcall) + return false; + gcc_assert (alg != no_stringop); + + /* For now vector-version of memset is generated only for memory zeroing, as + creating of promoted vector value is very cheap in this case. */ + if (issetmem && alg == vector_loop && val_exp != const0_rtx) + alg = unrolled_loop; + + if (!count) + count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp); + destreg = ix86_copy_addr_to_reg (XEXP (dst, 0)); + if (!issetmem) + srcreg = ix86_copy_addr_to_reg (XEXP (src, 0)); + + unroll_factor = 1; + move_mode = word_mode; + switch (alg) + { + case libcall: + case no_stringop: + case last_alg: + gcc_unreachable (); + case loop_1_byte: + need_zero_guard = true; + move_mode = QImode; + break; + case loop: + need_zero_guard = true; + break; + case unrolled_loop: + need_zero_guard = true; + unroll_factor = (TARGET_64BIT ? 4 : 2); + break; + case vector_loop: + need_zero_guard = true; + unroll_factor = 4; + /* Find the widest supported mode. */ + move_mode = word_mode; + while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode) + && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing) + move_mode = wider_mode; + + if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128) + move_mode = TImode; + + /* Find the corresponding vector mode with the same size as MOVE_MODE. + MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */ + if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode)) + { + int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode); + if (!mode_for_vector (word_mode, nunits).exists (&move_mode) + || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing) + move_mode = word_mode; + } + gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing); + break; + case rep_prefix_8_byte: + move_mode = DImode; + break; + case rep_prefix_4_byte: + move_mode = SImode; + break; + case rep_prefix_1_byte: + move_mode = QImode; + break; + } + size_needed = GET_MODE_SIZE (move_mode) * unroll_factor; + epilogue_size_needed = size_needed; + + /* If we are going to call any library calls conditionally, make sure any + pending stack adjustment happen before the first conditional branch, + otherwise they will be emitted before the library call only and won't + happen from the other branches. */ + if (dynamic_check != -1) + do_pending_stack_adjust (); + + desired_align = decide_alignment (align, alg, expected_size, move_mode); + if (!TARGET_ALIGN_STRINGOPS || noalign) + align = desired_align; + + /* Step 1: Prologue guard. */ + + /* Alignment code needs count to be in register. */ + if (CONST_INT_P (count_exp) && desired_align > align) + { + if (INTVAL (count_exp) > desired_align + && INTVAL (count_exp) > size_needed) + { + align_bytes + = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT); + if (align_bytes <= 0) + align_bytes = 0; + else + align_bytes = desired_align - align_bytes; + } + if (align_bytes == 0) + count_exp = force_reg (counter_mode (count_exp), count_exp); + } + gcc_assert (desired_align >= 1 && align >= 1); + + /* Misaligned move sequences handle both prologue and epilogue at once. + Default code generation results in a smaller code for large alignments + and also avoids redundant job when sizes are known precisely. 
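+     (Roughly, per the condition below: this path is used only when both
+     DESIRED_ALIGN and EPILOGUE_SIZE_NEEDED fit in 32 bytes and either
+     extra alignment is needed without a compile-time-known prologue size,
+     or the byte count itself is not a compile-time constant.)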
*/ + misaligned_prologue_used + = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES + && MAX (desired_align, epilogue_size_needed) <= 32 + && desired_align <= epilogue_size_needed + && ((desired_align > align && !align_bytes) + || (!count && epilogue_size_needed > 1))); + + /* Do the cheap promotion to allow better CSE across the + main loop and epilogue (ie one load of the big constant in the + front of all code. + For now the misaligned move sequences do not have fast path + without broadcasting. */ + if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used))) + { + if (alg == vector_loop) + { + gcc_assert (val_exp == const0_rtx); + vec_promoted_val = promote_duplicated_reg (move_mode, val_exp); + promoted_val = promote_duplicated_reg_to_size (val_exp, + GET_MODE_SIZE (word_mode), + desired_align, align); + } + else + { + promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, + desired_align, align); + } + } + /* Misaligned move sequences handles both prologues and epilogues at once. + Default code generation results in smaller code for large alignments and + also avoids redundant job when sizes are known precisely. */ + if (misaligned_prologue_used) + { + /* Misaligned move prologue handled small blocks by itself. */ + expand_set_or_movmem_prologue_epilogue_by_misaligned_moves + (dst, src, &destreg, &srcreg, + move_mode, promoted_val, vec_promoted_val, + &count_exp, + &jump_around_label, + desired_align < align + ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed, + desired_align, align, &min_size, dynamic_check, issetmem); + if (!issetmem) + src = change_address (src, BLKmode, srcreg); + dst = change_address (dst, BLKmode, destreg); + set_mem_align (dst, desired_align * BITS_PER_UNIT); + epilogue_size_needed = 0; + if (need_zero_guard + && min_size < (unsigned HOST_WIDE_INT) size_needed) + { + /* It is possible that we copied enough so the main loop will not + execute. */ + gcc_assert (size_needed > 1); + if (jump_around_label == NULL_RTX) + jump_around_label = gen_label_rtx (); + emit_cmp_and_jump_insns (count_exp, + GEN_INT (size_needed), + LTU, 0, counter_mode (count_exp), 1, jump_around_label); + if (expected_size == -1 + || expected_size < (desired_align - align) / 2 + size_needed) + predict_jump (REG_BR_PROB_BASE * 20 / 100); + else + predict_jump (REG_BR_PROB_BASE * 60 / 100); + } + } + /* Ensure that alignment prologue won't copy past end of block. */ + else if (size_needed > 1 || (desired_align > 1 && desired_align > align)) + { + epilogue_size_needed = MAX (size_needed - 1, desired_align - align); + /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes. + Make sure it is power of 2. */ + epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1); + + /* To improve performance of small blocks, we jump around the VAL + promoting mode. This mean that if the promoted VAL is not constant, + we might not use it in the epilogue and have to use byte + loop variant. */ + if (issetmem && epilogue_size_needed > 2 && !promoted_val) + force_loopy_epilogue = true; + if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed) + || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed) + { + /* If main algorithm works on QImode, no epilogue is needed. + For small sizes just don't align anything. 
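+	 When size_needed is 1 the byte-sized main loop already handles
+	 every byte, so only the alignment request is dropped; otherwise the
+	 whole block is known to fit into the epilogue and control jumps
+	 straight to it.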
*/ + if (size_needed == 1) + desired_align = align; + else + goto epilogue; + } + else if (!count + && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed) + { + label = gen_label_rtx (); + emit_cmp_and_jump_insns (count_exp, + GEN_INT (epilogue_size_needed), + LTU, 0, counter_mode (count_exp), 1, label); + if (expected_size == -1 || expected_size < epilogue_size_needed) + predict_jump (REG_BR_PROB_BASE * 60 / 100); + else + predict_jump (REG_BR_PROB_BASE * 20 / 100); + } + } + + /* Emit code to decide on runtime whether library call or inline should be + used. */ + if (dynamic_check != -1) + { + if (!issetmem && CONST_INT_P (count_exp)) + { + if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check) + { + emit_block_copy_via_libcall (dst, src, count_exp); + count_exp = const0_rtx; + goto epilogue; + } + } + else + { + rtx_code_label *hot_label = gen_label_rtx (); + if (jump_around_label == NULL_RTX) + jump_around_label = gen_label_rtx (); + emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1), + LEU, 0, counter_mode (count_exp), + 1, hot_label); + predict_jump (REG_BR_PROB_BASE * 90 / 100); + if (issetmem) + set_storage_via_libcall (dst, count_exp, val_exp); + else + emit_block_copy_via_libcall (dst, src, count_exp); + emit_jump (jump_around_label); + emit_label (hot_label); + } + } + + /* Step 2: Alignment prologue. */ + /* Do the expensive promotion once we branched off the small blocks. */ + if (issetmem && !promoted_val) + promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, + desired_align, align); + + if (desired_align > align && !misaligned_prologue_used) + { + if (align_bytes == 0) + { + /* Except for the first move in prologue, we no longer know + constant offset in aliasing info. It don't seems to worth + the pain to maintain it for the first move, so throw away + the info early. */ + dst = change_address (dst, BLKmode, destreg); + if (!issetmem) + src = change_address (src, BLKmode, srcreg); + dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg, + promoted_val, vec_promoted_val, + count_exp, align, desired_align, + issetmem); + /* At most desired_align - align bytes are copied. */ + if (min_size < (unsigned)(desired_align - align)) + min_size = 0; + else + min_size -= desired_align - align; + } + else + { + /* If we know how many bytes need to be stored before dst is + sufficiently aligned, maintain aliasing info accurately. */ + dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg, + srcreg, + promoted_val, + vec_promoted_val, + desired_align, + align_bytes, + issetmem); + + count_exp = plus_constant (counter_mode (count_exp), + count_exp, -align_bytes); + count -= align_bytes; + min_size -= align_bytes; + max_size -= align_bytes; + } + if (need_zero_guard + && min_size < (unsigned HOST_WIDE_INT) size_needed + && (count < (unsigned HOST_WIDE_INT) size_needed + || (align_bytes == 0 + && count < ((unsigned HOST_WIDE_INT) size_needed + + desired_align - align)))) + { + /* It is possible that we copied enough so the main loop will not + execute. 
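+	     The alignment prologue may have consumed up to DESIRED_ALIGN -
+	     ALIGN bytes, so the remaining count can have dropped below
+	     SIZE_NEEDED; re-check it here and branch around the main loop
+	     in that case.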
*/ + gcc_assert (size_needed > 1); + if (label == NULL_RTX) + label = gen_label_rtx (); + emit_cmp_and_jump_insns (count_exp, + GEN_INT (size_needed), + LTU, 0, counter_mode (count_exp), 1, label); + if (expected_size == -1 + || expected_size < (desired_align - align) / 2 + size_needed) + predict_jump (REG_BR_PROB_BASE * 20 / 100); + else + predict_jump (REG_BR_PROB_BASE * 60 / 100); + } + } + if (label && size_needed == 1) + { + emit_label (label); + LABEL_NUSES (label) = 1; + label = NULL; + epilogue_size_needed = 1; + if (issetmem) + promoted_val = val_exp; + } + else if (label == NULL_RTX && !misaligned_prologue_used) + epilogue_size_needed = size_needed; + + /* Step 3: Main loop. */ + + switch (alg) + { + case libcall: + case no_stringop: + case last_alg: + gcc_unreachable (); + case loop_1_byte: + case loop: + case unrolled_loop: + expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val, + count_exp, move_mode, unroll_factor, + expected_size, issetmem); + break; + case vector_loop: + expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, + vec_promoted_val, count_exp, move_mode, + unroll_factor, expected_size, issetmem); + break; + case rep_prefix_8_byte: + case rep_prefix_4_byte: + case rep_prefix_1_byte: + expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val, + val_exp, count_exp, move_mode, issetmem); + break; + } + /* Adjust properly the offset of src and dest memory for aliasing. */ + if (CONST_INT_P (count_exp)) + { + if (!issetmem) + src = adjust_automodify_address_nv (src, BLKmode, srcreg, + (count / size_needed) * size_needed); + dst = adjust_automodify_address_nv (dst, BLKmode, destreg, + (count / size_needed) * size_needed); + } + else + { + if (!issetmem) + src = change_address (src, BLKmode, srcreg); + dst = change_address (dst, BLKmode, destreg); + } + + /* Step 4: Epilogue to copy the remaining bytes. */ + epilogue: + if (label) + { + /* When the main loop is done, COUNT_EXP might hold original count, + while we want to copy only COUNT_EXP & SIZE_NEEDED bytes. + Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED + bytes. Compensate if needed. */ + + if (size_needed < epilogue_size_needed) + { + tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp, + GEN_INT (size_needed - 1), count_exp, 1, + OPTAB_DIRECT); + if (tmp != count_exp) + emit_move_insn (count_exp, tmp); + } + emit_label (label); + LABEL_NUSES (label) = 1; + } + + if (count_exp != const0_rtx && epilogue_size_needed > 1) + { + if (force_loopy_epilogue) + expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp, + epilogue_size_needed); + else + { + if (issetmem) + expand_setmem_epilogue (dst, destreg, promoted_val, + vec_promoted_val, count_exp, + epilogue_size_needed); + else + expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp, + epilogue_size_needed); + } + } + if (jump_around_label) + emit_label (jump_around_label); + return true; +} + + +/* Expand the appropriate insns for doing strlen if not just doing + repnz; scasb + + out = result, initialized with the start address + align_rtx = alignment of the address. + scratch = scratch register, initialized with the startaddress when + not aligned, otherwise undefined + + This is just the body. It needs the initializations mentioned above and + some address computing at the end. These things are done in i386.md. 
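+
+   At the C level the emitted code computes something close to the sketch
+   below.  The names are invented and the sketch is only an illustration:
+   the real expansion does not re-scan the final word byte by byte, it
+   derives the position of the zero byte from the bit mask (as the code
+   after the loop shows), and it relies on the fact that an aligned 4-byte
+   load cannot cross a page boundary.
+
+     #include <stddef.h>
+     #include <stdint.h>
+     #include <string.h>
+
+     static size_t
+     strlen_sketch (const char *s)
+     {
+       const char *p = s;
+       unsigned int v;
+       while (((uintptr_t) p & 3) != 0)   // check 1..3 bytes until the
+         {                                // pointer is 4-byte aligned
+           if (*p == 0)
+             return (size_t) (p - s);
+           p++;
+         }
+       do                                 // main loop, 4 bytes at a time
+         {
+           memcpy (&v, p, 4);
+           p += 4;
+         }
+       while (((v - 0x01010101U) & ~v & 0x80808080U) == 0);
+       p -= 4;                            // back up into the word that
+       while (*p != 0)                    // contains the zero byte and
+         p++;                             // locate it
+       return (size_t) (p - s);
+     }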
*/ + +static void +ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx) +{ + int align; + rtx tmp; + rtx_code_label *align_2_label = NULL; + rtx_code_label *align_3_label = NULL; + rtx_code_label *align_4_label = gen_label_rtx (); + rtx_code_label *end_0_label = gen_label_rtx (); + rtx mem; + rtx tmpreg = gen_reg_rtx (SImode); + rtx scratch = gen_reg_rtx (SImode); + rtx cmp; + + align = 0; + if (CONST_INT_P (align_rtx)) + align = INTVAL (align_rtx); + + /* Loop to check 1..3 bytes for null to get an aligned pointer. */ + + /* Is there a known alignment and is it less than 4? */ + if (align < 4) + { + rtx scratch1 = gen_reg_rtx (Pmode); + emit_move_insn (scratch1, out); + /* Is there a known alignment and is it not 2? */ + if (align != 2) + { + align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */ + align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */ + + /* Leave just the 3 lower bits. */ + align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3), + NULL_RTX, 0, OPTAB_WIDEN); + + emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL, + Pmode, 1, align_4_label); + emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL, + Pmode, 1, align_2_label); + emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL, + Pmode, 1, align_3_label); + } + else + { + /* Since the alignment is 2, we have to check 2 or 0 bytes; + check if is aligned to 4 - byte. */ + + align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx, + NULL_RTX, 0, OPTAB_WIDEN); + + emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL, + Pmode, 1, align_4_label); + } + + mem = change_address (src, QImode, out); + + /* Now compare the bytes. */ + + /* Compare the first n unaligned byte on a byte per byte basis. */ + emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, + QImode, 1, end_0_label); + + /* Increment the address. */ + emit_insn (ix86_gen_add3 (out, out, const1_rtx)); + + /* Not needed with an alignment of 2 */ + if (align != 2) + { + emit_label (align_2_label); + + emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1, + end_0_label); + + emit_insn (ix86_gen_add3 (out, out, const1_rtx)); + + emit_label (align_3_label); + } + + emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1, + end_0_label); + + emit_insn (ix86_gen_add3 (out, out, const1_rtx)); + } + + /* Generate loop to check 4 bytes at a time. It is not a good idea to + align this loop. It gives only huge programs, but does not help to + speed up. */ + emit_label (align_4_label); + + mem = change_address (src, SImode, out); + emit_move_insn (scratch, mem); + emit_insn (ix86_gen_add3 (out, out, GEN_INT (4))); + + /* This formula yields a nonzero result iff one of the bytes is zero. + This saves three branches inside loop and many cycles. */ + + emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101))); + emit_insn (gen_one_cmplsi2 (scratch, scratch)); + emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch)); + emit_insn (gen_andsi3 (tmpreg, tmpreg, + gen_int_mode (0x80808080, SImode))); + emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1, + align_4_label); + + if (TARGET_CMOVE) + { + rtx reg = gen_reg_rtx (SImode); + rtx reg2 = gen_reg_rtx (Pmode); + emit_move_insn (reg, tmpreg); + emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16))); + + /* If zero is not in the first two bytes, move two bytes forward. 
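+
+     In C terms this step (the cmove sequence emitted just below, or the
+     branching variant in the else arm) narrows the zero byte's position
+     down from the mask computed by the formula above, roughly as in the
+     following illustrative helper (little-endian byte numbering; mask bits
+     above the first zero byte may be spurious, which is why the test
+     starts from the low end):
+
+       static int
+       first_zero_byte (unsigned int mask)
+       {
+         int i = 0;
+         if ((mask & 0x8080U) == 0)   // not in the two low bytes
+           {
+             mask >>= 16;
+             i += 2;
+           }
+         if ((mask & 0x80U) == 0)     // not the lower byte of the
+           i += 1;                    // remaining pair either
+         return i;
+       }
+
+     The trailing "Avoid branch in fixing the byte" sequence then applies
+     the final 3-or-4 byte correction to OUT (which at this point is 4
+     bytes past the start of the pair holding the zero) with an sbb instead
+     of another branch.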
*/ + emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080))); + tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); + tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx); + emit_insn (gen_rtx_SET (tmpreg, + gen_rtx_IF_THEN_ELSE (SImode, tmp, + reg, + tmpreg))); + /* Emit lea manually to avoid clobbering of flags. */ + emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx))); + + tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); + tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx); + emit_insn (gen_rtx_SET (out, + gen_rtx_IF_THEN_ELSE (Pmode, tmp, + reg2, + out))); + } + else + { + rtx_code_label *end_2_label = gen_label_rtx (); + /* Is zero in the first two bytes? */ + + emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080))); + tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); + tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx); + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, + gen_rtx_LABEL_REF (VOIDmode, end_2_label), + pc_rtx); + tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); + JUMP_LABEL (tmp) = end_2_label; + + /* Not in the first two. Move two bytes forward. */ + emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16))); + emit_insn (ix86_gen_add3 (out, out, const2_rtx)); + + emit_label (end_2_label); + + } + + /* Avoid branch in fixing the byte. */ + tmpreg = gen_lowpart (QImode, tmpreg); + emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg)); + tmp = gen_rtx_REG (CCmode, FLAGS_REG); + cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx); + emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp)); + + emit_label (end_0_label); +} + +/* Expand strlen. */ + +bool +ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align) +{ +if (TARGET_UNROLL_STRLEN + && TARGET_INLINE_ALL_STRINGOPS + && eoschar == const0_rtx + && optimize > 1) + { + /* The generic case of strlen expander is long. Avoid it's + expanding unless TARGET_INLINE_ALL_STRINGOPS. */ + rtx addr = force_reg (Pmode, XEXP (src, 0)); + /* Well it seems that some optimizer does not combine a call like + foo(strlen(bar), strlen(bar)); + when the move and the subtraction is done here. It does calculate + the length just once when these instructions are done inside of + output_strlen_unroll(). But I think since &bar[strlen(bar)] is + often used and I use one fewer register for the lifetime of + output_strlen_unroll() this is better. */ + + emit_move_insn (out, addr); + + ix86_expand_strlensi_unroll_1 (out, src, align); + + /* strlensi_unroll_1 returns the address of the zero at the end of + the string, like memchr(), so compute the length by subtracting + the start address. */ + emit_insn (ix86_gen_sub3 (out, out, addr)); + return true; + } + else + return false; +} + +/* For given symbol (function) construct code to compute address of it's PLT + entry in large x86-64 PIC model. */ + +static rtx +construct_plt_address (rtx symbol) +{ + rtx tmp, unspec; + + gcc_assert (GET_CODE (symbol) == SYMBOL_REF); + gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF); + gcc_assert (Pmode == DImode); + + tmp = gen_reg_rtx (Pmode); + unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF); + + emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec)); + emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx)); + return tmp; +} + +/* Additional registers that are clobbered by SYSV calls. 
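+   (that is, registers which are call-clobbered in the System V AMD64 ABI
+   but call-saved in the Microsoft x64 ABI -- %rsi, %rdi and %xmm6-%xmm15 --
+   so a function using the MS ABI must assume a sysv_abi callee clobbers
+   them).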
*/ + +static int const x86_64_ms_sysv_extra_clobbered_registers + [NUM_X86_64_MS_CLOBBERED_REGS] = +{ + SI_REG, DI_REG, + XMM6_REG, XMM7_REG, + XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG, + XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG +}; + +rtx_insn * +ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1, + rtx callarg2, + rtx pop, bool sibcall) +{ + rtx vec[3]; + rtx use = NULL, call; + unsigned int vec_len = 0; + tree fndecl; + + if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) + { + fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0)); + if (fndecl + && (lookup_attribute ("interrupt", + TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))) + error ("interrupt service routine can%'t be called directly"); + } + else + fndecl = NULL_TREE; + + if (pop == const0_rtx) + pop = NULL; + gcc_assert (!TARGET_64BIT || !pop); + + if (TARGET_MACHO && !TARGET_64BIT) + { +#if TARGET_MACHO + if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) + fnaddr = machopic_indirect_call_target (fnaddr); +#endif + } + else + { + /* Static functions and indirect calls don't need the pic register. Also, + check if PLT was explicitly avoided via no-plt or "noplt" attribute, making + it an indirect call. */ + rtx addr = XEXP (fnaddr, 0); + if (flag_pic + && GET_CODE (addr) == SYMBOL_REF + && !SYMBOL_REF_LOCAL_P (addr)) + { + if (flag_plt + && (SYMBOL_REF_DECL (addr) == NULL_TREE + || !lookup_attribute ("noplt", + DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr))))) + { + if (!TARGET_64BIT + || (ix86_cmodel == CM_LARGE_PIC + && DEFAULT_ABI != MS_ABI)) + { + use_reg (&use, gen_rtx_REG (Pmode, + REAL_PIC_OFFSET_TABLE_REGNUM)); + if (ix86_use_pseudo_pic_reg ()) + emit_move_insn (gen_rtx_REG (Pmode, + REAL_PIC_OFFSET_TABLE_REGNUM), + pic_offset_table_rtx); + } + } + else if (!TARGET_PECOFF && !TARGET_MACHO) + { + if (TARGET_64BIT) + { + fnaddr = gen_rtx_UNSPEC (Pmode, + gen_rtvec (1, addr), + UNSPEC_GOTPCREL); + fnaddr = gen_rtx_CONST (Pmode, fnaddr); + } + else + { + fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), + UNSPEC_GOT); + fnaddr = gen_rtx_CONST (Pmode, fnaddr); + fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, + fnaddr); + } + fnaddr = gen_const_mem (Pmode, fnaddr); + /* Pmode may not be the same as word_mode for x32, which + doesn't support indirect branch via 32-bit memory slot. + Since x32 GOT slot is 64 bit with zero upper 32 bits, + indirect branch via x32 GOT slot is OK. */ + if (GET_MODE (fnaddr) != word_mode) + fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr); + fnaddr = gen_rtx_MEM (QImode, fnaddr); + } + } + } + + /* Skip setting up RAX register for -mskip-rax-setup when there are no + parameters passed in vector registers. */ + if (TARGET_64BIT + && (INTVAL (callarg2) > 0 + || (INTVAL (callarg2) == 0 + && (TARGET_SSE || !flag_skip_rax_setup)))) + { + rtx al = gen_rtx_REG (QImode, AX_REG); + emit_move_insn (al, callarg2); + use_reg (&use, al); + } + + if (ix86_cmodel == CM_LARGE_PIC + && !TARGET_PECOFF + && MEM_P (fnaddr) + && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF + && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode)) + fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0))); + /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect + branch via x32 GOT slot is OK. */ + else if (!(TARGET_X32 + && MEM_P (fnaddr) + && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND + && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode)) + && (sibcall + ? 
!sibcall_insn_operand (XEXP (fnaddr, 0), word_mode) + : !call_insn_operand (XEXP (fnaddr, 0), word_mode))) + { + fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1); + fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr)); + } + + call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1); + + if (retval) + call = gen_rtx_SET (retval, call); + vec[vec_len++] = call; + + if (pop) + { + pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop); + pop = gen_rtx_SET (stack_pointer_rtx, pop); + vec[vec_len++] = pop; + } + + if (cfun->machine->no_caller_saved_registers + && (!fndecl + || (!TREE_THIS_VOLATILE (fndecl) + && !lookup_attribute ("no_caller_saved_registers", + TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))) + { + static const char ix86_call_used_regs[] = CALL_USED_REGISTERS; + bool is_64bit_ms_abi = (TARGET_64BIT + && ix86_function_abi (fndecl) == MS_ABI); + char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi); + + /* If there are no caller-saved registers, add all registers + that are clobbered by the call which returns. */ + for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (!fixed_regs[i] + && (ix86_call_used_regs[i] == 1 + || (ix86_call_used_regs[i] & c_mask)) + && !STACK_REGNO_P (i) + && !MMX_REGNO_P (i)) + clobber_reg (&use, + gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i)); + } + else if (TARGET_64BIT_MS_ABI + && (!callarg2 || INTVAL (callarg2) != -2)) + { + unsigned i; + + for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++) + { + int regno = x86_64_ms_sysv_extra_clobbered_registers[i]; + machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode; + + clobber_reg (&use, gen_rtx_REG (mode, regno)); + } + + /* Set here, but it may get cleared later. */ + if (TARGET_CALL_MS2SYSV_XLOGUES) + { + if (!TARGET_SSE) + ; + + /* Don't break hot-patched functions. */ + else if (ix86_function_ms_hook_prologue (current_function_decl)) + ; + + /* TODO: Cases not yet examined. */ + else if (flag_split_stack) + warn_once_call_ms2sysv_xlogues ("-fsplit-stack"); + + else + { + gcc_assert (!reload_completed); + cfun->machine->call_ms2sysv = true; + } + } + } + + if (vec_len > 1) + call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec)); + rtx_insn *call_insn = emit_call_insn (call); + if (use) + CALL_INSN_FUNCTION_USAGE (call_insn) = use; + + return call_insn; +} + +/* Split simple return with popping POPC bytes from stack to indirect + branch with stack adjustment . */ + +void +ix86_split_simple_return_pop_internal (rtx popc) +{ + struct machine_function *m = cfun->machine; + rtx ecx = gen_rtx_REG (SImode, CX_REG); + rtx_insn *insn; + + /* There is no "pascal" calling convention in any 64bit ABI. */ + gcc_assert (!TARGET_64BIT); + + insn = emit_insn (gen_pop (ecx)); + m->fs.cfa_offset -= UNITS_PER_WORD; + m->fs.sp_offset -= UNITS_PER_WORD; + + rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); + x = gen_rtx_SET (stack_pointer_rtx, x); + add_reg_note (insn, REG_CFA_ADJUST_CFA, x); + add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx)); + RTX_FRAME_RELATED_P (insn) = 1; + + x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc); + x = gen_rtx_SET (stack_pointer_rtx, x); + insn = emit_insn (x); + add_reg_note (insn, REG_CFA_ADJUST_CFA, x); + RTX_FRAME_RELATED_P (insn) = 1; + + /* Now return address is in ECX. */ + emit_jump_insn (gen_simple_return_indirect_internal (ecx)); +} + +/* Errors in the source file can cause expand_expr to return const0_rtx + where we expect a vector. To avoid crashing, use one of the vector + clear instructions. 
*/ + +static rtx +safe_vector_operand (rtx x, machine_mode mode) +{ + if (x == const0_rtx) + x = CONST0_RTX (mode); + return x; +} + +/* Subroutine of ix86_expand_builtin to take care of binop insns. */ + +static rtx +ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + machine_mode tmode = insn_data[icode].operand[0].mode; + machine_mode mode0 = insn_data[icode].operand[1].mode; + machine_mode mode1 = insn_data[icode].operand[2].mode; + + if (VECTOR_MODE_P (mode0)) + op0 = safe_vector_operand (op0, mode0); + if (VECTOR_MODE_P (mode1)) + op1 = safe_vector_operand (op1, mode1); + + if (optimize || !target + || GET_MODE (target) != tmode + || !insn_data[icode].operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + + if (GET_MODE (op1) == SImode && mode1 == TImode) + { + rtx x = gen_reg_rtx (V4SImode); + emit_insn (gen_sse2_loadd (x, op1)); + op1 = gen_lowpart (TImode, x); + } + + if (!insn_data[icode].operand[1].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if (!insn_data[icode].operand[2].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + + pat = GEN_FCN (icode) (target, op0, op1); + if (! pat) + return 0; + + emit_insn (pat); + + return target; +} + +/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */ + +static rtx +ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target, + enum ix86_builtin_func_type m_type, + enum rtx_code sub_code) +{ + rtx pat; + int i; + int nargs; + bool comparison_p = false; + bool tf_p = false; + bool last_arg_constant = false; + int num_memory = 0; + struct { + rtx op; + machine_mode mode; + } args[4]; + + machine_mode tmode = insn_data[icode].operand[0].mode; + + switch (m_type) + { + case MULTI_ARG_4_DF2_DI_I: + case MULTI_ARG_4_DF2_DI_I1: + case MULTI_ARG_4_SF2_SI_I: + case MULTI_ARG_4_SF2_SI_I1: + nargs = 4; + last_arg_constant = true; + break; + + case MULTI_ARG_3_SF: + case MULTI_ARG_3_DF: + case MULTI_ARG_3_SF2: + case MULTI_ARG_3_DF2: + case MULTI_ARG_3_DI: + case MULTI_ARG_3_SI: + case MULTI_ARG_3_SI_DI: + case MULTI_ARG_3_HI: + case MULTI_ARG_3_HI_SI: + case MULTI_ARG_3_QI: + case MULTI_ARG_3_DI2: + case MULTI_ARG_3_SI2: + case MULTI_ARG_3_HI2: + case MULTI_ARG_3_QI2: + nargs = 3; + break; + + case MULTI_ARG_2_SF: + case MULTI_ARG_2_DF: + case MULTI_ARG_2_DI: + case MULTI_ARG_2_SI: + case MULTI_ARG_2_HI: + case MULTI_ARG_2_QI: + nargs = 2; + break; + + case MULTI_ARG_2_DI_IMM: + case MULTI_ARG_2_SI_IMM: + case MULTI_ARG_2_HI_IMM: + case MULTI_ARG_2_QI_IMM: + nargs = 2; + last_arg_constant = true; + break; + + case MULTI_ARG_1_SF: + case MULTI_ARG_1_DF: + case MULTI_ARG_1_SF2: + case MULTI_ARG_1_DF2: + case MULTI_ARG_1_DI: + case MULTI_ARG_1_SI: + case MULTI_ARG_1_HI: + case MULTI_ARG_1_QI: + case MULTI_ARG_1_SI_DI: + case MULTI_ARG_1_HI_DI: + case MULTI_ARG_1_HI_SI: + case MULTI_ARG_1_QI_DI: + case MULTI_ARG_1_QI_SI: + case MULTI_ARG_1_QI_HI: + nargs = 1; + break; + + case MULTI_ARG_2_DI_CMP: + case MULTI_ARG_2_SI_CMP: + case MULTI_ARG_2_HI_CMP: + case MULTI_ARG_2_QI_CMP: + nargs = 2; + comparison_p = true; + break; + + case MULTI_ARG_2_SF_TF: + case MULTI_ARG_2_DF_TF: + case MULTI_ARG_2_DI_TF: + case MULTI_ARG_2_SI_TF: + case MULTI_ARG_2_HI_TF: + case MULTI_ARG_2_QI_TF: + nargs = 2; + tf_p = true; + break; + + default: + gcc_unreachable (); + } + + if (optimize || !target + || GET_MODE 
(target) != tmode + || !insn_data[icode].operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + else if (memory_operand (target, tmode)) + num_memory++; + + gcc_assert (nargs <= 4); + + for (i = 0; i < nargs; i++) + { + tree arg = CALL_EXPR_ARG (exp, i); + rtx op = expand_normal (arg); + int adjust = (comparison_p) ? 1 : 0; + machine_mode mode = insn_data[icode].operand[i+adjust+1].mode; + + if (last_arg_constant && i == nargs - 1) + { + if (!insn_data[icode].operand[i + 1].predicate (op, mode)) + { + enum insn_code new_icode = icode; + switch (icode) + { + case CODE_FOR_xop_vpermil2v2df3: + case CODE_FOR_xop_vpermil2v4sf3: + case CODE_FOR_xop_vpermil2v4df3: + case CODE_FOR_xop_vpermil2v8sf3: + error ("the last argument must be a 2-bit immediate"); + return gen_reg_rtx (tmode); + case CODE_FOR_xop_rotlv2di3: + new_icode = CODE_FOR_rotlv2di3; + goto xop_rotl; + case CODE_FOR_xop_rotlv4si3: + new_icode = CODE_FOR_rotlv4si3; + goto xop_rotl; + case CODE_FOR_xop_rotlv8hi3: + new_icode = CODE_FOR_rotlv8hi3; + goto xop_rotl; + case CODE_FOR_xop_rotlv16qi3: + new_icode = CODE_FOR_rotlv16qi3; + xop_rotl: + if (CONST_INT_P (op)) + { + int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1; + op = GEN_INT (INTVAL (op) & mask); + gcc_checking_assert + (insn_data[icode].operand[i + 1].predicate (op, mode)); + } + else + { + gcc_checking_assert + (nargs == 2 + && insn_data[new_icode].operand[0].mode == tmode + && insn_data[new_icode].operand[1].mode == tmode + && insn_data[new_icode].operand[2].mode == mode + && insn_data[new_icode].operand[0].predicate + == insn_data[icode].operand[0].predicate + && insn_data[new_icode].operand[1].predicate + == insn_data[icode].operand[1].predicate); + icode = new_icode; + goto non_constant; + } + break; + default: + gcc_unreachable (); + } + } + } + else + { + non_constant: + if (VECTOR_MODE_P (mode)) + op = safe_vector_operand (op, mode); + + /* If we aren't optimizing, only allow one memory operand to be + generated. */ + if (memory_operand (op, mode)) + num_memory++; + + gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode); + + if (optimize + || !insn_data[icode].operand[i+adjust+1].predicate (op, mode) + || num_memory > 1) + op = force_reg (mode, op); + } + + args[i].op = op; + args[i].mode = mode; + } + + switch (nargs) + { + case 1: + pat = GEN_FCN (icode) (target, args[0].op); + break; + + case 2: + if (tf_p) + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, + GEN_INT ((int)sub_code)); + else if (! comparison_p) + pat = GEN_FCN (icode) (target, args[0].op, args[1].op); + else + { + rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target), + args[0].op, + args[1].op); + + pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op); + } + break; + + case 3: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op); + break; + + case 4: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op); + break; + + default: + gcc_unreachable (); + } + + if (! pat) + return 0; + + emit_insn (pat); + return target; +} + +/* Subroutine of ix86_expand_args_builtin to take care of scalar unop + insns with vec_merge. 
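+   Such patterns compute the operation only in element 0 and take the
+   remaining elements unchanged from the source operand (the sqrtss/rcpss
+   style of builtin), e.g. for a 4-float input V the result is
+
+     { op (V[0]), V[1], V[2], V[3] }
+
+   which is why the same value is passed for both input operands below.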
*/ + +static rtx +ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp, + rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + rtx op1, op0 = expand_normal (arg0); + machine_mode tmode = insn_data[icode].operand[0].mode; + machine_mode mode0 = insn_data[icode].operand[1].mode; + + if (optimize || !target + || GET_MODE (target) != tmode + || !insn_data[icode].operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + + if (VECTOR_MODE_P (mode0)) + op0 = safe_vector_operand (op0, mode0); + + if ((optimize && !register_operand (op0, mode0)) + || !insn_data[icode].operand[1].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + + op1 = op0; + if (!insn_data[icode].operand[2].predicate (op1, mode0)) + op1 = copy_to_mode_reg (mode0, op1); + + pat = GEN_FCN (icode) (target, op0, op1); + if (! pat) + return 0; + emit_insn (pat); + return target; +} + +/* Subroutine of ix86_expand_builtin to take care of comparison insns. */ + +static rtx +ix86_expand_sse_compare (const struct builtin_description *d, + tree exp, rtx target, bool swap) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + rtx op2; + machine_mode tmode = insn_data[d->icode].operand[0].mode; + machine_mode mode0 = insn_data[d->icode].operand[1].mode; + machine_mode mode1 = insn_data[d->icode].operand[2].mode; + enum rtx_code comparison = d->comparison; + + if (VECTOR_MODE_P (mode0)) + op0 = safe_vector_operand (op0, mode0); + if (VECTOR_MODE_P (mode1)) + op1 = safe_vector_operand (op1, mode1); + + /* Swap operands if we have a comparison that isn't available in + hardware. */ + if (swap) + std::swap (op0, op1); + + if (optimize || !target + || GET_MODE (target) != tmode + || !insn_data[d->icode].operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + + if ((optimize && !register_operand (op0, mode0)) + || !insn_data[d->icode].operand[1].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if ((optimize && !register_operand (op1, mode1)) + || !insn_data[d->icode].operand[2].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + + op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1); + pat = GEN_FCN (d->icode) (target, op0, op1, op2); + if (! pat) + return 0; + emit_insn (pat); + return target; +} + +/* Subroutine of ix86_expand_builtin to take care of comi insns. */ + +static rtx +ix86_expand_sse_comi (const struct builtin_description *d, tree exp, + rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + machine_mode mode0 = insn_data[d->icode].operand[0].mode; + machine_mode mode1 = insn_data[d->icode].operand[1].mode; + enum rtx_code comparison = d->comparison; + + if (VECTOR_MODE_P (mode0)) + op0 = safe_vector_operand (op0, mode0); + if (VECTOR_MODE_P (mode1)) + op1 = safe_vector_operand (op1, mode1); + + /* Swap operands if we have a comparison that isn't available in + hardware. 
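+     For instance, a greater-than or greater-or-equal test can be produced
+     from the corresponding less-than or less-or-equal comparison with the
+     operands exchanged, so only one direction needs a hardware pattern.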
*/ + if (d->flag & BUILTIN_DESC_SWAP_OPERANDS) + std::swap (op0, op1); + + target = gen_reg_rtx (SImode); + emit_move_insn (target, const0_rtx); + target = gen_rtx_SUBREG (QImode, target, 0); + + if ((optimize && !register_operand (op0, mode0)) + || !insn_data[d->icode].operand[0].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if ((optimize && !register_operand (op1, mode1)) + || !insn_data[d->icode].operand[1].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + + pat = GEN_FCN (d->icode) (op0, op1); + if (! pat) + return 0; + emit_insn (pat); + emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), + gen_rtx_fmt_ee (comparison, QImode, + SET_DEST (pat), + const0_rtx))); + + return SUBREG_REG (target); +} + +/* Subroutines of ix86_expand_args_builtin to take care of round insns. */ + +static rtx +ix86_expand_sse_round (const struct builtin_description *d, tree exp, + rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + rtx op1, op0 = expand_normal (arg0); + machine_mode tmode = insn_data[d->icode].operand[0].mode; + machine_mode mode0 = insn_data[d->icode].operand[1].mode; + + if (optimize || target == 0 + || GET_MODE (target) != tmode + || !insn_data[d->icode].operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + + if (VECTOR_MODE_P (mode0)) + op0 = safe_vector_operand (op0, mode0); + + if ((optimize && !register_operand (op0, mode0)) + || !insn_data[d->icode].operand[0].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + + op1 = GEN_INT (d->comparison); + + pat = GEN_FCN (d->icode) (target, op0, op1); + if (! pat) + return 0; + emit_insn (pat); + return target; +} + +static rtx +ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d, + tree exp, rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + rtx op2; + machine_mode tmode = insn_data[d->icode].operand[0].mode; + machine_mode mode0 = insn_data[d->icode].operand[1].mode; + machine_mode mode1 = insn_data[d->icode].operand[2].mode; + + if (optimize || target == 0 + || GET_MODE (target) != tmode + || !insn_data[d->icode].operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + + op0 = safe_vector_operand (op0, mode0); + op1 = safe_vector_operand (op1, mode1); + + if ((optimize && !register_operand (op0, mode0)) + || !insn_data[d->icode].operand[0].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if ((optimize && !register_operand (op1, mode1)) + || !insn_data[d->icode].operand[1].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + + op2 = GEN_INT (d->comparison); + + pat = GEN_FCN (d->icode) (target, op0, op1, op2); + if (! pat) + return 0; + emit_insn (pat); + return target; +} + +/* Subroutine of ix86_expand_builtin to take care of ptest insns. 
*/ + +static rtx +ix86_expand_sse_ptest (const struct builtin_description *d, tree exp, + rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + machine_mode mode0 = insn_data[d->icode].operand[0].mode; + machine_mode mode1 = insn_data[d->icode].operand[1].mode; + enum rtx_code comparison = d->comparison; + + if (VECTOR_MODE_P (mode0)) + op0 = safe_vector_operand (op0, mode0); + if (VECTOR_MODE_P (mode1)) + op1 = safe_vector_operand (op1, mode1); + + target = gen_reg_rtx (SImode); + emit_move_insn (target, const0_rtx); + target = gen_rtx_SUBREG (QImode, target, 0); + + if ((optimize && !register_operand (op0, mode0)) + || !insn_data[d->icode].operand[0].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if ((optimize && !register_operand (op1, mode1)) + || !insn_data[d->icode].operand[1].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + + pat = GEN_FCN (d->icode) (op0, op1); + if (! pat) + return 0; + emit_insn (pat); + emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), + gen_rtx_fmt_ee (comparison, QImode, + SET_DEST (pat), + const0_rtx))); + + return SUBREG_REG (target); +} + +/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */ + +static rtx +ix86_expand_sse_pcmpestr (const struct builtin_description *d, + tree exp, rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + tree arg2 = CALL_EXPR_ARG (exp, 2); + tree arg3 = CALL_EXPR_ARG (exp, 3); + tree arg4 = CALL_EXPR_ARG (exp, 4); + rtx scratch0, scratch1; + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + rtx op2 = expand_normal (arg2); + rtx op3 = expand_normal (arg3); + rtx op4 = expand_normal (arg4); + machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm; + + tmode0 = insn_data[d->icode].operand[0].mode; + tmode1 = insn_data[d->icode].operand[1].mode; + modev2 = insn_data[d->icode].operand[2].mode; + modei3 = insn_data[d->icode].operand[3].mode; + modev4 = insn_data[d->icode].operand[4].mode; + modei5 = insn_data[d->icode].operand[5].mode; + modeimm = insn_data[d->icode].operand[6].mode; + + if (VECTOR_MODE_P (modev2)) + op0 = safe_vector_operand (op0, modev2); + if (VECTOR_MODE_P (modev4)) + op2 = safe_vector_operand (op2, modev4); + + if (!insn_data[d->icode].operand[2].predicate (op0, modev2)) + op0 = copy_to_mode_reg (modev2, op0); + if (!insn_data[d->icode].operand[3].predicate (op1, modei3)) + op1 = copy_to_mode_reg (modei3, op1); + if ((optimize && !register_operand (op2, modev4)) + || !insn_data[d->icode].operand[4].predicate (op2, modev4)) + op2 = copy_to_mode_reg (modev4, op2); + if (!insn_data[d->icode].operand[5].predicate (op3, modei5)) + op3 = copy_to_mode_reg (modei5, op3); + + if (!insn_data[d->icode].operand[6].predicate (op4, modeimm)) + { + error ("the fifth argument must be an 8-bit immediate"); + return const0_rtx; + } + + if (d->code == IX86_BUILTIN_PCMPESTRI128) + { + if (optimize || !target + || GET_MODE (target) != tmode0 + || !insn_data[d->icode].operand[0].predicate (target, tmode0)) + target = gen_reg_rtx (tmode0); + + scratch1 = gen_reg_rtx (tmode1); + + pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4); + } + else if (d->code == IX86_BUILTIN_PCMPESTRM128) + { + if (optimize || !target + || GET_MODE (target) != tmode1 + || !insn_data[d->icode].operand[1].predicate (target, tmode1)) + target = gen_reg_rtx (tmode1); + + 
scratch0 = gen_reg_rtx (tmode0); + + pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4); + } + else + { + gcc_assert (d->flag); + + scratch0 = gen_reg_rtx (tmode0); + scratch1 = gen_reg_rtx (tmode1); + + pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4); + } + + if (! pat) + return 0; + + emit_insn (pat); + + if (d->flag) + { + target = gen_reg_rtx (SImode); + emit_move_insn (target, const0_rtx); + target = gen_rtx_SUBREG (QImode, target, 0); + + emit_insn + (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), + gen_rtx_fmt_ee (EQ, QImode, + gen_rtx_REG ((machine_mode) d->flag, + FLAGS_REG), + const0_rtx))); + return SUBREG_REG (target); + } + else + return target; +} + + +/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */ + +static rtx +ix86_expand_sse_pcmpistr (const struct builtin_description *d, + tree exp, rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + tree arg2 = CALL_EXPR_ARG (exp, 2); + rtx scratch0, scratch1; + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + rtx op2 = expand_normal (arg2); + machine_mode tmode0, tmode1, modev2, modev3, modeimm; + + tmode0 = insn_data[d->icode].operand[0].mode; + tmode1 = insn_data[d->icode].operand[1].mode; + modev2 = insn_data[d->icode].operand[2].mode; + modev3 = insn_data[d->icode].operand[3].mode; + modeimm = insn_data[d->icode].operand[4].mode; + + if (VECTOR_MODE_P (modev2)) + op0 = safe_vector_operand (op0, modev2); + if (VECTOR_MODE_P (modev3)) + op1 = safe_vector_operand (op1, modev3); + + if (!insn_data[d->icode].operand[2].predicate (op0, modev2)) + op0 = copy_to_mode_reg (modev2, op0); + if ((optimize && !register_operand (op1, modev3)) + || !insn_data[d->icode].operand[3].predicate (op1, modev3)) + op1 = copy_to_mode_reg (modev3, op1); + + if (!insn_data[d->icode].operand[4].predicate (op2, modeimm)) + { + error ("the third argument must be an 8-bit immediate"); + return const0_rtx; + } + + if (d->code == IX86_BUILTIN_PCMPISTRI128) + { + if (optimize || !target + || GET_MODE (target) != tmode0 + || !insn_data[d->icode].operand[0].predicate (target, tmode0)) + target = gen_reg_rtx (tmode0); + + scratch1 = gen_reg_rtx (tmode1); + + pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2); + } + else if (d->code == IX86_BUILTIN_PCMPISTRM128) + { + if (optimize || !target + || GET_MODE (target) != tmode1 + || !insn_data[d->icode].operand[1].predicate (target, tmode1)) + target = gen_reg_rtx (tmode1); + + scratch0 = gen_reg_rtx (tmode0); + + pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2); + } + else + { + gcc_assert (d->flag); + + scratch0 = gen_reg_rtx (tmode0); + scratch1 = gen_reg_rtx (tmode1); + + pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2); + } + + if (! pat) + return 0; + + emit_insn (pat); + + if (d->flag) + { + target = gen_reg_rtx (SImode); + emit_move_insn (target, const0_rtx); + target = gen_rtx_SUBREG (QImode, target, 0); + + emit_insn + (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), + gen_rtx_fmt_ee (EQ, QImode, + gen_rtx_REG ((machine_mode) d->flag, + FLAGS_REG), + const0_rtx))); + return SUBREG_REG (target); + } + else + return target; +} + +/* Fixup modeless constants to fit required mode. */ + +static rtx +fixup_modeless_constant (rtx x, machine_mode mode) +{ + if (GET_MODE (x) == VOIDmode) + x = convert_to_mode (mode, x, 1); + return x; +} + +/* Subroutine of ix86_expand_builtin to take care of insns with + variable number of operands. 
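+
+   The ix86_builtin_func_type names encode the signature: the part before
+   _FTYPE_ is the return type and the parts after it are the argument
+   types, with trailing tags for special operands.  For example
+   V8SF_FTYPE_V8SF_V8SI is an 8-float result computed from an 8-float and
+   an 8-int vector; _INT marks an immediate argument (counted by
+   nargs_constant), _COUNT a shift or rotate count, _ROUND and _PTEST
+   shapes are handed off to the dedicated expanders above, and the
+   UQI/UHI/USI/UDI arguments of the AVX-512 shapes are mask operands
+   (mask_pos).  The switch below only classifies the shape; the operands
+   themselves are expanded by the common code after it.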
*/ + +static rtx +ix86_expand_args_builtin (const struct builtin_description *d, + tree exp, rtx target) +{ + rtx pat, real_target; + unsigned int i, nargs; + unsigned int nargs_constant = 0; + unsigned int mask_pos = 0; + int num_memory = 0; + struct + { + rtx op; + machine_mode mode; + } args[6]; + bool second_arg_count = false; + enum insn_code icode = d->icode; + const struct insn_data_d *insn_p = &insn_data[icode]; + machine_mode tmode = insn_p->operand[0].mode; + machine_mode rmode = VOIDmode; + bool swap = false; + enum rtx_code comparison = d->comparison; + + switch ((enum ix86_builtin_func_type) d->flag) + { + case V2DF_FTYPE_V2DF_ROUND: + case V4DF_FTYPE_V4DF_ROUND: + case V8DF_FTYPE_V8DF_ROUND: + case V4SF_FTYPE_V4SF_ROUND: + case V8SF_FTYPE_V8SF_ROUND: + case V16SF_FTYPE_V16SF_ROUND: + case V4SI_FTYPE_V4SF_ROUND: + case V8SI_FTYPE_V8SF_ROUND: + case V16SI_FTYPE_V16SF_ROUND: + return ix86_expand_sse_round (d, exp, target); + case V4SI_FTYPE_V2DF_V2DF_ROUND: + case V8SI_FTYPE_V4DF_V4DF_ROUND: + case V16SI_FTYPE_V8DF_V8DF_ROUND: + return ix86_expand_sse_round_vec_pack_sfix (d, exp, target); + case INT_FTYPE_V8SF_V8SF_PTEST: + case INT_FTYPE_V4DI_V4DI_PTEST: + case INT_FTYPE_V4DF_V4DF_PTEST: + case INT_FTYPE_V4SF_V4SF_PTEST: + case INT_FTYPE_V2DI_V2DI_PTEST: + case INT_FTYPE_V2DF_V2DF_PTEST: + return ix86_expand_sse_ptest (d, exp, target); + case FLOAT128_FTYPE_FLOAT128: + case FLOAT_FTYPE_FLOAT: + case INT_FTYPE_INT: + case UINT_FTYPE_UINT: + case UINT16_FTYPE_UINT16: + case UINT64_FTYPE_INT: + case UINT64_FTYPE_UINT64: + case INT64_FTYPE_INT64: + case INT64_FTYPE_V4SF: + case INT64_FTYPE_V2DF: + case INT_FTYPE_V16QI: + case INT_FTYPE_V8QI: + case INT_FTYPE_V8SF: + case INT_FTYPE_V4DF: + case INT_FTYPE_V4SF: + case INT_FTYPE_V2DF: + case INT_FTYPE_V32QI: + case V16QI_FTYPE_V16QI: + case V8SI_FTYPE_V8SF: + case V8SI_FTYPE_V4SI: + case V8HI_FTYPE_V8HI: + case V8HI_FTYPE_V16QI: + case V8QI_FTYPE_V8QI: + case V8SF_FTYPE_V8SF: + case V8SF_FTYPE_V8SI: + case V8SF_FTYPE_V4SF: + case V8SF_FTYPE_V8HI: + case V4SI_FTYPE_V4SI: + case V4SI_FTYPE_V16QI: + case V4SI_FTYPE_V4SF: + case V4SI_FTYPE_V8SI: + case V4SI_FTYPE_V8HI: + case V4SI_FTYPE_V4DF: + case V4SI_FTYPE_V2DF: + case V4HI_FTYPE_V4HI: + case V4DF_FTYPE_V4DF: + case V4DF_FTYPE_V4SI: + case V4DF_FTYPE_V4SF: + case V4DF_FTYPE_V2DF: + case V4SF_FTYPE_V4SF: + case V4SF_FTYPE_V4SI: + case V4SF_FTYPE_V8SF: + case V4SF_FTYPE_V4DF: + case V4SF_FTYPE_V8HI: + case V4SF_FTYPE_V2DF: + case V2DI_FTYPE_V2DI: + case V2DI_FTYPE_V16QI: + case V2DI_FTYPE_V8HI: + case V2DI_FTYPE_V4SI: + case V2DF_FTYPE_V2DF: + case V2DF_FTYPE_V4SI: + case V2DF_FTYPE_V4DF: + case V2DF_FTYPE_V4SF: + case V2DF_FTYPE_V2SI: + case V2SI_FTYPE_V2SI: + case V2SI_FTYPE_V4SF: + case V2SI_FTYPE_V2SF: + case V2SI_FTYPE_V2DF: + case V2SF_FTYPE_V2SF: + case V2SF_FTYPE_V2SI: + case V32QI_FTYPE_V32QI: + case V32QI_FTYPE_V16QI: + case V16HI_FTYPE_V16HI: + case V16HI_FTYPE_V8HI: + case V8SI_FTYPE_V8SI: + case V16HI_FTYPE_V16QI: + case V8SI_FTYPE_V16QI: + case V4DI_FTYPE_V16QI: + case V8SI_FTYPE_V8HI: + case V4DI_FTYPE_V8HI: + case V4DI_FTYPE_V4SI: + case V4DI_FTYPE_V2DI: + case UQI_FTYPE_UQI: + case UHI_FTYPE_UHI: + case USI_FTYPE_USI: + case USI_FTYPE_UQI: + case USI_FTYPE_UHI: + case UDI_FTYPE_UDI: + case UHI_FTYPE_V16QI: + case USI_FTYPE_V32QI: + case UDI_FTYPE_V64QI: + case V16QI_FTYPE_UHI: + case V32QI_FTYPE_USI: + case V64QI_FTYPE_UDI: + case V8HI_FTYPE_UQI: + case V16HI_FTYPE_UHI: + case V32HI_FTYPE_USI: + case V4SI_FTYPE_UQI: + case V8SI_FTYPE_UQI: + case V4SI_FTYPE_UHI: + case 
V8SI_FTYPE_UHI: + case UQI_FTYPE_V8HI: + case UHI_FTYPE_V16HI: + case USI_FTYPE_V32HI: + case UQI_FTYPE_V4SI: + case UQI_FTYPE_V8SI: + case UHI_FTYPE_V16SI: + case UQI_FTYPE_V2DI: + case UQI_FTYPE_V4DI: + case UQI_FTYPE_V8DI: + case V16SI_FTYPE_UHI: + case V2DI_FTYPE_UQI: + case V4DI_FTYPE_UQI: + case V16SI_FTYPE_INT: + case V16SF_FTYPE_V8SF: + case V16SI_FTYPE_V8SI: + case V16SF_FTYPE_V4SF: + case V16SI_FTYPE_V4SI: + case V16SI_FTYPE_V16SF: + case V16SI_FTYPE_V16SI: + case V64QI_FTYPE_V64QI: + case V32HI_FTYPE_V32HI: + case V16SF_FTYPE_V16SF: + case V8DI_FTYPE_UQI: + case V8DI_FTYPE_V8DI: + case V8DF_FTYPE_V4DF: + case V8DF_FTYPE_V2DF: + case V8DF_FTYPE_V8DF: + case V4DI_FTYPE_V4DI: + nargs = 1; + break; + case V4SF_FTYPE_V4SF_VEC_MERGE: + case V2DF_FTYPE_V2DF_VEC_MERGE: + return ix86_expand_unop_vec_merge_builtin (icode, exp, target); + case FLOAT128_FTYPE_FLOAT128_FLOAT128: + case V16QI_FTYPE_V16QI_V16QI: + case V16QI_FTYPE_V8HI_V8HI: + case V16SF_FTYPE_V16SF_V16SF: + case V8QI_FTYPE_V8QI_V8QI: + case V8QI_FTYPE_V4HI_V4HI: + case V8HI_FTYPE_V8HI_V8HI: + case V8HI_FTYPE_V16QI_V16QI: + case V8HI_FTYPE_V4SI_V4SI: + case V8SF_FTYPE_V8SF_V8SF: + case V8SF_FTYPE_V8SF_V8SI: + case V8DF_FTYPE_V8DF_V8DF: + case V4SI_FTYPE_V4SI_V4SI: + case V4SI_FTYPE_V8HI_V8HI: + case V4SI_FTYPE_V2DF_V2DF: + case V4HI_FTYPE_V4HI_V4HI: + case V4HI_FTYPE_V8QI_V8QI: + case V4HI_FTYPE_V2SI_V2SI: + case V4DF_FTYPE_V4DF_V4DF: + case V4DF_FTYPE_V4DF_V4DI: + case V4SF_FTYPE_V4SF_V4SF: + case V4SF_FTYPE_V4SF_V4SI: + case V4SF_FTYPE_V4SF_V2SI: + case V4SF_FTYPE_V4SF_V2DF: + case V4SF_FTYPE_V4SF_UINT: + case V4SF_FTYPE_V4SF_DI: + case V4SF_FTYPE_V4SF_SI: + case V2DI_FTYPE_V2DI_V2DI: + case V2DI_FTYPE_V16QI_V16QI: + case V2DI_FTYPE_V4SI_V4SI: + case V2DI_FTYPE_V2DI_V16QI: + case V2SI_FTYPE_V2SI_V2SI: + case V2SI_FTYPE_V4HI_V4HI: + case V2SI_FTYPE_V2SF_V2SF: + case V2DF_FTYPE_V2DF_V2DF: + case V2DF_FTYPE_V2DF_V4SF: + case V2DF_FTYPE_V2DF_V2DI: + case V2DF_FTYPE_V2DF_DI: + case V2DF_FTYPE_V2DF_SI: + case V2DF_FTYPE_V2DF_UINT: + case V2SF_FTYPE_V2SF_V2SF: + case V1DI_FTYPE_V1DI_V1DI: + case V1DI_FTYPE_V8QI_V8QI: + case V1DI_FTYPE_V2SI_V2SI: + case V32QI_FTYPE_V16HI_V16HI: + case V16HI_FTYPE_V8SI_V8SI: + case V64QI_FTYPE_V64QI_V64QI: + case V32QI_FTYPE_V32QI_V32QI: + case V16HI_FTYPE_V32QI_V32QI: + case V16HI_FTYPE_V16HI_V16HI: + case V8SI_FTYPE_V4DF_V4DF: + case V8SI_FTYPE_V8SI_V8SI: + case V8SI_FTYPE_V16HI_V16HI: + case V4DI_FTYPE_V4DI_V4DI: + case V4DI_FTYPE_V8SI_V8SI: + case V8DI_FTYPE_V64QI_V64QI: + if (comparison == UNKNOWN) + return ix86_expand_binop_builtin (icode, exp, target); + nargs = 2; + break; + case V4SF_FTYPE_V4SF_V4SF_SWAP: + case V2DF_FTYPE_V2DF_V2DF_SWAP: + gcc_assert (comparison != UNKNOWN); + nargs = 2; + swap = true; + break; + case V16HI_FTYPE_V16HI_V8HI_COUNT: + case V16HI_FTYPE_V16HI_SI_COUNT: + case V8SI_FTYPE_V8SI_V4SI_COUNT: + case V8SI_FTYPE_V8SI_SI_COUNT: + case V4DI_FTYPE_V4DI_V2DI_COUNT: + case V4DI_FTYPE_V4DI_INT_COUNT: + case V8HI_FTYPE_V8HI_V8HI_COUNT: + case V8HI_FTYPE_V8HI_SI_COUNT: + case V4SI_FTYPE_V4SI_V4SI_COUNT: + case V4SI_FTYPE_V4SI_SI_COUNT: + case V4HI_FTYPE_V4HI_V4HI_COUNT: + case V4HI_FTYPE_V4HI_SI_COUNT: + case V2DI_FTYPE_V2DI_V2DI_COUNT: + case V2DI_FTYPE_V2DI_SI_COUNT: + case V2SI_FTYPE_V2SI_V2SI_COUNT: + case V2SI_FTYPE_V2SI_SI_COUNT: + case V1DI_FTYPE_V1DI_V1DI_COUNT: + case V1DI_FTYPE_V1DI_SI_COUNT: + nargs = 2; + second_arg_count = true; + break; + case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT: + case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT: + case 
V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT: + case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT: + case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT: + case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT: + case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT: + case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT: + case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT: + case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT: + case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT: + case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT: + case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT: + case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT: + case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT: + case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT: + case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT: + case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT: + nargs = 4; + second_arg_count = true; + break; + case UINT64_FTYPE_UINT64_UINT64: + case UINT_FTYPE_UINT_UINT: + case UINT_FTYPE_UINT_USHORT: + case UINT_FTYPE_UINT_UCHAR: + case UINT16_FTYPE_UINT16_INT: + case UINT8_FTYPE_UINT8_INT: + case UQI_FTYPE_UQI_UQI: + case UHI_FTYPE_UHI_UHI: + case USI_FTYPE_USI_USI: + case UDI_FTYPE_UDI_UDI: + case V16SI_FTYPE_V8DF_V8DF: + nargs = 2; + break; + case V2DI_FTYPE_V2DI_INT_CONVERT: + nargs = 2; + rmode = V1TImode; + nargs_constant = 1; + break; + case V4DI_FTYPE_V4DI_INT_CONVERT: + nargs = 2; + rmode = V2TImode; + nargs_constant = 1; + break; + case V8DI_FTYPE_V8DI_INT_CONVERT: + nargs = 2; + rmode = V4TImode; + nargs_constant = 1; + break; + case V8HI_FTYPE_V8HI_INT: + case V8HI_FTYPE_V8SF_INT: + case V16HI_FTYPE_V16SF_INT: + case V8HI_FTYPE_V4SF_INT: + case V8SF_FTYPE_V8SF_INT: + case V4SF_FTYPE_V16SF_INT: + case V16SF_FTYPE_V16SF_INT: + case V4SI_FTYPE_V4SI_INT: + case V4SI_FTYPE_V8SI_INT: + case V4HI_FTYPE_V4HI_INT: + case V4DF_FTYPE_V4DF_INT: + case V4DF_FTYPE_V8DF_INT: + case V4SF_FTYPE_V4SF_INT: + case V4SF_FTYPE_V8SF_INT: + case V2DI_FTYPE_V2DI_INT: + case V2DF_FTYPE_V2DF_INT: + case V2DF_FTYPE_V4DF_INT: + case V16HI_FTYPE_V16HI_INT: + case V8SI_FTYPE_V8SI_INT: + case V16SI_FTYPE_V16SI_INT: + case V4SI_FTYPE_V16SI_INT: + case V4DI_FTYPE_V4DI_INT: + case V2DI_FTYPE_V4DI_INT: + case V4DI_FTYPE_V8DI_INT: + case QI_FTYPE_V4SF_INT: + case QI_FTYPE_V2DF_INT: + case UQI_FTYPE_UQI_UQI_CONST: + case UHI_FTYPE_UHI_UQI: + case USI_FTYPE_USI_UQI: + case UDI_FTYPE_UDI_UQI: + nargs = 2; + nargs_constant = 1; + break; + case V16QI_FTYPE_V16QI_V16QI_V16QI: + case V8SF_FTYPE_V8SF_V8SF_V8SF: + case V4DF_FTYPE_V4DF_V4DF_V4DF: + case V4SF_FTYPE_V4SF_V4SF_V4SF: + case V2DF_FTYPE_V2DF_V2DF_V2DF: + case V32QI_FTYPE_V32QI_V32QI_V32QI: + case UHI_FTYPE_V16SI_V16SI_UHI: + case UQI_FTYPE_V8DI_V8DI_UQI: + case V16HI_FTYPE_V16SI_V16HI_UHI: + case V16QI_FTYPE_V16SI_V16QI_UHI: + case V16QI_FTYPE_V8DI_V16QI_UQI: + case V16SF_FTYPE_V16SF_V16SF_UHI: + case V16SF_FTYPE_V4SF_V16SF_UHI: + case V16SI_FTYPE_SI_V16SI_UHI: + case V16SI_FTYPE_V16HI_V16SI_UHI: + case V16SI_FTYPE_V16QI_V16SI_UHI: + case V8SF_FTYPE_V4SF_V8SF_UQI: + case V4DF_FTYPE_V2DF_V4DF_UQI: + case V8SI_FTYPE_V4SI_V8SI_UQI: + case V8SI_FTYPE_SI_V8SI_UQI: + case V4SI_FTYPE_V4SI_V4SI_UQI: + case V4SI_FTYPE_SI_V4SI_UQI: + case V4DI_FTYPE_V2DI_V4DI_UQI: + case V4DI_FTYPE_DI_V4DI_UQI: + case V2DI_FTYPE_V2DI_V2DI_UQI: + case V2DI_FTYPE_DI_V2DI_UQI: + case V64QI_FTYPE_V64QI_V64QI_UDI: + case V64QI_FTYPE_V16QI_V64QI_UDI: + case V64QI_FTYPE_QI_V64QI_UDI: + case V32QI_FTYPE_V32QI_V32QI_USI: + case V32QI_FTYPE_V16QI_V32QI_USI: + case V32QI_FTYPE_QI_V32QI_USI: + case V16QI_FTYPE_V16QI_V16QI_UHI: + case V16QI_FTYPE_QI_V16QI_UHI: + case V32HI_FTYPE_V8HI_V32HI_USI: + case V32HI_FTYPE_HI_V32HI_USI: + case V16HI_FTYPE_V8HI_V16HI_UHI: + case 
V16HI_FTYPE_HI_V16HI_UHI: + case V8HI_FTYPE_V8HI_V8HI_UQI: + case V8HI_FTYPE_HI_V8HI_UQI: + case V8SF_FTYPE_V8HI_V8SF_UQI: + case V4SF_FTYPE_V8HI_V4SF_UQI: + case V8SI_FTYPE_V8SF_V8SI_UQI: + case V4SI_FTYPE_V4SF_V4SI_UQI: + case V4DI_FTYPE_V4SF_V4DI_UQI: + case V2DI_FTYPE_V4SF_V2DI_UQI: + case V4SF_FTYPE_V4DI_V4SF_UQI: + case V4SF_FTYPE_V2DI_V4SF_UQI: + case V4DF_FTYPE_V4DI_V4DF_UQI: + case V2DF_FTYPE_V2DI_V2DF_UQI: + case V16QI_FTYPE_V8HI_V16QI_UQI: + case V16QI_FTYPE_V16HI_V16QI_UHI: + case V16QI_FTYPE_V4SI_V16QI_UQI: + case V16QI_FTYPE_V8SI_V16QI_UQI: + case V8HI_FTYPE_V4SI_V8HI_UQI: + case V8HI_FTYPE_V8SI_V8HI_UQI: + case V16QI_FTYPE_V2DI_V16QI_UQI: + case V16QI_FTYPE_V4DI_V16QI_UQI: + case V8HI_FTYPE_V2DI_V8HI_UQI: + case V8HI_FTYPE_V4DI_V8HI_UQI: + case V4SI_FTYPE_V2DI_V4SI_UQI: + case V4SI_FTYPE_V4DI_V4SI_UQI: + case V32QI_FTYPE_V32HI_V32QI_USI: + case UHI_FTYPE_V16QI_V16QI_UHI: + case USI_FTYPE_V32QI_V32QI_USI: + case UDI_FTYPE_V64QI_V64QI_UDI: + case UQI_FTYPE_V8HI_V8HI_UQI: + case UHI_FTYPE_V16HI_V16HI_UHI: + case USI_FTYPE_V32HI_V32HI_USI: + case UQI_FTYPE_V4SI_V4SI_UQI: + case UQI_FTYPE_V8SI_V8SI_UQI: + case UQI_FTYPE_V2DI_V2DI_UQI: + case UQI_FTYPE_V4DI_V4DI_UQI: + case V4SF_FTYPE_V2DF_V4SF_UQI: + case V4SF_FTYPE_V4DF_V4SF_UQI: + case V16SI_FTYPE_V16SI_V16SI_UHI: + case V16SI_FTYPE_V4SI_V16SI_UHI: + case V2DI_FTYPE_V4SI_V2DI_UQI: + case V2DI_FTYPE_V8HI_V2DI_UQI: + case V2DI_FTYPE_V16QI_V2DI_UQI: + case V4DI_FTYPE_V4DI_V4DI_UQI: + case V4DI_FTYPE_V4SI_V4DI_UQI: + case V4DI_FTYPE_V8HI_V4DI_UQI: + case V4DI_FTYPE_V16QI_V4DI_UQI: + case V4DI_FTYPE_V4DF_V4DI_UQI: + case V2DI_FTYPE_V2DF_V2DI_UQI: + case V4SI_FTYPE_V4DF_V4SI_UQI: + case V4SI_FTYPE_V2DF_V4SI_UQI: + case V4SI_FTYPE_V8HI_V4SI_UQI: + case V4SI_FTYPE_V16QI_V4SI_UQI: + case V4DI_FTYPE_V4DI_V4DI_V4DI: + case V8DF_FTYPE_V2DF_V8DF_UQI: + case V8DF_FTYPE_V4DF_V8DF_UQI: + case V8DF_FTYPE_V8DF_V8DF_UQI: + case V8SF_FTYPE_V8SF_V8SF_UQI: + case V8SF_FTYPE_V8SI_V8SF_UQI: + case V4DF_FTYPE_V4DF_V4DF_UQI: + case V4SF_FTYPE_V4SF_V4SF_UQI: + case V2DF_FTYPE_V2DF_V2DF_UQI: + case V2DF_FTYPE_V4SF_V2DF_UQI: + case V2DF_FTYPE_V4SI_V2DF_UQI: + case V4SF_FTYPE_V4SI_V4SF_UQI: + case V4DF_FTYPE_V4SF_V4DF_UQI: + case V4DF_FTYPE_V4SI_V4DF_UQI: + case V8SI_FTYPE_V8SI_V8SI_UQI: + case V8SI_FTYPE_V8HI_V8SI_UQI: + case V8SI_FTYPE_V16QI_V8SI_UQI: + case V8DF_FTYPE_V8SI_V8DF_UQI: + case V8DI_FTYPE_DI_V8DI_UQI: + case V16SF_FTYPE_V8SF_V16SF_UHI: + case V16SI_FTYPE_V8SI_V16SI_UHI: + case V16HI_FTYPE_V16HI_V16HI_UHI: + case V8HI_FTYPE_V16QI_V8HI_UQI: + case V16HI_FTYPE_V16QI_V16HI_UHI: + case V32HI_FTYPE_V32HI_V32HI_USI: + case V32HI_FTYPE_V32QI_V32HI_USI: + case V8DI_FTYPE_V16QI_V8DI_UQI: + case V8DI_FTYPE_V2DI_V8DI_UQI: + case V8DI_FTYPE_V4DI_V8DI_UQI: + case V8DI_FTYPE_V8DI_V8DI_UQI: + case V8DI_FTYPE_V8HI_V8DI_UQI: + case V8DI_FTYPE_V8SI_V8DI_UQI: + case V8HI_FTYPE_V8DI_V8HI_UQI: + case V8SI_FTYPE_V8DI_V8SI_UQI: + case V4SI_FTYPE_V4SI_V4SI_V4SI: + case V16SI_FTYPE_V16SI_V16SI_V16SI: + case V8DI_FTYPE_V8DI_V8DI_V8DI: + case V32HI_FTYPE_V32HI_V32HI_V32HI: + case V2DI_FTYPE_V2DI_V2DI_V2DI: + case V16HI_FTYPE_V16HI_V16HI_V16HI: + case V8SI_FTYPE_V8SI_V8SI_V8SI: + case V8HI_FTYPE_V8HI_V8HI_V8HI: + nargs = 3; + break; + case V32QI_FTYPE_V32QI_V32QI_INT: + case V16HI_FTYPE_V16HI_V16HI_INT: + case V16QI_FTYPE_V16QI_V16QI_INT: + case V4DI_FTYPE_V4DI_V4DI_INT: + case V8HI_FTYPE_V8HI_V8HI_INT: + case V8SI_FTYPE_V8SI_V8SI_INT: + case V8SI_FTYPE_V8SI_V4SI_INT: + case V8SF_FTYPE_V8SF_V8SF_INT: + case V8SF_FTYPE_V8SF_V4SF_INT: + case V4SI_FTYPE_V4SI_V4SI_INT: + 
case V4DF_FTYPE_V4DF_V4DF_INT: + case V16SF_FTYPE_V16SF_V16SF_INT: + case V16SF_FTYPE_V16SF_V4SF_INT: + case V16SI_FTYPE_V16SI_V4SI_INT: + case V4DF_FTYPE_V4DF_V2DF_INT: + case V4SF_FTYPE_V4SF_V4SF_INT: + case V2DI_FTYPE_V2DI_V2DI_INT: + case V4DI_FTYPE_V4DI_V2DI_INT: + case V2DF_FTYPE_V2DF_V2DF_INT: + case UQI_FTYPE_V8DI_V8UDI_INT: + case UQI_FTYPE_V8DF_V8DF_INT: + case UQI_FTYPE_V2DF_V2DF_INT: + case UQI_FTYPE_V4SF_V4SF_INT: + case UHI_FTYPE_V16SI_V16SI_INT: + case UHI_FTYPE_V16SF_V16SF_INT: + case V64QI_FTYPE_V64QI_V64QI_INT: + case V32HI_FTYPE_V32HI_V32HI_INT: + case V16SI_FTYPE_V16SI_V16SI_INT: + case V8DI_FTYPE_V8DI_V8DI_INT: + nargs = 3; + nargs_constant = 1; + break; + case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT: + nargs = 3; + rmode = V4DImode; + nargs_constant = 1; + break; + case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT: + nargs = 3; + rmode = V2DImode; + nargs_constant = 1; + break; + case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT: + nargs = 3; + rmode = DImode; + nargs_constant = 1; + break; + case V2DI_FTYPE_V2DI_UINT_UINT: + nargs = 3; + nargs_constant = 2; + break; + case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT: + nargs = 3; + rmode = V8DImode; + nargs_constant = 1; + break; + case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT: + nargs = 5; + rmode = V8DImode; + mask_pos = 2; + nargs_constant = 1; + break; + case QI_FTYPE_V8DF_INT_UQI: + case QI_FTYPE_V4DF_INT_UQI: + case QI_FTYPE_V2DF_INT_UQI: + case HI_FTYPE_V16SF_INT_UHI: + case QI_FTYPE_V8SF_INT_UQI: + case QI_FTYPE_V4SF_INT_UQI: + case V4SI_FTYPE_V4SI_V4SI_UHI: + case V8SI_FTYPE_V8SI_V8SI_UHI: + nargs = 3; + mask_pos = 1; + nargs_constant = 1; + break; + case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT: + nargs = 5; + rmode = V4DImode; + mask_pos = 2; + nargs_constant = 1; + break; + case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT: + nargs = 5; + rmode = V2DImode; + mask_pos = 2; + nargs_constant = 1; + break; + case V32QI_FTYPE_V32QI_V32QI_V32QI_USI: + case V32HI_FTYPE_V32HI_V32HI_V32HI_USI: + case V32HI_FTYPE_V64QI_V64QI_V32HI_USI: + case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI: + case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI: + case V32HI_FTYPE_V32HI_V8HI_V32HI_USI: + case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI: + case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI: + case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI: + case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI: + case V32QI_FTYPE_V16HI_V16HI_V32QI_USI: + case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI: + case V32HI_FTYPE_V16SI_V16SI_V32HI_USI: + case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI: + case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI: + case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI: + case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI: + case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI: + case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI: + case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI: + case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI: + case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI: + case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI: + case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI: + case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI: + case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI: + case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI: + case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI: + case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI: + case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI: + case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI: + case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI: + case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI: + case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI: + case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI: + case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI: + case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI: + case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI: + case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI: + case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI: + case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI: + case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI: + case 
V8DF_FTYPE_V8DF_V8DI_V8DF_UQI: + case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI: + case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI: + case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI: + case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI: + case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI: + case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI: + case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI: + case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI: + nargs = 4; + break; + case V2DF_FTYPE_V2DF_V2DF_V2DI_INT: + case V4DF_FTYPE_V4DF_V4DF_V4DI_INT: + case V4SF_FTYPE_V4SF_V4SF_V4SI_INT: + case V8SF_FTYPE_V8SF_V8SF_V8SI_INT: + case V16SF_FTYPE_V16SF_V16SF_V16SI_INT: + nargs = 4; + nargs_constant = 1; + break; + case UQI_FTYPE_V4DI_V4DI_INT_UQI: + case UQI_FTYPE_V8SI_V8SI_INT_UQI: + case QI_FTYPE_V4DF_V4DF_INT_UQI: + case QI_FTYPE_V8SF_V8SF_INT_UQI: + case UQI_FTYPE_V2DI_V2DI_INT_UQI: + case UQI_FTYPE_V4SI_V4SI_INT_UQI: + case UQI_FTYPE_V2DF_V2DF_INT_UQI: + case UQI_FTYPE_V4SF_V4SF_INT_UQI: + case UDI_FTYPE_V64QI_V64QI_INT_UDI: + case USI_FTYPE_V32QI_V32QI_INT_USI: + case UHI_FTYPE_V16QI_V16QI_INT_UHI: + case USI_FTYPE_V32HI_V32HI_INT_USI: + case UHI_FTYPE_V16HI_V16HI_INT_UHI: + case UQI_FTYPE_V8HI_V8HI_INT_UQI: + case V32HI_FTYPE_V32HI_V32HI_V32HI_INT: + case V16HI_FTYPE_V16HI_V16HI_V16HI_INT: + case V8HI_FTYPE_V8HI_V8HI_V8HI_INT: + case V8SI_FTYPE_V8SI_V8SI_V8SI_INT: + case V4DI_FTYPE_V4DI_V4DI_V4DI_INT: + case V8DI_FTYPE_V8DI_V8DI_V8DI_INT: + case V16SI_FTYPE_V16SI_V16SI_V16SI_INT: + case V2DI_FTYPE_V2DI_V2DI_V2DI_INT: + case V4SI_FTYPE_V4SI_V4SI_V4SI_INT: + nargs = 4; + mask_pos = 1; + nargs_constant = 1; + break; + case V2DI_FTYPE_V2DI_V2DI_UINT_UINT: + nargs = 4; + nargs_constant = 2; + break; + case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED: + case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG: + nargs = 4; + break; + case UQI_FTYPE_V8DI_V8DI_INT_UQI: + case UHI_FTYPE_V16SI_V16SI_INT_UHI: + mask_pos = 1; + nargs = 4; + nargs_constant = 1; + break; + case V8SF_FTYPE_V8SF_INT_V8SF_UQI: + case V4SF_FTYPE_V4SF_INT_V4SF_UQI: + case V2DF_FTYPE_V4DF_INT_V2DF_UQI: + case V2DI_FTYPE_V4DI_INT_V2DI_UQI: + case V8SF_FTYPE_V16SF_INT_V8SF_UQI: + case V8SI_FTYPE_V16SI_INT_V8SI_UQI: + case V2DF_FTYPE_V8DF_INT_V2DF_UQI: + case V2DI_FTYPE_V8DI_INT_V2DI_UQI: + case V4SF_FTYPE_V8SF_INT_V4SF_UQI: + case V4SI_FTYPE_V8SI_INT_V4SI_UQI: + case V8HI_FTYPE_V8SF_INT_V8HI_UQI: + case V8HI_FTYPE_V4SF_INT_V8HI_UQI: + case V32HI_FTYPE_V32HI_INT_V32HI_USI: + case V16HI_FTYPE_V16HI_INT_V16HI_UHI: + case V8HI_FTYPE_V8HI_INT_V8HI_UQI: + case V4DI_FTYPE_V4DI_INT_V4DI_UQI: + case V2DI_FTYPE_V2DI_INT_V2DI_UQI: + case V8SI_FTYPE_V8SI_INT_V8SI_UQI: + case V4SI_FTYPE_V4SI_INT_V4SI_UQI: + case V4DF_FTYPE_V4DF_INT_V4DF_UQI: + case V2DF_FTYPE_V2DF_INT_V2DF_UQI: + case V8DF_FTYPE_V8DF_INT_V8DF_UQI: + case V16SF_FTYPE_V16SF_INT_V16SF_UHI: + case V16HI_FTYPE_V16SF_INT_V16HI_UHI: + case V16SI_FTYPE_V16SI_INT_V16SI_UHI: + case V4SI_FTYPE_V16SI_INT_V4SI_UQI: + case V4DI_FTYPE_V8DI_INT_V4DI_UQI: + case V4DF_FTYPE_V8DF_INT_V4DF_UQI: + case V4SF_FTYPE_V16SF_INT_V4SF_UQI: + case V8DI_FTYPE_V8DI_INT_V8DI_UQI: + nargs = 4; + mask_pos = 2; + nargs_constant = 1; + break; + case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI: + case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI: + case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI: + case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI: + case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI: + case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI: + case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI: + case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI: + case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI: + case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI: + case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI: + case 
V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI: + case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI: + case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI: + case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI: + case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI: + case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI: + case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI: + case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI: + case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI: + case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI: + case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI: + case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI: + case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI: + case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI: + case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI: + case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI: + nargs = 5; + mask_pos = 2; + nargs_constant = 1; + break; + case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI: + case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI: + case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI: + case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI: + case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI: + case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI: + case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI: + case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI: + case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI: + case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI: + nargs = 5; + mask_pos = 1; + nargs_constant = 1; + break; + case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI: + case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI: + case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI: + case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT: + case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT: + case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT: + case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT: + case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT: + case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT: + case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT: + case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT: + case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT: + nargs = 5; + mask_pos = 1; + nargs_constant = 2; + break; + + default: + gcc_unreachable (); + } + + gcc_assert (nargs <= ARRAY_SIZE (args)); + + if (comparison != UNKNOWN) + { + gcc_assert (nargs == 2); + return ix86_expand_sse_compare (d, exp, target, swap); + } + + if (rmode == VOIDmode || rmode == tmode) + { + if (optimize + || target == 0 + || GET_MODE (target) != tmode + || !insn_p->operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + else if (memory_operand (target, tmode)) + num_memory++; + real_target = target; + } + else + { + real_target = gen_reg_rtx (tmode); + target = lowpart_subreg (rmode, real_target, tmode); + } + + for (i = 0; i < nargs; i++) + { + tree arg = CALL_EXPR_ARG (exp, i); + rtx op = expand_normal (arg); + machine_mode mode = insn_p->operand[i + 1].mode; + bool match = insn_p->operand[i + 1].predicate (op, mode); + + if (second_arg_count && i == 1) + { + /* SIMD shift insns take either an 8-bit immediate or + register as count. But builtin functions take int as + count. If count doesn't match, we put it in register. + The instructions are using 64-bit count, if op is just + 32-bit, zero-extend it, as negative shift counts + are undefined behavior and zero-extension is more + efficient. 
*/ + if (!match) + { + if (SCALAR_INT_MODE_P (GET_MODE (op))) + op = convert_modes (mode, GET_MODE (op), op, 1); + else + op = lowpart_subreg (mode, op, GET_MODE (op)); + if (!insn_p->operand[i + 1].predicate (op, mode)) + op = copy_to_reg (op); + } + } + else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) || + (!mask_pos && (nargs - i) <= nargs_constant)) + { + if (!match) + switch (icode) + { + case CODE_FOR_avx_vinsertf128v4di: + case CODE_FOR_avx_vextractf128v4di: + error ("the last argument must be an 1-bit immediate"); + return const0_rtx; + + case CODE_FOR_avx512f_cmpv8di3_mask: + case CODE_FOR_avx512f_cmpv16si3_mask: + case CODE_FOR_avx512f_ucmpv8di3_mask: + case CODE_FOR_avx512f_ucmpv16si3_mask: + case CODE_FOR_avx512vl_cmpv4di3_mask: + case CODE_FOR_avx512vl_cmpv8si3_mask: + case CODE_FOR_avx512vl_ucmpv4di3_mask: + case CODE_FOR_avx512vl_ucmpv8si3_mask: + case CODE_FOR_avx512vl_cmpv2di3_mask: + case CODE_FOR_avx512vl_cmpv4si3_mask: + case CODE_FOR_avx512vl_ucmpv2di3_mask: + case CODE_FOR_avx512vl_ucmpv4si3_mask: + error ("the last argument must be a 3-bit immediate"); + return const0_rtx; + + case CODE_FOR_sse4_1_roundsd: + case CODE_FOR_sse4_1_roundss: + + case CODE_FOR_sse4_1_roundpd: + case CODE_FOR_sse4_1_roundps: + case CODE_FOR_avx_roundpd256: + case CODE_FOR_avx_roundps256: + + case CODE_FOR_sse4_1_roundpd_vec_pack_sfix: + case CODE_FOR_sse4_1_roundps_sfix: + case CODE_FOR_avx_roundpd_vec_pack_sfix256: + case CODE_FOR_avx_roundps_sfix256: + + case CODE_FOR_sse4_1_blendps: + case CODE_FOR_avx_blendpd256: + case CODE_FOR_avx_vpermilv4df: + case CODE_FOR_avx_vpermilv4df_mask: + case CODE_FOR_avx512f_getmantv8df_mask: + case CODE_FOR_avx512f_getmantv16sf_mask: + case CODE_FOR_avx512vl_getmantv8sf_mask: + case CODE_FOR_avx512vl_getmantv4df_mask: + case CODE_FOR_avx512vl_getmantv4sf_mask: + case CODE_FOR_avx512vl_getmantv2df_mask: + case CODE_FOR_avx512dq_rangepv8df_mask_round: + case CODE_FOR_avx512dq_rangepv16sf_mask_round: + case CODE_FOR_avx512dq_rangepv4df_mask: + case CODE_FOR_avx512dq_rangepv8sf_mask: + case CODE_FOR_avx512dq_rangepv2df_mask: + case CODE_FOR_avx512dq_rangepv4sf_mask: + case CODE_FOR_avx_shufpd256_mask: + error ("the last argument must be a 4-bit immediate"); + return const0_rtx; + + case CODE_FOR_sha1rnds4: + case CODE_FOR_sse4_1_blendpd: + case CODE_FOR_avx_vpermilv2df: + case CODE_FOR_avx_vpermilv2df_mask: + case CODE_FOR_xop_vpermil2v2df3: + case CODE_FOR_xop_vpermil2v4sf3: + case CODE_FOR_xop_vpermil2v4df3: + case CODE_FOR_xop_vpermil2v8sf3: + case CODE_FOR_avx512f_vinsertf32x4_mask: + case CODE_FOR_avx512f_vinserti32x4_mask: + case CODE_FOR_avx512f_vextractf32x4_mask: + case CODE_FOR_avx512f_vextracti32x4_mask: + case CODE_FOR_sse2_shufpd: + case CODE_FOR_sse2_shufpd_mask: + case CODE_FOR_avx512dq_shuf_f64x2_mask: + case CODE_FOR_avx512dq_shuf_i64x2_mask: + case CODE_FOR_avx512vl_shuf_i32x4_mask: + case CODE_FOR_avx512vl_shuf_f32x4_mask: + error ("the last argument must be a 2-bit immediate"); + return const0_rtx; + + case CODE_FOR_avx_vextractf128v4df: + case CODE_FOR_avx_vextractf128v8sf: + case CODE_FOR_avx_vextractf128v8si: + case CODE_FOR_avx_vinsertf128v4df: + case CODE_FOR_avx_vinsertf128v8sf: + case CODE_FOR_avx_vinsertf128v8si: + case CODE_FOR_avx512f_vinsertf64x4_mask: + case CODE_FOR_avx512f_vinserti64x4_mask: + case CODE_FOR_avx512f_vextractf64x4_mask: + case CODE_FOR_avx512f_vextracti64x4_mask: + case CODE_FOR_avx512dq_vinsertf32x8_mask: + case CODE_FOR_avx512dq_vinserti32x8_mask: + case CODE_FOR_avx512vl_vinsertv4df: + case 
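/* Illustrative sketch, not part of the GCC patch: a plain-C model of the
   variable-count SIMD shift whose count operand is normalized just above.
   The hardware reads a 64-bit count and clears the element once the count
   reaches the element width, so zero-extending a 32-bit count preserves the
   semantics; negative counts are undefined at the builtin level, which is
   why the cheaper zero-extension is chosen.  The helper name is made up.  */
#include <stdint.h>

static inline uint32_t
psll_d_element_model (uint32_t elt, uint64_t count)
{
  /* PSLLD with a register count: counts of 32 or more yield 0.  */
  return count >= 32 ? 0u : elt << count;
}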
CODE_FOR_avx512vl_vinsertv4di: + case CODE_FOR_avx512vl_vinsertv8sf: + case CODE_FOR_avx512vl_vinsertv8si: + error ("the last argument must be a 1-bit immediate"); + return const0_rtx; + + case CODE_FOR_avx_vmcmpv2df3: + case CODE_FOR_avx_vmcmpv4sf3: + case CODE_FOR_avx_cmpv2df3: + case CODE_FOR_avx_cmpv4sf3: + case CODE_FOR_avx_cmpv4df3: + case CODE_FOR_avx_cmpv8sf3: + case CODE_FOR_avx512f_cmpv8df3_mask: + case CODE_FOR_avx512f_cmpv16sf3_mask: + case CODE_FOR_avx512f_vmcmpv2df3_mask: + case CODE_FOR_avx512f_vmcmpv4sf3_mask: + error ("the last argument must be a 5-bit immediate"); + return const0_rtx; + + default: + switch (nargs_constant) + { + case 2: + if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) || + (!mask_pos && (nargs - i) == nargs_constant)) + { + error ("the next to last argument must be an 8-bit immediate"); + break; + } + /* FALLTHRU */ + case 1: + error ("the last argument must be an 8-bit immediate"); + break; + default: + gcc_unreachable (); + } + return const0_rtx; + } + } + else + { + if (VECTOR_MODE_P (mode)) + op = safe_vector_operand (op, mode); + + /* If we aren't optimizing, only allow one memory operand to + be generated. */ + if (memory_operand (op, mode)) + num_memory++; + + op = fixup_modeless_constant (op, mode); + + if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) + { + if (optimize || !match || num_memory > 1) + op = copy_to_mode_reg (mode, op); + } + else + { + op = copy_to_reg (op); + op = lowpart_subreg (mode, op, GET_MODE (op)); + } + } + + args[i].op = op; + args[i].mode = mode; + } + + switch (nargs) + { + case 1: + pat = GEN_FCN (icode) (real_target, args[0].op); + break; + case 2: + pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op); + break; + case 3: + pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, + args[2].op); + break; + case 4: + pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, + args[2].op, args[3].op); + break; + case 5: + pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, + args[2].op, args[3].op, args[4].op); + break; + case 6: + pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, + args[2].op, args[3].op, args[4].op, + args[5].op); + break; + default: + gcc_unreachable (); + } + + if (! pat) + return 0; + + emit_insn (pat); + return target; +} + +/* Transform pattern of following layout: + (set A + (unspec [B C] UNSPEC_EMBEDDED_ROUNDING)) + ) + into: + (set (A B)) */ + +static rtx +ix86_erase_embedded_rounding (rtx pat) +{ + if (GET_CODE (pat) == INSN) + pat = PATTERN (pat); + + gcc_assert (GET_CODE (pat) == SET); + rtx src = SET_SRC (pat); + gcc_assert (XVECLEN (src, 0) == 2); + rtx p0 = XVECEXP (src, 0, 0); + gcc_assert (GET_CODE (src) == UNSPEC + && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING); + rtx res = gen_rtx_SET (SET_DEST (pat), p0); + return res; +} + +/* Subroutine of ix86_expand_round_builtin to take care of comi insns + with rounding. 
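/* Illustrative sketch, not part of the GCC patch: ix86_erase_embedded_rounding
   above rewrites (set A (unspec [B C] UNSPEC_EMBEDDED_ROUNDING)) back into a
   plain (set A B).  At the source level that is what happens when an
   embedded-rounding intrinsic is passed _MM_FROUND_CUR_DIRECTION, which maps
   to the NO_ROUND value tested by the callers below.  Assumes AVX-512F; the
   wrapper name is made up.  */
#include <immintrin.h>

__attribute__ ((target ("avx512f")))
__m128d
add_sd_current_rounding (__m128d a, __m128d b)
{
  /* No rounding override requested, so the embedded-rounding unspec is
     erased and this expands, in effect, like an ordinary VADDSD.  */
  return _mm_add_round_sd (a, b, _MM_FROUND_CUR_DIRECTION);
}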
*/ +static rtx +ix86_expand_sse_comi_round (const struct builtin_description *d, + tree exp, rtx target) +{ + rtx pat, set_dst; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + tree arg2 = CALL_EXPR_ARG (exp, 2); + tree arg3 = CALL_EXPR_ARG (exp, 3); + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + rtx op2 = expand_normal (arg2); + rtx op3 = expand_normal (arg3); + enum insn_code icode = d->icode; + const struct insn_data_d *insn_p = &insn_data[icode]; + machine_mode mode0 = insn_p->operand[0].mode; + machine_mode mode1 = insn_p->operand[1].mode; + enum rtx_code comparison = UNEQ; + bool need_ucomi = false; + + /* See avxintrin.h for values. */ + enum rtx_code comi_comparisons[32] = + { + UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT, + UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE, + UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT + }; + bool need_ucomi_values[32] = + { + true, false, false, true, true, false, false, true, + true, false, false, true, true, false, false, true, + false, true, true, false, false, true, true, false, + false, true, true, false, false, true, true, false + }; + + if (!CONST_INT_P (op2)) + { + error ("the third argument must be comparison constant"); + return const0_rtx; + } + if (INTVAL (op2) < 0 || INTVAL (op2) >= 32) + { + error ("incorrect comparison mode"); + return const0_rtx; + } + + if (!insn_p->operand[2].predicate (op3, SImode)) + { + error ("incorrect rounding operand"); + return const0_rtx; + } + + comparison = comi_comparisons[INTVAL (op2)]; + need_ucomi = need_ucomi_values[INTVAL (op2)]; + + if (VECTOR_MODE_P (mode0)) + op0 = safe_vector_operand (op0, mode0); + if (VECTOR_MODE_P (mode1)) + op1 = safe_vector_operand (op1, mode1); + + target = gen_reg_rtx (SImode); + emit_move_insn (target, const0_rtx); + target = gen_rtx_SUBREG (QImode, target, 0); + + if ((optimize && !register_operand (op0, mode0)) + || !insn_p->operand[0].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if ((optimize && !register_operand (op1, mode1)) + || !insn_p->operand[1].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + + if (need_ucomi) + icode = icode == CODE_FOR_sse_comi_round + ? CODE_FOR_sse_ucomi_round + : CODE_FOR_sse2_ucomi_round; + + pat = GEN_FCN (icode) (op0, op1, op3); + if (! pat) + return 0; + + /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */ + if (INTVAL (op3) == NO_ROUND) + { + pat = ix86_erase_embedded_rounding (pat); + if (! 
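/* Illustrative sketch, not part of the GCC patch: ix86_expand_sse_comi_round
   above maps a 0..31 predicate (the _CMP_* constants) onto an rtx comparison
   of the flags set by (V)COMISS/(V)UCOMISS and reads the result back through
   a QImode subreg.  The plain SSE comi intrinsic below shows the same
   user-visible contract (the *_round variants handled here just add the
   predicate and SAE operands); the function name is made up.  */
#include <xmmintrin.h>

int
scalar_less_than (float a, float b)
{
  /* COMISS followed by a SETcc on the flags, i.e. a 0/1 result.  */
  return _mm_comilt_ss (_mm_set_ss (a), _mm_set_ss (b));
}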
pat) + return 0; + + set_dst = SET_DEST (pat); + } + else + { + gcc_assert (GET_CODE (pat) == SET); + set_dst = SET_DEST (pat); + } + + emit_insn (pat); + emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), + gen_rtx_fmt_ee (comparison, QImode, + set_dst, + const0_rtx))); + + return SUBREG_REG (target); +} + +static rtx +ix86_expand_round_builtin (const struct builtin_description *d, + tree exp, rtx target) +{ + rtx pat; + unsigned int i, nargs; + struct + { + rtx op; + machine_mode mode; + } args[6]; + enum insn_code icode = d->icode; + const struct insn_data_d *insn_p = &insn_data[icode]; + machine_mode tmode = insn_p->operand[0].mode; + unsigned int nargs_constant = 0; + unsigned int redundant_embed_rnd = 0; + + switch ((enum ix86_builtin_func_type) d->flag) + { + case UINT64_FTYPE_V2DF_INT: + case UINT64_FTYPE_V4SF_INT: + case UINT_FTYPE_V2DF_INT: + case UINT_FTYPE_V4SF_INT: + case INT64_FTYPE_V2DF_INT: + case INT64_FTYPE_V4SF_INT: + case INT_FTYPE_V2DF_INT: + case INT_FTYPE_V4SF_INT: + nargs = 2; + break; + case V4SF_FTYPE_V4SF_UINT_INT: + case V4SF_FTYPE_V4SF_UINT64_INT: + case V2DF_FTYPE_V2DF_UINT64_INT: + case V4SF_FTYPE_V4SF_INT_INT: + case V4SF_FTYPE_V4SF_INT64_INT: + case V2DF_FTYPE_V2DF_INT64_INT: + case V4SF_FTYPE_V4SF_V4SF_INT: + case V2DF_FTYPE_V2DF_V2DF_INT: + case V4SF_FTYPE_V4SF_V2DF_INT: + case V2DF_FTYPE_V2DF_V4SF_INT: + nargs = 3; + break; + case V8SF_FTYPE_V8DF_V8SF_QI_INT: + case V8DF_FTYPE_V8DF_V8DF_QI_INT: + case V8SI_FTYPE_V8DF_V8SI_QI_INT: + case V8DI_FTYPE_V8DF_V8DI_QI_INT: + case V8SF_FTYPE_V8DI_V8SF_QI_INT: + case V8DF_FTYPE_V8DI_V8DF_QI_INT: + case V16SF_FTYPE_V16SF_V16SF_HI_INT: + case V8DI_FTYPE_V8SF_V8DI_QI_INT: + case V16SF_FTYPE_V16SI_V16SF_HI_INT: + case V16SI_FTYPE_V16SF_V16SI_HI_INT: + case V8DF_FTYPE_V8SF_V8DF_QI_INT: + case V16SF_FTYPE_V16HI_V16SF_HI_INT: + case V2DF_FTYPE_V2DF_V2DF_V2DF_INT: + case V4SF_FTYPE_V4SF_V4SF_V4SF_INT: + nargs = 4; + break; + case V4SF_FTYPE_V4SF_V4SF_INT_INT: + case V2DF_FTYPE_V2DF_V2DF_INT_INT: + nargs_constant = 2; + nargs = 4; + break; + case INT_FTYPE_V4SF_V4SF_INT_INT: + case INT_FTYPE_V2DF_V2DF_INT_INT: + return ix86_expand_sse_comi_round (d, exp, target); + case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT: + case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT: + case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT: + case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT: + case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT: + case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT: + case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT: + case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT: + nargs = 5; + break; + case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT: + case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT: + nargs_constant = 4; + nargs = 5; + break; + case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT: + case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT: + case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT: + case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT: + nargs_constant = 3; + nargs = 5; + break; + case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT: + case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT: + case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT: + case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT: + case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT: + case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT: + nargs = 6; + nargs_constant = 4; + break; + case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT: + case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT: + case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT: + case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT: + nargs = 6; + nargs_constant = 3; + break; + default: + gcc_unreachable (); + } + gcc_assert (nargs <= ARRAY_SIZE (args)); + + if (optimize + || target == 0 + || GET_MODE 
(target) != tmode + || !insn_p->operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + + for (i = 0; i < nargs; i++) + { + tree arg = CALL_EXPR_ARG (exp, i); + rtx op = expand_normal (arg); + machine_mode mode = insn_p->operand[i + 1].mode; + bool match = insn_p->operand[i + 1].predicate (op, mode); + + if (i == nargs - nargs_constant) + { + if (!match) + { + switch (icode) + { + case CODE_FOR_avx512f_getmantv8df_mask_round: + case CODE_FOR_avx512f_getmantv16sf_mask_round: + case CODE_FOR_avx512f_vgetmantv2df_round: + case CODE_FOR_avx512f_vgetmantv2df_mask_round: + case CODE_FOR_avx512f_vgetmantv4sf_round: + case CODE_FOR_avx512f_vgetmantv4sf_mask_round: + error ("the immediate argument must be a 4-bit immediate"); + return const0_rtx; + case CODE_FOR_avx512f_cmpv8df3_mask_round: + case CODE_FOR_avx512f_cmpv16sf3_mask_round: + case CODE_FOR_avx512f_vmcmpv2df3_mask_round: + case CODE_FOR_avx512f_vmcmpv4sf3_mask_round: + error ("the immediate argument must be a 5-bit immediate"); + return const0_rtx; + default: + error ("the immediate argument must be an 8-bit immediate"); + return const0_rtx; + } + } + } + else if (i == nargs-1) + { + if (!insn_p->operand[nargs].predicate (op, SImode)) + { + error ("incorrect rounding operand"); + return const0_rtx; + } + + /* If there is no rounding use normal version of the pattern. */ + if (INTVAL (op) == NO_ROUND) + redundant_embed_rnd = 1; + } + else + { + if (VECTOR_MODE_P (mode)) + op = safe_vector_operand (op, mode); + + op = fixup_modeless_constant (op, mode); + + if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) + { + if (optimize || !match) + op = copy_to_mode_reg (mode, op); + } + else + { + op = copy_to_reg (op); + op = lowpart_subreg (mode, op, GET_MODE (op)); + } + } + + args[i].op = op; + args[i].mode = mode; + } + + switch (nargs) + { + case 1: + pat = GEN_FCN (icode) (target, args[0].op); + break; + case 2: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op); + break; + case 3: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, + args[2].op); + break; + case 4: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, + args[2].op, args[3].op); + break; + case 5: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, + args[2].op, args[3].op, args[4].op); + break; + case 6: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, + args[2].op, args[3].op, args[4].op, + args[5].op); + break; + default: + gcc_unreachable (); + } + + if (!pat) + return 0; + + if (redundant_embed_rnd) + pat = ix86_erase_embedded_rounding (pat); + + emit_insn (pat); + return target; +} + +/* Subroutine of ix86_expand_builtin to take care of special insns + with variable number of operands. 
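/* Illustrative sketch, not part of the GCC patch: ix86_expand_round_builtin
   above validates the trailing rounding operand ("incorrect rounding
   operand") and only keeps the embedded-rounding form when it is not
   _MM_FROUND_CUR_DIRECTION.  A typical caller with an explicit override,
   assuming AVX-512F; the wrapper name is made up.  */
#include <immintrin.h>

__attribute__ ((target ("avx512f")))
__m512d
add_round_toward_zero (__m512d a, __m512d b)
{
  /* The last argument must be a constant, valid rounding immediate;
     anything else is rejected by the checks above.  */
  return _mm512_add_round_pd (a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}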
*/ + +static rtx +ix86_expand_special_args_builtin (const struct builtin_description *d, + tree exp, rtx target) +{ + tree arg; + rtx pat, op; + unsigned int i, nargs, arg_adjust, memory; + bool aligned_mem = false; + struct + { + rtx op; + machine_mode mode; + } args[3]; + enum insn_code icode = d->icode; + bool last_arg_constant = false; + const struct insn_data_d *insn_p = &insn_data[icode]; + machine_mode tmode = insn_p->operand[0].mode; + enum { load, store } klass; + + switch ((enum ix86_builtin_func_type) d->flag) + { + case VOID_FTYPE_VOID: + emit_insn (GEN_FCN (icode) (target)); + return 0; + case VOID_FTYPE_UINT64: + case VOID_FTYPE_UNSIGNED: + nargs = 0; + klass = store; + memory = 0; + break; + + case INT_FTYPE_VOID: + case USHORT_FTYPE_VOID: + case UINT64_FTYPE_VOID: + case UINT_FTYPE_VOID: + case UNSIGNED_FTYPE_VOID: + nargs = 0; + klass = load; + memory = 0; + break; + case UINT64_FTYPE_PUNSIGNED: + case V2DI_FTYPE_PV2DI: + case V4DI_FTYPE_PV4DI: + case V32QI_FTYPE_PCCHAR: + case V16QI_FTYPE_PCCHAR: + case V8SF_FTYPE_PCV4SF: + case V8SF_FTYPE_PCFLOAT: + case V4SF_FTYPE_PCFLOAT: + case V4DF_FTYPE_PCV2DF: + case V4DF_FTYPE_PCDOUBLE: + case V2DF_FTYPE_PCDOUBLE: + case VOID_FTYPE_PVOID: + case V8DI_FTYPE_PV8DI: + nargs = 1; + klass = load; + memory = 0; + switch (icode) + { + case CODE_FOR_sse4_1_movntdqa: + case CODE_FOR_avx2_movntdqa: + case CODE_FOR_avx512f_movntdqa: + aligned_mem = true; + break; + default: + break; + } + break; + case VOID_FTYPE_PV2SF_V4SF: + case VOID_FTYPE_PV8DI_V8DI: + case VOID_FTYPE_PV4DI_V4DI: + case VOID_FTYPE_PV2DI_V2DI: + case VOID_FTYPE_PCHAR_V32QI: + case VOID_FTYPE_PCHAR_V16QI: + case VOID_FTYPE_PFLOAT_V16SF: + case VOID_FTYPE_PFLOAT_V8SF: + case VOID_FTYPE_PFLOAT_V4SF: + case VOID_FTYPE_PDOUBLE_V8DF: + case VOID_FTYPE_PDOUBLE_V4DF: + case VOID_FTYPE_PDOUBLE_V2DF: + case VOID_FTYPE_PLONGLONG_LONGLONG: + case VOID_FTYPE_PULONGLONG_ULONGLONG: + case VOID_FTYPE_PUNSIGNED_UNSIGNED: + case VOID_FTYPE_PINT_INT: + nargs = 1; + klass = store; + /* Reserve memory operand for target. */ + memory = ARRAY_SIZE (args); + switch (icode) + { + /* These builtins and instructions require the memory + to be properly aligned. 
*/ + case CODE_FOR_avx_movntv4di: + case CODE_FOR_sse2_movntv2di: + case CODE_FOR_avx_movntv8sf: + case CODE_FOR_sse_movntv4sf: + case CODE_FOR_sse4a_vmmovntv4sf: + case CODE_FOR_avx_movntv4df: + case CODE_FOR_sse2_movntv2df: + case CODE_FOR_sse4a_vmmovntv2df: + case CODE_FOR_sse2_movntidi: + case CODE_FOR_sse_movntq: + case CODE_FOR_sse2_movntisi: + case CODE_FOR_avx512f_movntv16sf: + case CODE_FOR_avx512f_movntv8df: + case CODE_FOR_avx512f_movntv8di: + aligned_mem = true; + break; + default: + break; + } + break; + case VOID_FTYPE_PVOID_PCVOID: + nargs = 1; + klass = store; + memory = 0; + + break; + case V4SF_FTYPE_V4SF_PCV2SF: + case V2DF_FTYPE_V2DF_PCDOUBLE: + nargs = 2; + klass = load; + memory = 1; + break; + case V8SF_FTYPE_PCV8SF_V8SI: + case V4DF_FTYPE_PCV4DF_V4DI: + case V4SF_FTYPE_PCV4SF_V4SI: + case V2DF_FTYPE_PCV2DF_V2DI: + case V8SI_FTYPE_PCV8SI_V8SI: + case V4DI_FTYPE_PCV4DI_V4DI: + case V4SI_FTYPE_PCV4SI_V4SI: + case V2DI_FTYPE_PCV2DI_V2DI: + case VOID_FTYPE_INT_INT64: + nargs = 2; + klass = load; + memory = 0; + break; + case VOID_FTYPE_PV8DF_V8DF_UQI: + case VOID_FTYPE_PV4DF_V4DF_UQI: + case VOID_FTYPE_PV2DF_V2DF_UQI: + case VOID_FTYPE_PV16SF_V16SF_UHI: + case VOID_FTYPE_PV8SF_V8SF_UQI: + case VOID_FTYPE_PV4SF_V4SF_UQI: + case VOID_FTYPE_PV8DI_V8DI_UQI: + case VOID_FTYPE_PV4DI_V4DI_UQI: + case VOID_FTYPE_PV2DI_V2DI_UQI: + case VOID_FTYPE_PV16SI_V16SI_UHI: + case VOID_FTYPE_PV8SI_V8SI_UQI: + case VOID_FTYPE_PV4SI_V4SI_UQI: + case VOID_FTYPE_PV64QI_V64QI_UDI: + case VOID_FTYPE_PV32HI_V32HI_USI: + case VOID_FTYPE_PV32QI_V32QI_USI: + case VOID_FTYPE_PV16QI_V16QI_UHI: + case VOID_FTYPE_PV16HI_V16HI_UHI: + case VOID_FTYPE_PV8HI_V8HI_UQI: + switch (icode) + { + /* These builtins and instructions require the memory + to be properly aligned. 
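/* Illustrative sketch, not part of the GCC patch: the movnt cases above set
   aligned_mem so that the destination MEM keeps its full mode alignment,
   matching the hardware requirement that non-temporal stores such as MOVNTPS
   be naturally aligned.  Minimal SSE usage; dst must be 16-byte aligned and
   the function name is made up.  */
#include <xmmintrin.h>
#include <stddef.h>

void
stream_fill (float *dst /* 16-byte aligned */, float value, size_t nquads)
{
  __m128 v = _mm_set1_ps (value);
  for (size_t i = 0; i < nquads; i++)
    _mm_stream_ps (dst + 4 * i, v);      /* MOVNTPS, bypasses the cache */
  _mm_sfence ();                         /* order the streaming stores */
}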
*/ + case CODE_FOR_avx512f_storev16sf_mask: + case CODE_FOR_avx512f_storev16si_mask: + case CODE_FOR_avx512f_storev8df_mask: + case CODE_FOR_avx512f_storev8di_mask: + case CODE_FOR_avx512vl_storev8sf_mask: + case CODE_FOR_avx512vl_storev8si_mask: + case CODE_FOR_avx512vl_storev4df_mask: + case CODE_FOR_avx512vl_storev4di_mask: + case CODE_FOR_avx512vl_storev4sf_mask: + case CODE_FOR_avx512vl_storev4si_mask: + case CODE_FOR_avx512vl_storev2df_mask: + case CODE_FOR_avx512vl_storev2di_mask: + aligned_mem = true; + break; + default: + break; + } + /* FALLTHRU */ + case VOID_FTYPE_PV8SF_V8SI_V8SF: + case VOID_FTYPE_PV4DF_V4DI_V4DF: + case VOID_FTYPE_PV4SF_V4SI_V4SF: + case VOID_FTYPE_PV2DF_V2DI_V2DF: + case VOID_FTYPE_PV8SI_V8SI_V8SI: + case VOID_FTYPE_PV4DI_V4DI_V4DI: + case VOID_FTYPE_PV4SI_V4SI_V4SI: + case VOID_FTYPE_PV2DI_V2DI_V2DI: + case VOID_FTYPE_PV8SI_V8DI_UQI: + case VOID_FTYPE_PV8HI_V8DI_UQI: + case VOID_FTYPE_PV16HI_V16SI_UHI: + case VOID_FTYPE_PV16QI_V8DI_UQI: + case VOID_FTYPE_PV16QI_V16SI_UHI: + case VOID_FTYPE_PV4SI_V4DI_UQI: + case VOID_FTYPE_PV4SI_V2DI_UQI: + case VOID_FTYPE_PV8HI_V4DI_UQI: + case VOID_FTYPE_PV8HI_V2DI_UQI: + case VOID_FTYPE_PV8HI_V8SI_UQI: + case VOID_FTYPE_PV8HI_V4SI_UQI: + case VOID_FTYPE_PV16QI_V4DI_UQI: + case VOID_FTYPE_PV16QI_V2DI_UQI: + case VOID_FTYPE_PV16QI_V8SI_UQI: + case VOID_FTYPE_PV16QI_V4SI_UQI: + case VOID_FTYPE_PCHAR_V64QI_UDI: + case VOID_FTYPE_PCHAR_V32QI_USI: + case VOID_FTYPE_PCHAR_V16QI_UHI: + case VOID_FTYPE_PSHORT_V32HI_USI: + case VOID_FTYPE_PSHORT_V16HI_UHI: + case VOID_FTYPE_PSHORT_V8HI_UQI: + case VOID_FTYPE_PINT_V16SI_UHI: + case VOID_FTYPE_PINT_V8SI_UQI: + case VOID_FTYPE_PINT_V4SI_UQI: + case VOID_FTYPE_PINT64_V8DI_UQI: + case VOID_FTYPE_PINT64_V4DI_UQI: + case VOID_FTYPE_PINT64_V2DI_UQI: + case VOID_FTYPE_PDOUBLE_V8DF_UQI: + case VOID_FTYPE_PDOUBLE_V4DF_UQI: + case VOID_FTYPE_PDOUBLE_V2DF_UQI: + case VOID_FTYPE_PFLOAT_V16SF_UHI: + case VOID_FTYPE_PFLOAT_V8SF_UQI: + case VOID_FTYPE_PFLOAT_V4SF_UQI: + case VOID_FTYPE_PV32QI_V32HI_USI: + case VOID_FTYPE_PV16QI_V16HI_UHI: + case VOID_FTYPE_PV8QI_V8HI_UQI: + nargs = 2; + klass = store; + /* Reserve memory operand for target. */ + memory = ARRAY_SIZE (args); + break; + case V4SF_FTYPE_PCV4SF_V4SF_UQI: + case V8SF_FTYPE_PCV8SF_V8SF_UQI: + case V16SF_FTYPE_PCV16SF_V16SF_UHI: + case V4SI_FTYPE_PCV4SI_V4SI_UQI: + case V8SI_FTYPE_PCV8SI_V8SI_UQI: + case V16SI_FTYPE_PCV16SI_V16SI_UHI: + case V2DF_FTYPE_PCV2DF_V2DF_UQI: + case V4DF_FTYPE_PCV4DF_V4DF_UQI: + case V8DF_FTYPE_PCV8DF_V8DF_UQI: + case V2DI_FTYPE_PCV2DI_V2DI_UQI: + case V4DI_FTYPE_PCV4DI_V4DI_UQI: + case V8DI_FTYPE_PCV8DI_V8DI_UQI: + case V64QI_FTYPE_PCV64QI_V64QI_UDI: + case V32HI_FTYPE_PCV32HI_V32HI_USI: + case V32QI_FTYPE_PCV32QI_V32QI_USI: + case V16QI_FTYPE_PCV16QI_V16QI_UHI: + case V16HI_FTYPE_PCV16HI_V16HI_UHI: + case V8HI_FTYPE_PCV8HI_V8HI_UQI: + switch (icode) + { + /* These builtins and instructions require the memory + to be properly aligned. 
*/ + case CODE_FOR_avx512f_loadv16sf_mask: + case CODE_FOR_avx512f_loadv16si_mask: + case CODE_FOR_avx512f_loadv8df_mask: + case CODE_FOR_avx512f_loadv8di_mask: + case CODE_FOR_avx512vl_loadv8sf_mask: + case CODE_FOR_avx512vl_loadv8si_mask: + case CODE_FOR_avx512vl_loadv4df_mask: + case CODE_FOR_avx512vl_loadv4di_mask: + case CODE_FOR_avx512vl_loadv4sf_mask: + case CODE_FOR_avx512vl_loadv4si_mask: + case CODE_FOR_avx512vl_loadv2df_mask: + case CODE_FOR_avx512vl_loadv2di_mask: + case CODE_FOR_avx512bw_loadv64qi_mask: + case CODE_FOR_avx512vl_loadv32qi_mask: + case CODE_FOR_avx512vl_loadv16qi_mask: + case CODE_FOR_avx512bw_loadv32hi_mask: + case CODE_FOR_avx512vl_loadv16hi_mask: + case CODE_FOR_avx512vl_loadv8hi_mask: + aligned_mem = true; + break; + default: + break; + } + /* FALLTHRU */ + case V64QI_FTYPE_PCCHAR_V64QI_UDI: + case V32QI_FTYPE_PCCHAR_V32QI_USI: + case V16QI_FTYPE_PCCHAR_V16QI_UHI: + case V32HI_FTYPE_PCSHORT_V32HI_USI: + case V16HI_FTYPE_PCSHORT_V16HI_UHI: + case V8HI_FTYPE_PCSHORT_V8HI_UQI: + case V16SI_FTYPE_PCINT_V16SI_UHI: + case V8SI_FTYPE_PCINT_V8SI_UQI: + case V4SI_FTYPE_PCINT_V4SI_UQI: + case V8DI_FTYPE_PCINT64_V8DI_UQI: + case V4DI_FTYPE_PCINT64_V4DI_UQI: + case V2DI_FTYPE_PCINT64_V2DI_UQI: + case V8DF_FTYPE_PCDOUBLE_V8DF_UQI: + case V4DF_FTYPE_PCDOUBLE_V4DF_UQI: + case V2DF_FTYPE_PCDOUBLE_V2DF_UQI: + case V16SF_FTYPE_PCFLOAT_V16SF_UHI: + case V8SF_FTYPE_PCFLOAT_V8SF_UQI: + case V4SF_FTYPE_PCFLOAT_V4SF_UQI: + nargs = 3; + klass = load; + memory = 0; + break; + case VOID_FTYPE_UINT_UINT_UINT: + case VOID_FTYPE_UINT64_UINT_UINT: + case UCHAR_FTYPE_UINT_UINT_UINT: + case UCHAR_FTYPE_UINT64_UINT_UINT: + nargs = 3; + klass = load; + memory = ARRAY_SIZE (args); + last_arg_constant = true; + break; + default: + gcc_unreachable (); + } + + gcc_assert (nargs <= ARRAY_SIZE (args)); + + if (klass == store) + { + arg = CALL_EXPR_ARG (exp, 0); + op = expand_normal (arg); + gcc_assert (target == 0); + if (memory) + { + op = ix86_zero_extend_to_Pmode (op); + target = gen_rtx_MEM (tmode, op); + /* target at this point has just BITS_PER_UNIT MEM_ALIGN + on it. Try to improve it using get_pointer_alignment, + and if the special builtin is one that requires strict + mode alignment, also from it's GET_MODE_ALIGNMENT. + Failure to do so could lead to ix86_legitimate_combined_insn + rejecting all changes to such insns. */ + unsigned int align = get_pointer_alignment (arg); + if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode)) + align = GET_MODE_ALIGNMENT (tmode); + if (MEM_ALIGN (target) < align) + set_mem_align (target, align); + } + else + target = force_reg (tmode, op); + arg_adjust = 1; + } + else + { + arg_adjust = 0; + if (optimize + || target == 0 + || !register_operand (target, tmode) + || GET_MODE (target) != tmode) + target = gen_reg_rtx (tmode); + } + + for (i = 0; i < nargs; i++) + { + machine_mode mode = insn_p->operand[i + 1].mode; + bool match; + + arg = CALL_EXPR_ARG (exp, i + arg_adjust); + op = expand_normal (arg); + match = insn_p->operand[i + 1].predicate (op, mode); + + if (last_arg_constant && (i + 1) == nargs) + { + if (!match) + { + if (icode == CODE_FOR_lwp_lwpvalsi3 + || icode == CODE_FOR_lwp_lwpinssi3 + || icode == CODE_FOR_lwp_lwpvaldi3 + || icode == CODE_FOR_lwp_lwpinsdi3) + error ("the last argument must be a 32-bit immediate"); + else + error ("the last argument must be an 8-bit immediate"); + return const0_rtx; + } + } + else + { + if (i == memory) + { + /* This must be the memory operand. 
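/* Illustrative sketch, not part of the GCC patch: the avx512*_load*_mask
   cases above are likewise flagged aligned_mem, so the aligned masked-load
   intrinsics keep the full 64-byte alignment requirement (the *_loadu_*
   forms go through different builtins and do not).  Assumes AVX-512F; the
   function name is made up.  */
#include <immintrin.h>

__attribute__ ((target ("avx512f")))
__m512
load_lower_half_zeroed (const float *p /* 64-byte aligned */)
{
  /* Loads elements 0..7 and zeroes elements 8..15.  */
  return _mm512_maskz_load_ps ((__mmask16) 0x00ff, p);
}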
*/ + op = ix86_zero_extend_to_Pmode (op); + op = gen_rtx_MEM (mode, op); + /* op at this point has just BITS_PER_UNIT MEM_ALIGN + on it. Try to improve it using get_pointer_alignment, + and if the special builtin is one that requires strict + mode alignment, also from it's GET_MODE_ALIGNMENT. + Failure to do so could lead to ix86_legitimate_combined_insn + rejecting all changes to such insns. */ + unsigned int align = get_pointer_alignment (arg); + if (aligned_mem && align < GET_MODE_ALIGNMENT (mode)) + align = GET_MODE_ALIGNMENT (mode); + if (MEM_ALIGN (op) < align) + set_mem_align (op, align); + } + else + { + /* This must be register. */ + if (VECTOR_MODE_P (mode)) + op = safe_vector_operand (op, mode); + + op = fixup_modeless_constant (op, mode); + + if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) + op = copy_to_mode_reg (mode, op); + else + { + op = copy_to_reg (op); + op = lowpart_subreg (mode, op, GET_MODE (op)); + } + } + } + + args[i].op = op; + args[i].mode = mode; + } + + switch (nargs) + { + case 0: + pat = GEN_FCN (icode) (target); + break; + case 1: + pat = GEN_FCN (icode) (target, args[0].op); + break; + case 2: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op); + break; + case 3: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op); + break; + default: + gcc_unreachable (); + } + + if (! pat) + return 0; + emit_insn (pat); + return klass == store ? 0 : target; +} + +/* Return the integer constant in ARG. Constrain it to be in the range + of the subparts of VEC_TYPE; issue an error if not. */ + +static int +get_element_number (tree vec_type, tree arg) +{ + unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1; + + if (!tree_fits_uhwi_p (arg) + || (elt = tree_to_uhwi (arg), elt > max)) + { + error ("selector must be an integer constant in the range 0..%wi", max); + return 0; + } + + return elt; +} + +/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around + ix86_expand_vector_init. We DO have language-level syntax for this, in + the form of (type){ init-list }. Except that since we can't place emms + instructions from inside the compiler, we can't allow the use of MMX + registers unless the user explicitly asks for it. So we do *not* define + vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead + we have builtins invoked by mmintrin.h that gives us license to emit + these sorts of instructions. */ + +static rtx +ix86_expand_vec_init_builtin (tree type, tree exp, rtx target) +{ + machine_mode tmode = TYPE_MODE (type); + machine_mode inner_mode = GET_MODE_INNER (tmode); + int i, n_elt = GET_MODE_NUNITS (tmode); + rtvec v = rtvec_alloc (n_elt); + + gcc_assert (VECTOR_MODE_P (tmode)); + gcc_assert (call_expr_nargs (exp) == n_elt); + + for (i = 0; i < n_elt; ++i) + { + rtx x = expand_normal (CALL_EXPR_ARG (exp, i)); + RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x); + } + + if (!target || !register_operand (target, tmode)) + target = gen_reg_rtx (tmode); + + ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v)); + return target; +} + +/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around + ix86_expand_vector_extract. They would be redundant (for non-MMX) if we + had a language-level syntax for referencing vector elements. 
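/* Illustrative sketch, not part of the GCC patch: the vec_init/vec_ext
   expanders above back mmintrin.h/xmmintrin.h wrappers such as _mm_set_pi16
   and _mm_extract_pi16, since (as the comment explains) the compiler will not
   synthesize MMX vec_init/vec_extract itself because it cannot insert the
   required EMMS.  The function name is made up.  */
#include <xmmintrin.h>   /* _mm_extract_pi16 needs SSE; pulls in mmintrin.h */

short
third_halfword (short a, short b, short c, short d)
{
  __m64 v = _mm_set_pi16 (d, c, b, a);   /* vec_init of a V4HI; element 0 is a */
  int e = _mm_extract_pi16 (v, 2);       /* vec_ext of element 2, i.e. c */
  _mm_empty ();                          /* leave MMX state (EMMS) */
  return (short) e;
}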
*/ + +static rtx +ix86_expand_vec_ext_builtin (tree exp, rtx target) +{ + machine_mode tmode, mode0; + tree arg0, arg1; + int elt; + rtx op0; + + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + + op0 = expand_normal (arg0); + elt = get_element_number (TREE_TYPE (arg0), arg1); + + tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); + mode0 = TYPE_MODE (TREE_TYPE (arg0)); + gcc_assert (VECTOR_MODE_P (mode0)); + + op0 = force_reg (mode0, op0); + + if (optimize || !target || !register_operand (target, tmode)) + target = gen_reg_rtx (tmode); + + ix86_expand_vector_extract (true, target, op0, elt); + + return target; +} + +/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around + ix86_expand_vector_set. They would be redundant (for non-MMX) if we had + a language-level syntax for referencing vector elements. */ + +static rtx +ix86_expand_vec_set_builtin (tree exp) +{ + machine_mode tmode, mode1; + tree arg0, arg1, arg2; + int elt; + rtx op0, op1, target; + + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + + tmode = TYPE_MODE (TREE_TYPE (arg0)); + mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); + gcc_assert (VECTOR_MODE_P (tmode)); + + op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL); + op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL); + elt = get_element_number (TREE_TYPE (arg0), arg2); + + if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode) + op1 = convert_modes (mode1, GET_MODE (op1), op1, true); + + op0 = force_reg (tmode, op0); + op1 = force_reg (mode1, op1); + + /* OP0 is the source of these builtin functions and shouldn't be + modified. Create a copy, use it and return it as target. */ + target = gen_reg_rtx (tmode); + emit_move_insn (target, op0); + ix86_expand_vector_set (true, target, op1, elt); + + return target; +} + +/* Expand an expression EXP that calls a built-in function, + with result going to TARGET if that's convenient + (and in mode MODE if that's convenient). + SUBTARGET may be used as the target for computing one of EXP's operands. + IGNORE is nonzero if the value is to be ignored. */ + +rtx +ix86_expand_builtin (tree exp, rtx target, rtx subtarget, + machine_mode mode, int ignore) +{ + size_t i; + enum insn_code icode, icode2; + tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); + tree arg0, arg1, arg2, arg3, arg4; + rtx op0, op1, op2, op3, op4, pat, pat2, insn; + machine_mode mode0, mode1, mode2, mode3, mode4; + unsigned int fcode = DECL_FUNCTION_CODE (fndecl); + + /* For CPU builtins that can be folded, fold first and expand the fold. */ + switch (fcode) + { + case IX86_BUILTIN_CPU_INIT: + { + /* Make it call __cpu_indicator_init in libgcc. */ + tree call_expr, fndecl, type; + type = build_function_type_list (integer_type_node, NULL_TREE); + fndecl = build_fn_decl ("__cpu_indicator_init", type); + call_expr = build_call_expr (fndecl, 0); + return expand_expr (call_expr, target, mode, EXPAND_NORMAL); + } + case IX86_BUILTIN_CPU_IS: + case IX86_BUILTIN_CPU_SUPPORTS: + { + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree fold_expr = fold_builtin_cpu (fndecl, &arg0); + gcc_assert (fold_expr != NULL_TREE); + return expand_expr (fold_expr, target, mode, EXPAND_NORMAL); + } + } + + HOST_WIDE_INT isa = ix86_isa_flags; + HOST_WIDE_INT isa2 = ix86_isa_flags2; + HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa; + HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2; + /* The general case is we require all the ISAs specified in bisa{,2} + to be enabled. 
+ The exceptions are: + OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A + OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 + OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4 + where for each this pair it is sufficient if either of the ISAs is + enabled, plus if it is ored with other options also those others. */ + if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) + == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) + && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0) + isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A); + if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) + == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) + && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0) + isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32); + if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) + == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) + && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0) + isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4); + if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2) + { + bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT; + if (TARGET_ABI_X32) + bisa |= OPTION_MASK_ABI_X32; + else + bisa |= OPTION_MASK_ABI_64; + char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL, + (enum fpmath_unit) 0, false, add_abi_p); + if (!opts) + error ("%qE needs unknown isa option", fndecl); + else + { + gcc_assert (opts != NULL); + error ("%qE needs isa option %s", fndecl, opts); + free (opts); + } + return expand_call (exp, target, ignore); + } + + switch (fcode) + { + case IX86_BUILTIN_MASKMOVQ: + case IX86_BUILTIN_MASKMOVDQU: + icode = (fcode == IX86_BUILTIN_MASKMOVQ + ? CODE_FOR_mmx_maskmovq + : CODE_FOR_sse2_maskmovdqu); + /* Note the arg order is different from the operand order. */ + arg1 = CALL_EXPR_ARG (exp, 0); + arg2 = CALL_EXPR_ARG (exp, 1); + arg0 = CALL_EXPR_ARG (exp, 2); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + mode0 = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[2].mode; + + op0 = ix86_zero_extend_to_Pmode (op0); + op0 = gen_rtx_MEM (mode1, op0); + + if (!insn_data[icode].operand[0].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if (!insn_data[icode].operand[1].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + if (!insn_data[icode].operand[2].predicate (op2, mode2)) + op2 = copy_to_mode_reg (mode2, op2); + pat = GEN_FCN (icode) (op0, op1, op2); + if (! 
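/* Illustrative sketch, not part of the GCC patch: the bisa/bisa2 check above
   is what rejects a builtin whose ISA is not enabled and prints the
   "needs isa option" diagnostic.  Enabling the ISA per function with the
   target attribute satisfies the check without changing the global -m flags;
   the function name is made up.  */
#include <immintrin.h>

__attribute__ ((target ("avx2")))
__m256i
add_epi32_avx2 (__m256i a, __m256i b)
{
  /* Accepted here; in a plain function compiled without -mavx2 the code
     above would report that the builtin needs an isa option.  */
  return _mm256_add_epi32 (a, b);
}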
pat) + return 0; + emit_insn (pat); + return 0; + + case IX86_BUILTIN_LDMXCSR: + op0 = expand_normal (CALL_EXPR_ARG (exp, 0)); + target = assign_386_stack_local (SImode, SLOT_TEMP); + emit_move_insn (target, op0); + emit_insn (gen_sse_ldmxcsr (target)); + return 0; + + case IX86_BUILTIN_STMXCSR: + target = assign_386_stack_local (SImode, SLOT_TEMP); + emit_insn (gen_sse_stmxcsr (target)); + return copy_to_mode_reg (SImode, target); + + case IX86_BUILTIN_CLFLUSH: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + icode = CODE_FOR_sse2_clflush; + if (!insn_data[icode].operand[0].predicate (op0, Pmode)) + op0 = ix86_zero_extend_to_Pmode (op0); + + emit_insn (gen_sse2_clflush (op0)); + return 0; + + case IX86_BUILTIN_CLWB: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + icode = CODE_FOR_clwb; + if (!insn_data[icode].operand[0].predicate (op0, Pmode)) + op0 = ix86_zero_extend_to_Pmode (op0); + + emit_insn (gen_clwb (op0)); + return 0; + + case IX86_BUILTIN_CLFLUSHOPT: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + icode = CODE_FOR_clflushopt; + if (!insn_data[icode].operand[0].predicate (op0, Pmode)) + op0 = ix86_zero_extend_to_Pmode (op0); + + emit_insn (gen_clflushopt (op0)); + return 0; + + case IX86_BUILTIN_MONITOR: + case IX86_BUILTIN_MONITORX: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + if (!REG_P (op0)) + op0 = ix86_zero_extend_to_Pmode (op0); + if (!REG_P (op1)) + op1 = copy_to_mode_reg (SImode, op1); + if (!REG_P (op2)) + op2 = copy_to_mode_reg (SImode, op2); + + emit_insn (fcode == IX86_BUILTIN_MONITOR + ? ix86_gen_monitor (op0, op1, op2) + : ix86_gen_monitorx (op0, op1, op2)); + return 0; + + case IX86_BUILTIN_MWAIT: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + if (!REG_P (op0)) + op0 = copy_to_mode_reg (SImode, op0); + if (!REG_P (op1)) + op1 = copy_to_mode_reg (SImode, op1); + emit_insn (gen_sse3_mwait (op0, op1)); + return 0; + + case IX86_BUILTIN_MWAITX: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + if (!REG_P (op0)) + op0 = copy_to_mode_reg (SImode, op0); + if (!REG_P (op1)) + op1 = copy_to_mode_reg (SImode, op1); + if (!REG_P (op2)) + op2 = copy_to_mode_reg (SImode, op2); + emit_insn (gen_mwaitx (op0, op1, op2)); + return 0; + + case IX86_BUILTIN_UMONITOR: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + + op0 = ix86_zero_extend_to_Pmode (op0); + + insn = (TARGET_64BIT + ? 
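/* Illustrative sketch, not part of the GCC patch: IX86_BUILTIN_STMXCSR and
   IX86_BUILTIN_LDMXCSR above move MXCSR through a stack slot (SLOT_TEMP); at
   the source level they are _mm_getcsr/_mm_setcsr.  Setting the flush-to-zero
   bit (MXCSR bit 15) is a typical use; the function name is made up.  */
#include <xmmintrin.h>

void
enable_flush_to_zero (void)
{
  unsigned int csr = _mm_getcsr ();   /* STMXCSR through a stack temporary */
  _mm_setcsr (csr | 0x8000u);         /* LDMXCSR with the FTZ bit set */
}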
gen_umonitor_di (op0) + : gen_umonitor_si (op0)); + + emit_insn (insn); + return 0; + + case IX86_BUILTIN_UMWAIT: + case IX86_BUILTIN_TPAUSE: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + + if (!REG_P (op0)) + op0 = copy_to_mode_reg (SImode, op0); + + op1 = force_reg (DImode, op1); + + if (TARGET_64BIT) + { + op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), + NULL, 1, OPTAB_DIRECT); + switch (fcode) + { + case IX86_BUILTIN_UMWAIT: + icode = CODE_FOR_umwait_rex64; + break; + case IX86_BUILTIN_TPAUSE: + icode = CODE_FOR_tpause_rex64; + break; + default: + gcc_unreachable (); + } + + op2 = gen_lowpart (SImode, op2); + op1 = gen_lowpart (SImode, op1); + pat = GEN_FCN (icode) (op0, op1, op2); + } + else + { + switch (fcode) + { + case IX86_BUILTIN_UMWAIT: + icode = CODE_FOR_umwait; + break; + case IX86_BUILTIN_TPAUSE: + icode = CODE_FOR_tpause; + break; + default: + gcc_unreachable (); + } + pat = GEN_FCN (icode) (op0, op1); + } + + if (!pat) + return 0; + + emit_insn (pat); + + if (target == 0 + || !register_operand (target, QImode)) + target = gen_reg_rtx (QImode); + + pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG), + const0_rtx); + emit_insn (gen_rtx_SET (target, pat)); + + return target; + + case IX86_BUILTIN_CLZERO: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + if (!REG_P (op0)) + op0 = ix86_zero_extend_to_Pmode (op0); + emit_insn (ix86_gen_clzero (op0)); + return 0; + + case IX86_BUILTIN_CLDEMOTE: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + icode = CODE_FOR_cldemote; + if (!insn_data[icode].operand[0].predicate (op0, Pmode)) + op0 = ix86_zero_extend_to_Pmode (op0); + + emit_insn (gen_cldemote (op0)); + return 0; + + case IX86_BUILTIN_VEC_INIT_V2SI: + case IX86_BUILTIN_VEC_INIT_V4HI: + case IX86_BUILTIN_VEC_INIT_V8QI: + return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target); + + case IX86_BUILTIN_VEC_EXT_V2DF: + case IX86_BUILTIN_VEC_EXT_V2DI: + case IX86_BUILTIN_VEC_EXT_V4SF: + case IX86_BUILTIN_VEC_EXT_V4SI: + case IX86_BUILTIN_VEC_EXT_V8HI: + case IX86_BUILTIN_VEC_EXT_V2SI: + case IX86_BUILTIN_VEC_EXT_V4HI: + case IX86_BUILTIN_VEC_EXT_V16QI: + return ix86_expand_vec_ext_builtin (exp, target); + + case IX86_BUILTIN_VEC_SET_V2DI: + case IX86_BUILTIN_VEC_SET_V4SF: + case IX86_BUILTIN_VEC_SET_V4SI: + case IX86_BUILTIN_VEC_SET_V8HI: + case IX86_BUILTIN_VEC_SET_V4HI: + case IX86_BUILTIN_VEC_SET_V16QI: + return ix86_expand_vec_set_builtin (exp); + + case IX86_BUILTIN_NANQ: + case IX86_BUILTIN_NANSQ: + return expand_call (exp, target, ignore); + + case IX86_BUILTIN_RDPID: + + op0 = gen_reg_rtx (word_mode); + + if (TARGET_64BIT) + { + insn = gen_rdpid_rex64 (op0); + op0 = convert_to_mode (SImode, op0, 1); + } + else + insn = gen_rdpid (op0); + + emit_insn (insn); + + if (target == 0 + || !register_operand (target, SImode)) + target = gen_reg_rtx (SImode); + + emit_move_insn (target, op0); + return target; + + case IX86_BUILTIN_RDPMC: + case IX86_BUILTIN_RDTSC: + case IX86_BUILTIN_RDTSCP: + case IX86_BUILTIN_XGETBV: + + op0 = gen_reg_rtx (DImode); + op1 = gen_reg_rtx (DImode); + + if (fcode == IX86_BUILTIN_RDPMC) + { + arg0 = CALL_EXPR_ARG (exp, 0); + op2 = expand_normal (arg0); + if (!register_operand (op2, SImode)) + op2 = copy_to_mode_reg (SImode, op2); + + insn = (TARGET_64BIT + ? 
gen_rdpmc_rex64 (op0, op1, op2) + : gen_rdpmc (op0, op2)); + emit_insn (insn); + } + else if (fcode == IX86_BUILTIN_XGETBV) + { + arg0 = CALL_EXPR_ARG (exp, 0); + op2 = expand_normal (arg0); + if (!register_operand (op2, SImode)) + op2 = copy_to_mode_reg (SImode, op2); + + insn = (TARGET_64BIT + ? gen_xgetbv_rex64 (op0, op1, op2) + : gen_xgetbv (op0, op2)); + emit_insn (insn); + } + else if (fcode == IX86_BUILTIN_RDTSC) + { + insn = (TARGET_64BIT + ? gen_rdtsc_rex64 (op0, op1) + : gen_rdtsc (op0)); + emit_insn (insn); + } + else + { + op2 = gen_reg_rtx (SImode); + + insn = (TARGET_64BIT + ? gen_rdtscp_rex64 (op0, op1, op2) + : gen_rdtscp (op0, op2)); + emit_insn (insn); + + arg0 = CALL_EXPR_ARG (exp, 0); + op4 = expand_normal (arg0); + if (!address_operand (op4, VOIDmode)) + { + op4 = convert_memory_address (Pmode, op4); + op4 = copy_addr_to_reg (op4); + } + emit_move_insn (gen_rtx_MEM (SImode, op4), op2); + } + + if (target == 0 + || !register_operand (target, DImode)) + target = gen_reg_rtx (DImode); + + if (TARGET_64BIT) + { + op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32), + op1, 1, OPTAB_DIRECT); + op0 = expand_simple_binop (DImode, IOR, op0, op1, + op0, 1, OPTAB_DIRECT); + } + + emit_move_insn (target, op0); + return target; + + case IX86_BUILTIN_MOVDIR64B: + + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + + op0 = ix86_zero_extend_to_Pmode (op0); + if (!address_operand (op1, VOIDmode)) + { + op1 = convert_memory_address (Pmode, op1); + op1 = copy_addr_to_reg (op1); + } + op1 = gen_rtx_MEM (XImode, op1); + + insn = (TARGET_64BIT + ? gen_movdir64b_di (op0, op1) + : gen_movdir64b_si (op0, op1)); + emit_insn (insn); + return 0; + + case IX86_BUILTIN_FXSAVE: + case IX86_BUILTIN_FXRSTOR: + case IX86_BUILTIN_FXSAVE64: + case IX86_BUILTIN_FXRSTOR64: + case IX86_BUILTIN_FNSTENV: + case IX86_BUILTIN_FLDENV: + mode0 = BLKmode; + switch (fcode) + { + case IX86_BUILTIN_FXSAVE: + icode = CODE_FOR_fxsave; + break; + case IX86_BUILTIN_FXRSTOR: + icode = CODE_FOR_fxrstor; + break; + case IX86_BUILTIN_FXSAVE64: + icode = CODE_FOR_fxsave64; + break; + case IX86_BUILTIN_FXRSTOR64: + icode = CODE_FOR_fxrstor64; + break; + case IX86_BUILTIN_FNSTENV: + icode = CODE_FOR_fnstenv; + break; + case IX86_BUILTIN_FLDENV: + icode = CODE_FOR_fldenv; + break; + default: + gcc_unreachable (); + } + + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + + if (!address_operand (op0, VOIDmode)) + { + op0 = convert_memory_address (Pmode, op0); + op0 = copy_addr_to_reg (op0); + } + op0 = gen_rtx_MEM (mode0, op0); + + pat = GEN_FCN (icode) (op0); + if (pat) + emit_insn (pat); + return 0; + + case IX86_BUILTIN_XSETBV: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + + if (!REG_P (op0)) + op0 = copy_to_mode_reg (SImode, op0); + + op1 = force_reg (DImode, op1); + + if (TARGET_64BIT) + { + op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), + NULL, 1, OPTAB_DIRECT); + + icode = CODE_FOR_xsetbv_rex64; + + op2 = gen_lowpart (SImode, op2); + op1 = gen_lowpart (SImode, op1); + pat = GEN_FCN (icode) (op0, op1, op2); + } + else + { + icode = CODE_FOR_xsetbv; + + pat = GEN_FCN (icode) (op0, op1); + } + if (pat) + emit_insn (pat); + return 0; + + case IX86_BUILTIN_XSAVE: + case IX86_BUILTIN_XRSTOR: + case IX86_BUILTIN_XSAVE64: + case IX86_BUILTIN_XRSTOR64: + case IX86_BUILTIN_XSAVEOPT: + case IX86_BUILTIN_XSAVEOPT64: + case 
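/* Illustrative sketch, not part of the GCC patch: the RDTSC/RDTSCP expansion
   above reads EDX:EAX and, on 64-bit targets, recombines the halves with a
   shift and IOR.  __builtin_ia32_rdtsc is the direct spelling of
   IX86_BUILTIN_RDTSC; the helper below is made up for illustration.  */
unsigned long long
cycles_elapsed (void (*fn) (void))
{
  unsigned long long start = __builtin_ia32_rdtsc ();
  fn ();
  return __builtin_ia32_rdtsc () - start;
}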
IX86_BUILTIN_XSAVES: + case IX86_BUILTIN_XRSTORS: + case IX86_BUILTIN_XSAVES64: + case IX86_BUILTIN_XRSTORS64: + case IX86_BUILTIN_XSAVEC: + case IX86_BUILTIN_XSAVEC64: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + + if (!address_operand (op0, VOIDmode)) + { + op0 = convert_memory_address (Pmode, op0); + op0 = copy_addr_to_reg (op0); + } + op0 = gen_rtx_MEM (BLKmode, op0); + + op1 = force_reg (DImode, op1); + + if (TARGET_64BIT) + { + op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), + NULL, 1, OPTAB_DIRECT); + switch (fcode) + { + case IX86_BUILTIN_XSAVE: + icode = CODE_FOR_xsave_rex64; + break; + case IX86_BUILTIN_XRSTOR: + icode = CODE_FOR_xrstor_rex64; + break; + case IX86_BUILTIN_XSAVE64: + icode = CODE_FOR_xsave64; + break; + case IX86_BUILTIN_XRSTOR64: + icode = CODE_FOR_xrstor64; + break; + case IX86_BUILTIN_XSAVEOPT: + icode = CODE_FOR_xsaveopt_rex64; + break; + case IX86_BUILTIN_XSAVEOPT64: + icode = CODE_FOR_xsaveopt64; + break; + case IX86_BUILTIN_XSAVES: + icode = CODE_FOR_xsaves_rex64; + break; + case IX86_BUILTIN_XRSTORS: + icode = CODE_FOR_xrstors_rex64; + break; + case IX86_BUILTIN_XSAVES64: + icode = CODE_FOR_xsaves64; + break; + case IX86_BUILTIN_XRSTORS64: + icode = CODE_FOR_xrstors64; + break; + case IX86_BUILTIN_XSAVEC: + icode = CODE_FOR_xsavec_rex64; + break; + case IX86_BUILTIN_XSAVEC64: + icode = CODE_FOR_xsavec64; + break; + default: + gcc_unreachable (); + } + + op2 = gen_lowpart (SImode, op2); + op1 = gen_lowpart (SImode, op1); + pat = GEN_FCN (icode) (op0, op1, op2); + } + else + { + switch (fcode) + { + case IX86_BUILTIN_XSAVE: + icode = CODE_FOR_xsave; + break; + case IX86_BUILTIN_XRSTOR: + icode = CODE_FOR_xrstor; + break; + case IX86_BUILTIN_XSAVEOPT: + icode = CODE_FOR_xsaveopt; + break; + case IX86_BUILTIN_XSAVES: + icode = CODE_FOR_xsaves; + break; + case IX86_BUILTIN_XRSTORS: + icode = CODE_FOR_xrstors; + break; + case IX86_BUILTIN_XSAVEC: + icode = CODE_FOR_xsavec; + break; + default: + gcc_unreachable (); + } + pat = GEN_FCN (icode) (op0, op1); + } + + if (pat) + emit_insn (pat); + return 0; + + case IX86_BUILTIN_LLWPCB: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + icode = CODE_FOR_lwp_llwpcb; + if (!insn_data[icode].operand[0].predicate (op0, Pmode)) + op0 = ix86_zero_extend_to_Pmode (op0); + emit_insn (gen_lwp_llwpcb (op0)); + return 0; + + case IX86_BUILTIN_SLWPCB: + icode = CODE_FOR_lwp_slwpcb; + if (!target + || !insn_data[icode].operand[0].predicate (target, Pmode)) + target = gen_reg_rtx (Pmode); + emit_insn (gen_lwp_slwpcb (target)); + return target; + + case IX86_BUILTIN_BEXTRI32: + case IX86_BUILTIN_BEXTRI64: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + icode = (fcode == IX86_BUILTIN_BEXTRI32 + ? 
CODE_FOR_tbm_bextri_si + : CODE_FOR_tbm_bextri_di); + if (!CONST_INT_P (op1)) + { + error ("last argument must be an immediate"); + return const0_rtx; + } + else + { + unsigned char length = (INTVAL (op1) >> 8) & 0xFF; + unsigned char lsb_index = INTVAL (op1) & 0xFF; + op1 = GEN_INT (length); + op2 = GEN_INT (lsb_index); + + mode1 = insn_data[icode].operand[1].mode; + if (!insn_data[icode].operand[1].predicate (op0, mode1)) + op0 = copy_to_mode_reg (mode1, op0); + + mode0 = insn_data[icode].operand[0].mode; + if (target == 0 + || !register_operand (target, mode0)) + target = gen_reg_rtx (mode0); + + pat = GEN_FCN (icode) (target, op0, op1, op2); + if (pat) + emit_insn (pat); + return target; + } + + case IX86_BUILTIN_RDRAND16_STEP: + icode = CODE_FOR_rdrandhi_1; + mode0 = HImode; + goto rdrand_step; + + case IX86_BUILTIN_RDRAND32_STEP: + icode = CODE_FOR_rdrandsi_1; + mode0 = SImode; + goto rdrand_step; + + case IX86_BUILTIN_RDRAND64_STEP: + icode = CODE_FOR_rdranddi_1; + mode0 = DImode; + +rdrand_step: + arg0 = CALL_EXPR_ARG (exp, 0); + op1 = expand_normal (arg0); + if (!address_operand (op1, VOIDmode)) + { + op1 = convert_memory_address (Pmode, op1); + op1 = copy_addr_to_reg (op1); + } + + op0 = gen_reg_rtx (mode0); + emit_insn (GEN_FCN (icode) (op0)); + + emit_move_insn (gen_rtx_MEM (mode0, op1), op0); + + op1 = gen_reg_rtx (SImode); + emit_move_insn (op1, CONST1_RTX (SImode)); + + /* Emit SImode conditional move. */ + if (mode0 == HImode) + { + if (TARGET_ZERO_EXTEND_WITH_AND + && optimize_function_for_speed_p (cfun)) + { + op2 = force_reg (SImode, const0_rtx); + + emit_insn (gen_movstricthi + (gen_lowpart (HImode, op2), op0)); + } + else + { + op2 = gen_reg_rtx (SImode); + + emit_insn (gen_zero_extendhisi2 (op2, op0)); + } + } + else if (mode0 == SImode) + op2 = op0; + else + op2 = gen_rtx_SUBREG (SImode, op0, 0); + + if (target == 0 + || !register_operand (target, SImode)) + target = gen_reg_rtx (SImode); + + pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG), + const0_rtx); + emit_insn (gen_rtx_SET (target, + gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1))); + return target; + + case IX86_BUILTIN_RDSEED16_STEP: + icode = CODE_FOR_rdseedhi_1; + mode0 = HImode; + goto rdseed_step; + + case IX86_BUILTIN_RDSEED32_STEP: + icode = CODE_FOR_rdseedsi_1; + mode0 = SImode; + goto rdseed_step; + + case IX86_BUILTIN_RDSEED64_STEP: + icode = CODE_FOR_rdseeddi_1; + mode0 = DImode; + +rdseed_step: + arg0 = CALL_EXPR_ARG (exp, 0); + op1 = expand_normal (arg0); + if (!address_operand (op1, VOIDmode)) + { + op1 = convert_memory_address (Pmode, op1); + op1 = copy_addr_to_reg (op1); + } + + op0 = gen_reg_rtx (mode0); + emit_insn (GEN_FCN (icode) (op0)); + + emit_move_insn (gen_rtx_MEM (mode0, op1), op0); + + op2 = gen_reg_rtx (QImode); + + pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG), + const0_rtx); + emit_insn (gen_rtx_SET (op2, pat)); + + if (target == 0 + || !register_operand (target, SImode)) + target = gen_reg_rtx (SImode); + + emit_insn (gen_zero_extendqisi2 (target, op2)); + return target; + + case IX86_BUILTIN_SBB32: + icode = CODE_FOR_subborrowsi; + icode2 = CODE_FOR_subborrowsi_0; + mode0 = SImode; + mode1 = DImode; + mode2 = CCmode; + goto handlecarry; + + case IX86_BUILTIN_SBB64: + icode = CODE_FOR_subborrowdi; + icode2 = CODE_FOR_subborrowdi_0; + mode0 = DImode; + mode1 = TImode; + mode2 = CCmode; + goto handlecarry; + + case IX86_BUILTIN_ADDCARRYX32: + icode = CODE_FOR_addcarrysi; + icode2 = CODE_FOR_addcarrysi_0; + mode0 = SImode; + mode1 = DImode; + mode2 = 
CCCmode; + goto handlecarry; + + case IX86_BUILTIN_ADDCARRYX64: + icode = CODE_FOR_addcarrydi; + icode2 = CODE_FOR_addcarrydi_0; + mode0 = DImode; + mode1 = TImode; + mode2 = CCCmode; + + handlecarry: + arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */ + arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */ + arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */ + arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */ + + op1 = expand_normal (arg0); + if (!integer_zerop (arg0)) + op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1)); + + op2 = expand_normal (arg1); + if (!register_operand (op2, mode0)) + op2 = copy_to_mode_reg (mode0, op2); + + op3 = expand_normal (arg2); + if (!register_operand (op3, mode0)) + op3 = copy_to_mode_reg (mode0, op3); + + op4 = expand_normal (arg3); + if (!address_operand (op4, VOIDmode)) + { + op4 = convert_memory_address (Pmode, op4); + op4 = copy_addr_to_reg (op4); + } + + op0 = gen_reg_rtx (mode0); + if (integer_zerop (arg0)) + { + /* If arg0 is 0, optimize right away into add or sub + instruction that sets CCCmode flags. */ + op1 = gen_rtx_REG (mode2, FLAGS_REG); + emit_insn (GEN_FCN (icode2) (op0, op2, op3)); + } + else + { + /* Generate CF from input operand. */ + emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx)); + + /* Generate instruction that consumes CF. */ + op1 = gen_rtx_REG (CCCmode, FLAGS_REG); + pat = gen_rtx_LTU (mode1, op1, const0_rtx); + pat2 = gen_rtx_LTU (mode0, op1, const0_rtx); + emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2)); + } + + /* Return current CF value. */ + if (target == 0) + target = gen_reg_rtx (QImode); + + pat = gen_rtx_LTU (QImode, op1, const0_rtx); + emit_insn (gen_rtx_SET (target, pat)); + + /* Store the result. */ + emit_move_insn (gen_rtx_MEM (mode0, op4), op0); + + return target; + + case IX86_BUILTIN_READ_FLAGS: + emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG))); + + if (optimize + || target == NULL_RTX + || !nonimmediate_operand (target, word_mode) + || GET_MODE (target) != word_mode) + target = gen_reg_rtx (word_mode); + + emit_insn (gen_pop (target)); + return target; + + case IX86_BUILTIN_WRITE_FLAGS: + + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + if (!general_no_elim_operand (op0, word_mode)) + op0 = copy_to_mode_reg (word_mode, op0); + + emit_insn (gen_push (op0)); + emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG))); + return 0; + + case IX86_BUILTIN_KTESTC8: + icode = CODE_FOR_ktestqi; + mode3 = CCCmode; + goto kortest; + + case IX86_BUILTIN_KTESTZ8: + icode = CODE_FOR_ktestqi; + mode3 = CCZmode; + goto kortest; + + case IX86_BUILTIN_KTESTC16: + icode = CODE_FOR_ktesthi; + mode3 = CCCmode; + goto kortest; + + case IX86_BUILTIN_KTESTZ16: + icode = CODE_FOR_ktesthi; + mode3 = CCZmode; + goto kortest; + + case IX86_BUILTIN_KTESTC32: + icode = CODE_FOR_ktestsi; + mode3 = CCCmode; + goto kortest; + + case IX86_BUILTIN_KTESTZ32: + icode = CODE_FOR_ktestsi; + mode3 = CCZmode; + goto kortest; + + case IX86_BUILTIN_KTESTC64: + icode = CODE_FOR_ktestdi; + mode3 = CCCmode; + goto kortest; + + case IX86_BUILTIN_KTESTZ64: + icode = CODE_FOR_ktestdi; + mode3 = CCZmode; + goto kortest; + + case IX86_BUILTIN_KORTESTC8: + icode = CODE_FOR_kortestqi; + mode3 = CCCmode; + goto kortest; + + case IX86_BUILTIN_KORTESTZ8: + icode = CODE_FOR_kortestqi; + mode3 = CCZmode; + goto kortest; + + case IX86_BUILTIN_KORTESTC16: + icode = CODE_FOR_kortesthi; + mode3 = CCCmode; + goto kortest; + + case IX86_BUILTIN_KORTESTZ16: + icode = CODE_FOR_kortesthi; + 
mode3 = CCZmode; + goto kortest; + + case IX86_BUILTIN_KORTESTC32: + icode = CODE_FOR_kortestsi; + mode3 = CCCmode; + goto kortest; + + case IX86_BUILTIN_KORTESTZ32: + icode = CODE_FOR_kortestsi; + mode3 = CCZmode; + goto kortest; + + case IX86_BUILTIN_KORTESTC64: + icode = CODE_FOR_kortestdi; + mode3 = CCCmode; + goto kortest; + + case IX86_BUILTIN_KORTESTZ64: + icode = CODE_FOR_kortestdi; + mode3 = CCZmode; + + kortest: + arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */ + arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */ + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + + mode0 = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + + if (GET_MODE (op0) != VOIDmode) + op0 = force_reg (GET_MODE (op0), op0); + + op0 = gen_lowpart (mode0, op0); + + if (!insn_data[icode].operand[0].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + + if (GET_MODE (op1) != VOIDmode) + op1 = force_reg (GET_MODE (op1), op1); + + op1 = gen_lowpart (mode1, op1); + + if (!insn_data[icode].operand[1].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + + target = gen_reg_rtx (QImode); + + /* Emit kortest. */ + emit_insn (GEN_FCN (icode) (op0, op1)); + /* And use setcc to return result from flags. */ + ix86_expand_setcc (target, EQ, + gen_rtx_REG (mode3, FLAGS_REG), const0_rtx); + return target; + + case IX86_BUILTIN_GATHERSIV2DF: + icode = CODE_FOR_avx2_gathersiv2df; + goto gather_gen; + case IX86_BUILTIN_GATHERSIV4DF: + icode = CODE_FOR_avx2_gathersiv4df; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV2DF: + icode = CODE_FOR_avx2_gatherdiv2df; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV4DF: + icode = CODE_FOR_avx2_gatherdiv4df; + goto gather_gen; + case IX86_BUILTIN_GATHERSIV4SF: + icode = CODE_FOR_avx2_gathersiv4sf; + goto gather_gen; + case IX86_BUILTIN_GATHERSIV8SF: + icode = CODE_FOR_avx2_gathersiv8sf; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV4SF: + icode = CODE_FOR_avx2_gatherdiv4sf; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV8SF: + icode = CODE_FOR_avx2_gatherdiv8sf; + goto gather_gen; + case IX86_BUILTIN_GATHERSIV2DI: + icode = CODE_FOR_avx2_gathersiv2di; + goto gather_gen; + case IX86_BUILTIN_GATHERSIV4DI: + icode = CODE_FOR_avx2_gathersiv4di; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV2DI: + icode = CODE_FOR_avx2_gatherdiv2di; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV4DI: + icode = CODE_FOR_avx2_gatherdiv4di; + goto gather_gen; + case IX86_BUILTIN_GATHERSIV4SI: + icode = CODE_FOR_avx2_gathersiv4si; + goto gather_gen; + case IX86_BUILTIN_GATHERSIV8SI: + icode = CODE_FOR_avx2_gathersiv8si; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV4SI: + icode = CODE_FOR_avx2_gatherdiv4si; + goto gather_gen; + case IX86_BUILTIN_GATHERDIV8SI: + icode = CODE_FOR_avx2_gatherdiv8si; + goto gather_gen; + case IX86_BUILTIN_GATHERALTSIV4DF: + icode = CODE_FOR_avx2_gathersiv4df; + goto gather_gen; + case IX86_BUILTIN_GATHERALTDIV8SF: + icode = CODE_FOR_avx2_gatherdiv8sf; + goto gather_gen; + case IX86_BUILTIN_GATHERALTSIV4DI: + icode = CODE_FOR_avx2_gathersiv4di; + goto gather_gen; + case IX86_BUILTIN_GATHERALTDIV8SI: + icode = CODE_FOR_avx2_gatherdiv8si; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV16SF: + icode = CODE_FOR_avx512f_gathersiv16sf; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV8DF: + icode = CODE_FOR_avx512f_gathersiv8df; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV16SF: + icode = CODE_FOR_avx512f_gatherdiv16sf; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV8DF: + icode = 
CODE_FOR_avx512f_gatherdiv8df; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV16SI: + icode = CODE_FOR_avx512f_gathersiv16si; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV8DI: + icode = CODE_FOR_avx512f_gathersiv8di; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV16SI: + icode = CODE_FOR_avx512f_gatherdiv16si; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV8DI: + icode = CODE_FOR_avx512f_gatherdiv8di; + goto gather_gen; + case IX86_BUILTIN_GATHER3ALTSIV8DF: + icode = CODE_FOR_avx512f_gathersiv8df; + goto gather_gen; + case IX86_BUILTIN_GATHER3ALTDIV16SF: + icode = CODE_FOR_avx512f_gatherdiv16sf; + goto gather_gen; + case IX86_BUILTIN_GATHER3ALTSIV8DI: + icode = CODE_FOR_avx512f_gathersiv8di; + goto gather_gen; + case IX86_BUILTIN_GATHER3ALTDIV16SI: + icode = CODE_FOR_avx512f_gatherdiv16si; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV2DF: + icode = CODE_FOR_avx512vl_gathersiv2df; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV4DF: + icode = CODE_FOR_avx512vl_gathersiv4df; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV2DF: + icode = CODE_FOR_avx512vl_gatherdiv2df; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV4DF: + icode = CODE_FOR_avx512vl_gatherdiv4df; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV4SF: + icode = CODE_FOR_avx512vl_gathersiv4sf; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV8SF: + icode = CODE_FOR_avx512vl_gathersiv8sf; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV4SF: + icode = CODE_FOR_avx512vl_gatherdiv4sf; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV8SF: + icode = CODE_FOR_avx512vl_gatherdiv8sf; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV2DI: + icode = CODE_FOR_avx512vl_gathersiv2di; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV4DI: + icode = CODE_FOR_avx512vl_gathersiv4di; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV2DI: + icode = CODE_FOR_avx512vl_gatherdiv2di; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV4DI: + icode = CODE_FOR_avx512vl_gatherdiv4di; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV4SI: + icode = CODE_FOR_avx512vl_gathersiv4si; + goto gather_gen; + case IX86_BUILTIN_GATHER3SIV8SI: + icode = CODE_FOR_avx512vl_gathersiv8si; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV4SI: + icode = CODE_FOR_avx512vl_gatherdiv4si; + goto gather_gen; + case IX86_BUILTIN_GATHER3DIV8SI: + icode = CODE_FOR_avx512vl_gatherdiv8si; + goto gather_gen; + case IX86_BUILTIN_GATHER3ALTSIV4DF: + icode = CODE_FOR_avx512vl_gathersiv4df; + goto gather_gen; + case IX86_BUILTIN_GATHER3ALTDIV8SF: + icode = CODE_FOR_avx512vl_gatherdiv8sf; + goto gather_gen; + case IX86_BUILTIN_GATHER3ALTSIV4DI: + icode = CODE_FOR_avx512vl_gathersiv4di; + goto gather_gen; + case IX86_BUILTIN_GATHER3ALTDIV8SI: + icode = CODE_FOR_avx512vl_gatherdiv8si; + goto gather_gen; + case IX86_BUILTIN_SCATTERSIV16SF: + icode = CODE_FOR_avx512f_scattersiv16sf; + goto scatter_gen; + case IX86_BUILTIN_SCATTERSIV8DF: + icode = CODE_FOR_avx512f_scattersiv8df; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV16SF: + icode = CODE_FOR_avx512f_scatterdiv16sf; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV8DF: + icode = CODE_FOR_avx512f_scatterdiv8df; + goto scatter_gen; + case IX86_BUILTIN_SCATTERSIV16SI: + icode = CODE_FOR_avx512f_scattersiv16si; + goto scatter_gen; + case IX86_BUILTIN_SCATTERSIV8DI: + icode = CODE_FOR_avx512f_scattersiv8di; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV16SI: + icode = CODE_FOR_avx512f_scatterdiv16si; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV8DI: + icode = CODE_FOR_avx512f_scatterdiv8di; + goto scatter_gen; + case 
IX86_BUILTIN_SCATTERSIV8SF: + icode = CODE_FOR_avx512vl_scattersiv8sf; + goto scatter_gen; + case IX86_BUILTIN_SCATTERSIV4SF: + icode = CODE_FOR_avx512vl_scattersiv4sf; + goto scatter_gen; + case IX86_BUILTIN_SCATTERSIV4DF: + icode = CODE_FOR_avx512vl_scattersiv4df; + goto scatter_gen; + case IX86_BUILTIN_SCATTERSIV2DF: + icode = CODE_FOR_avx512vl_scattersiv2df; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV8SF: + icode = CODE_FOR_avx512vl_scatterdiv8sf; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV4SF: + icode = CODE_FOR_avx512vl_scatterdiv4sf; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV4DF: + icode = CODE_FOR_avx512vl_scatterdiv4df; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV2DF: + icode = CODE_FOR_avx512vl_scatterdiv2df; + goto scatter_gen; + case IX86_BUILTIN_SCATTERSIV8SI: + icode = CODE_FOR_avx512vl_scattersiv8si; + goto scatter_gen; + case IX86_BUILTIN_SCATTERSIV4SI: + icode = CODE_FOR_avx512vl_scattersiv4si; + goto scatter_gen; + case IX86_BUILTIN_SCATTERSIV4DI: + icode = CODE_FOR_avx512vl_scattersiv4di; + goto scatter_gen; + case IX86_BUILTIN_SCATTERSIV2DI: + icode = CODE_FOR_avx512vl_scattersiv2di; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV8SI: + icode = CODE_FOR_avx512vl_scatterdiv8si; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV4SI: + icode = CODE_FOR_avx512vl_scatterdiv4si; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV4DI: + icode = CODE_FOR_avx512vl_scatterdiv4di; + goto scatter_gen; + case IX86_BUILTIN_SCATTERDIV2DI: + icode = CODE_FOR_avx512vl_scatterdiv2di; + goto scatter_gen; + case IX86_BUILTIN_GATHERPFDPD: + icode = CODE_FOR_avx512pf_gatherpfv8sidf; + goto vec_prefetch_gen; + case IX86_BUILTIN_SCATTERALTSIV8DF: + icode = CODE_FOR_avx512f_scattersiv8df; + goto scatter_gen; + case IX86_BUILTIN_SCATTERALTDIV16SF: + icode = CODE_FOR_avx512f_scatterdiv16sf; + goto scatter_gen; + case IX86_BUILTIN_SCATTERALTSIV8DI: + icode = CODE_FOR_avx512f_scattersiv8di; + goto scatter_gen; + case IX86_BUILTIN_SCATTERALTDIV16SI: + icode = CODE_FOR_avx512f_scatterdiv16si; + goto scatter_gen; + case IX86_BUILTIN_SCATTERALTSIV4DF: + icode = CODE_FOR_avx512vl_scattersiv4df; + goto scatter_gen; + case IX86_BUILTIN_SCATTERALTDIV8SF: + icode = CODE_FOR_avx512vl_scatterdiv8sf; + goto scatter_gen; + case IX86_BUILTIN_SCATTERALTSIV4DI: + icode = CODE_FOR_avx512vl_scattersiv4di; + goto scatter_gen; + case IX86_BUILTIN_SCATTERALTDIV8SI: + icode = CODE_FOR_avx512vl_scatterdiv8si; + goto scatter_gen; + case IX86_BUILTIN_SCATTERALTSIV2DF: + icode = CODE_FOR_avx512vl_scattersiv2df; + goto scatter_gen; + case IX86_BUILTIN_SCATTERALTDIV4SF: + icode = CODE_FOR_avx512vl_scatterdiv4sf; + goto scatter_gen; + case IX86_BUILTIN_SCATTERALTSIV2DI: + icode = CODE_FOR_avx512vl_scattersiv2di; + goto scatter_gen; + case IX86_BUILTIN_SCATTERALTDIV4SI: + icode = CODE_FOR_avx512vl_scatterdiv4si; + goto scatter_gen; + case IX86_BUILTIN_GATHERPFDPS: + icode = CODE_FOR_avx512pf_gatherpfv16sisf; + goto vec_prefetch_gen; + case IX86_BUILTIN_GATHERPFQPD: + icode = CODE_FOR_avx512pf_gatherpfv8didf; + goto vec_prefetch_gen; + case IX86_BUILTIN_GATHERPFQPS: + icode = CODE_FOR_avx512pf_gatherpfv8disf; + goto vec_prefetch_gen; + case IX86_BUILTIN_SCATTERPFDPD: + icode = CODE_FOR_avx512pf_scatterpfv8sidf; + goto vec_prefetch_gen; + case IX86_BUILTIN_SCATTERPFDPS: + icode = CODE_FOR_avx512pf_scatterpfv16sisf; + goto vec_prefetch_gen; + case IX86_BUILTIN_SCATTERPFQPD: + icode = CODE_FOR_avx512pf_scatterpfv8didf; + goto vec_prefetch_gen; + case IX86_BUILTIN_SCATTERPFQPS: + icode = 
CODE_FOR_avx512pf_scatterpfv8disf; + goto vec_prefetch_gen; + + gather_gen: + rtx half; + rtx (*gen) (rtx, rtx); + + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + arg3 = CALL_EXPR_ARG (exp, 3); + arg4 = CALL_EXPR_ARG (exp, 4); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + op3 = expand_normal (arg3); + op4 = expand_normal (arg4); + /* Note the arg order is different from the operand order. */ + mode0 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[3].mode; + mode3 = insn_data[icode].operand[4].mode; + mode4 = insn_data[icode].operand[5].mode; + + if (target == NULL_RTX + || GET_MODE (target) != insn_data[icode].operand[0].mode + || !insn_data[icode].operand[0].predicate (target, + GET_MODE (target))) + subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode); + else + subtarget = target; + + switch (fcode) + { + case IX86_BUILTIN_GATHER3ALTSIV8DF: + case IX86_BUILTIN_GATHER3ALTSIV8DI: + half = gen_reg_rtx (V8SImode); + if (!nonimmediate_operand (op2, V16SImode)) + op2 = copy_to_mode_reg (V16SImode, op2); + emit_insn (gen_vec_extract_lo_v16si (half, op2)); + op2 = half; + break; + case IX86_BUILTIN_GATHER3ALTSIV4DF: + case IX86_BUILTIN_GATHER3ALTSIV4DI: + case IX86_BUILTIN_GATHERALTSIV4DF: + case IX86_BUILTIN_GATHERALTSIV4DI: + half = gen_reg_rtx (V4SImode); + if (!nonimmediate_operand (op2, V8SImode)) + op2 = copy_to_mode_reg (V8SImode, op2); + emit_insn (gen_vec_extract_lo_v8si (half, op2)); + op2 = half; + break; + case IX86_BUILTIN_GATHER3ALTDIV16SF: + case IX86_BUILTIN_GATHER3ALTDIV16SI: + half = gen_reg_rtx (mode0); + if (mode0 == V8SFmode) + gen = gen_vec_extract_lo_v16sf; + else + gen = gen_vec_extract_lo_v16si; + if (!nonimmediate_operand (op0, GET_MODE (op0))) + op0 = copy_to_mode_reg (GET_MODE (op0), op0); + emit_insn (gen (half, op0)); + op0 = half; + op3 = lowpart_subreg (QImode, op3, HImode); + break; + case IX86_BUILTIN_GATHER3ALTDIV8SF: + case IX86_BUILTIN_GATHER3ALTDIV8SI: + case IX86_BUILTIN_GATHERALTDIV8SF: + case IX86_BUILTIN_GATHERALTDIV8SI: + half = gen_reg_rtx (mode0); + if (mode0 == V4SFmode) + gen = gen_vec_extract_lo_v8sf; + else + gen = gen_vec_extract_lo_v8si; + if (!nonimmediate_operand (op0, GET_MODE (op0))) + op0 = copy_to_mode_reg (GET_MODE (op0), op0); + emit_insn (gen (half, op0)); + op0 = half; + if (VECTOR_MODE_P (GET_MODE (op3))) + { + half = gen_reg_rtx (mode0); + if (!nonimmediate_operand (op3, GET_MODE (op3))) + op3 = copy_to_mode_reg (GET_MODE (op3), op3); + emit_insn (gen (half, op3)); + op3 = half; + } + break; + default: + break; + } + + /* Force memory operand only with base register here. But we + don't want to do it on memory operand for other builtin + functions. 
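   As a rough illustration (assuming the usual <immintrin.h> spellings;
   the variable names are placeholders), the gathers expanded through
   this path come from source such as
     __m512 v = _mm512_i32gather_ps (idx, base_ptr, 4);
   where only the base pointer behaves like a memory operand; the
   index, mask and destination are vector or mask registers.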
*/ + op1 = ix86_zero_extend_to_Pmode (op1); + + if (!insn_data[icode].operand[1].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if (!insn_data[icode].operand[2].predicate (op1, Pmode)) + op1 = copy_to_mode_reg (Pmode, op1); + if (!insn_data[icode].operand[3].predicate (op2, mode2)) + op2 = copy_to_mode_reg (mode2, op2); + + op3 = fixup_modeless_constant (op3, mode3); + + if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode) + { + if (!insn_data[icode].operand[4].predicate (op3, mode3)) + op3 = copy_to_mode_reg (mode3, op3); + } + else + { + op3 = copy_to_reg (op3); + op3 = lowpart_subreg (mode3, op3, GET_MODE (op3)); + } + if (!insn_data[icode].operand[5].predicate (op4, mode4)) + { + error ("the last argument must be scale 1, 2, 4, 8"); + return const0_rtx; + } + + /* Optimize. If mask is known to have all high bits set, + replace op0 with pc_rtx to signal that the instruction + overwrites the whole destination and doesn't use its + previous contents. */ + if (optimize) + { + if (TREE_CODE (arg3) == INTEGER_CST) + { + if (integer_all_onesp (arg3)) + op0 = pc_rtx; + } + else if (TREE_CODE (arg3) == VECTOR_CST) + { + unsigned int negative = 0; + for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i) + { + tree cst = VECTOR_CST_ELT (arg3, i); + if (TREE_CODE (cst) == INTEGER_CST + && tree_int_cst_sign_bit (cst)) + negative++; + else if (TREE_CODE (cst) == REAL_CST + && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst))) + negative++; + } + if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3))) + op0 = pc_rtx; + } + else if (TREE_CODE (arg3) == SSA_NAME + && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE) + { + /* Recognize also when mask is like: + __v2df src = _mm_setzero_pd (); + __v2df mask = _mm_cmpeq_pd (src, src); + or + __v8sf src = _mm256_setzero_ps (); + __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ); + as that is a cheaper way to load all ones into + a register than having to load a constant from + memory. */ + gimple *def_stmt = SSA_NAME_DEF_STMT (arg3); + if (is_gimple_call (def_stmt)) + { + tree fndecl = gimple_call_fndecl (def_stmt); + if (fndecl + && fndecl_built_in_p (fndecl, BUILT_IN_MD)) + switch ((unsigned int) DECL_FUNCTION_CODE (fndecl)) + { + case IX86_BUILTIN_CMPPD: + case IX86_BUILTIN_CMPPS: + case IX86_BUILTIN_CMPPD256: + case IX86_BUILTIN_CMPPS256: + if (!integer_zerop (gimple_call_arg (def_stmt, 2))) + break; + /* FALLTHRU */ + case IX86_BUILTIN_CMPEQPD: + case IX86_BUILTIN_CMPEQPS: + if (initializer_zerop (gimple_call_arg (def_stmt, 0)) + && initializer_zerop (gimple_call_arg (def_stmt, + 1))) + op0 = pc_rtx; + break; + default: + break; + } + } + } + } + + pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4); + if (! 
pat) + return const0_rtx; + emit_insn (pat); + + switch (fcode) + { + case IX86_BUILTIN_GATHER3DIV16SF: + if (target == NULL_RTX) + target = gen_reg_rtx (V8SFmode); + emit_insn (gen_vec_extract_lo_v16sf (target, subtarget)); + break; + case IX86_BUILTIN_GATHER3DIV16SI: + if (target == NULL_RTX) + target = gen_reg_rtx (V8SImode); + emit_insn (gen_vec_extract_lo_v16si (target, subtarget)); + break; + case IX86_BUILTIN_GATHER3DIV8SF: + case IX86_BUILTIN_GATHERDIV8SF: + if (target == NULL_RTX) + target = gen_reg_rtx (V4SFmode); + emit_insn (gen_vec_extract_lo_v8sf (target, subtarget)); + break; + case IX86_BUILTIN_GATHER3DIV8SI: + case IX86_BUILTIN_GATHERDIV8SI: + if (target == NULL_RTX) + target = gen_reg_rtx (V4SImode); + emit_insn (gen_vec_extract_lo_v8si (target, subtarget)); + break; + default: + target = subtarget; + break; + } + return target; + + scatter_gen: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + arg3 = CALL_EXPR_ARG (exp, 3); + arg4 = CALL_EXPR_ARG (exp, 4); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + op3 = expand_normal (arg3); + op4 = expand_normal (arg4); + mode1 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[2].mode; + mode3 = insn_data[icode].operand[3].mode; + mode4 = insn_data[icode].operand[4].mode; + + /* Scatter instruction stores operand op3 to memory with + indices from op2 and scale from op4 under writemask op1. + If index operand op2 has more elements then source operand + op3 one need to use only its low half. And vice versa. */ + switch (fcode) + { + case IX86_BUILTIN_SCATTERALTSIV8DF: + case IX86_BUILTIN_SCATTERALTSIV8DI: + half = gen_reg_rtx (V8SImode); + if (!nonimmediate_operand (op2, V16SImode)) + op2 = copy_to_mode_reg (V16SImode, op2); + emit_insn (gen_vec_extract_lo_v16si (half, op2)); + op2 = half; + break; + case IX86_BUILTIN_SCATTERALTDIV16SF: + case IX86_BUILTIN_SCATTERALTDIV16SI: + half = gen_reg_rtx (mode3); + if (mode3 == V8SFmode) + gen = gen_vec_extract_lo_v16sf; + else + gen = gen_vec_extract_lo_v16si; + if (!nonimmediate_operand (op3, GET_MODE (op3))) + op3 = copy_to_mode_reg (GET_MODE (op3), op3); + emit_insn (gen (half, op3)); + op3 = half; + break; + case IX86_BUILTIN_SCATTERALTSIV4DF: + case IX86_BUILTIN_SCATTERALTSIV4DI: + half = gen_reg_rtx (V4SImode); + if (!nonimmediate_operand (op2, V8SImode)) + op2 = copy_to_mode_reg (V8SImode, op2); + emit_insn (gen_vec_extract_lo_v8si (half, op2)); + op2 = half; + break; + case IX86_BUILTIN_SCATTERALTDIV8SF: + case IX86_BUILTIN_SCATTERALTDIV8SI: + half = gen_reg_rtx (mode3); + if (mode3 == V4SFmode) + gen = gen_vec_extract_lo_v8sf; + else + gen = gen_vec_extract_lo_v8si; + if (!nonimmediate_operand (op3, GET_MODE (op3))) + op3 = copy_to_mode_reg (GET_MODE (op3), op3); + emit_insn (gen (half, op3)); + op3 = half; + break; + case IX86_BUILTIN_SCATTERALTSIV2DF: + case IX86_BUILTIN_SCATTERALTSIV2DI: + if (!nonimmediate_operand (op2, V4SImode)) + op2 = copy_to_mode_reg (V4SImode, op2); + break; + case IX86_BUILTIN_SCATTERALTDIV4SF: + case IX86_BUILTIN_SCATTERALTDIV4SI: + if (!nonimmediate_operand (op3, GET_MODE (op3))) + op3 = copy_to_mode_reg (GET_MODE (op3), op3); + break; + default: + break; + } + + /* Force memory operand only with base register here. But we + don't want to do it on memory operand for other builtin + functions. 
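   For illustration (again assuming the usual <immintrin.h> spellings,
   with placeholder variable names), this is the path taken by scatters
   such as
     _mm512_i32scatter_ps (base_ptr, idx, vals, 4);
   where the base pointer is the only memory-like operand.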
*/
+      op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
+
+      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
+        op0 = copy_to_mode_reg (Pmode, op0);
+
+      op1 = fixup_modeless_constant (op1, mode1);
+
+      if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
+        {
+          if (!insn_data[icode].operand[1].predicate (op1, mode1))
+            op1 = copy_to_mode_reg (mode1, op1);
+        }
+      else
+        {
+          op1 = copy_to_reg (op1);
+          op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
+        }
+
+      if (!insn_data[icode].operand[2].predicate (op2, mode2))
+        op2 = copy_to_mode_reg (mode2, op2);
+
+      if (!insn_data[icode].operand[3].predicate (op3, mode3))
+        op3 = copy_to_mode_reg (mode3, op3);
+
+      if (!insn_data[icode].operand[4].predicate (op4, mode4))
+        {
+          error ("the last argument must be scale 1, 2, 4, 8");
+          return const0_rtx;
+        }
+
+      pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
+      if (! pat)
+        return const0_rtx;
+
+      emit_insn (pat);
+      return 0;
+
+    vec_prefetch_gen:
+      arg0 = CALL_EXPR_ARG (exp, 0);
+      arg1 = CALL_EXPR_ARG (exp, 1);
+      arg2 = CALL_EXPR_ARG (exp, 2);
+      arg3 = CALL_EXPR_ARG (exp, 3);
+      arg4 = CALL_EXPR_ARG (exp, 4);
+      op0 = expand_normal (arg0);
+      op1 = expand_normal (arg1);
+      op2 = expand_normal (arg2);
+      op3 = expand_normal (arg3);
+      op4 = expand_normal (arg4);
+      mode0 = insn_data[icode].operand[0].mode;
+      mode1 = insn_data[icode].operand[1].mode;
+      mode3 = insn_data[icode].operand[3].mode;
+      mode4 = insn_data[icode].operand[4].mode;
+
+      op0 = fixup_modeless_constant (op0, mode0);
+
+      if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
+        {
+          if (!insn_data[icode].operand[0].predicate (op0, mode0))
+            op0 = copy_to_mode_reg (mode0, op0);
+        }
+      else
+        {
+          op0 = copy_to_reg (op0);
+          op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
+        }
+
+      if (!insn_data[icode].operand[1].predicate (op1, mode1))
+        op1 = copy_to_mode_reg (mode1, op1);
+
+      /* Force memory operand only with base register here.  But we
+         don't want to do it on memory operand for other builtin
+         functions.  */
+      op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
+
+      if (!insn_data[icode].operand[2].predicate (op2, Pmode))
+        op2 = copy_to_mode_reg (Pmode, op2);
+
+      if (!insn_data[icode].operand[3].predicate (op3, mode3))
+        {
+          error ("the fourth argument must be scale 1, 2, 4, 8");
+          return const0_rtx;
+        }
+
+      if (!insn_data[icode].operand[4].predicate (op4, mode4))
+        {
+          error ("incorrect hint operand");
+          return const0_rtx;
+        }
+
+      pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
+      if (! pat)
+        return const0_rtx;
+
+      emit_insn (pat);
+
+      return 0;
+
+    case IX86_BUILTIN_XABORT:
+      icode = CODE_FOR_xabort;
+      arg0 = CALL_EXPR_ARG (exp, 0);
+      op0 = expand_normal (arg0);
+      mode0 = insn_data[icode].operand[0].mode;
+      if (!insn_data[icode].operand[0].predicate (op0, mode0))
+        {
+          error ("the argument to %<xabort%> intrinsic must "
+                 "be an 8-bit immediate");
+          return const0_rtx;
+        }
+      emit_insn (gen_xabort (op0));
+      return 0;
+
+    case IX86_BUILTIN_RSTORSSP:
+    case IX86_BUILTIN_CLRSSBSY:
+      arg0 = CALL_EXPR_ARG (exp, 0);
+      op0 = expand_normal (arg0);
+      icode = (fcode == IX86_BUILTIN_RSTORSSP
+               ?
CODE_FOR_rstorssp + : CODE_FOR_clrssbsy); + if (!address_operand (op0, VOIDmode)) + { + op1 = convert_memory_address (Pmode, op0); + op0 = copy_addr_to_reg (op1); + } + emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0))); + return 0; + + case IX86_BUILTIN_WRSSD: + case IX86_BUILTIN_WRSSQ: + case IX86_BUILTIN_WRUSSD: + case IX86_BUILTIN_WRUSSQ: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + arg1 = CALL_EXPR_ARG (exp, 1); + op1 = expand_normal (arg1); + switch (fcode) + { + case IX86_BUILTIN_WRSSD: + icode = CODE_FOR_wrsssi; + mode = SImode; + break; + case IX86_BUILTIN_WRSSQ: + icode = CODE_FOR_wrssdi; + mode = DImode; + break; + case IX86_BUILTIN_WRUSSD: + icode = CODE_FOR_wrusssi; + mode = SImode; + break; + case IX86_BUILTIN_WRUSSQ: + icode = CODE_FOR_wrussdi; + mode = DImode; + break; + } + op0 = force_reg (mode, op0); + if (!address_operand (op1, VOIDmode)) + { + op2 = convert_memory_address (Pmode, op1); + op1 = copy_addr_to_reg (op2); + } + emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1))); + return 0; + + default: + break; + } + + if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST + && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST) + { + i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST; + return ix86_expand_special_args_builtin (bdesc_special_args + i, exp, + target); + } + + if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST + && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST) + { + i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST; + rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL; + rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx); + rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx); + int masked = 1; + machine_mode mode, wide_mode, nar_mode; + + nar_mode = V4SFmode; + mode = V16SFmode; + wide_mode = V64SFmode; + fcn_mask = gen_avx5124fmaddps_4fmaddps_mask; + fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz; + + switch (fcode) + { + case IX86_BUILTIN_4FMAPS: + fcn = gen_avx5124fmaddps_4fmaddps; + masked = 0; + goto v4fma_expand; + + case IX86_BUILTIN_4DPWSSD: + nar_mode = V4SImode; + mode = V16SImode; + wide_mode = V64SImode; + fcn = gen_avx5124vnniw_vp4dpwssd; + masked = 0; + goto v4fma_expand; + + case IX86_BUILTIN_4DPWSSDS: + nar_mode = V4SImode; + mode = V16SImode; + wide_mode = V64SImode; + fcn = gen_avx5124vnniw_vp4dpwssds; + masked = 0; + goto v4fma_expand; + + case IX86_BUILTIN_4FNMAPS: + fcn = gen_avx5124fmaddps_4fnmaddps; + masked = 0; + goto v4fma_expand; + + case IX86_BUILTIN_4FNMAPS_MASK: + fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask; + fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz; + goto v4fma_expand; + + case IX86_BUILTIN_4DPWSSD_MASK: + nar_mode = V4SImode; + mode = V16SImode; + wide_mode = V64SImode; + fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask; + fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz; + goto v4fma_expand; + + case IX86_BUILTIN_4DPWSSDS_MASK: + nar_mode = V4SImode; + mode = V16SImode; + wide_mode = V64SImode; + fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask; + fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz; + goto v4fma_expand; + + case IX86_BUILTIN_4FMAPS_MASK: + { + tree args[4]; + rtx ops[4]; + rtx wide_reg; + rtx accum; + rtx addr; + rtx mem; + +v4fma_expand: + wide_reg = gen_reg_rtx (wide_mode); + for (i = 0; i < 4; i++) + { + args[i] = CALL_EXPR_ARG (exp, i); + ops[i] = expand_normal (args[i]); + + emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64), + ops[i]); + } + + accum = expand_normal (CALL_EXPR_ARG (exp, 4)); + accum = force_reg (mode, accum); + + addr = expand_normal (CALL_EXPR_ARG (exp, 5)); + addr = force_reg (Pmode, addr); + + mem = gen_rtx_MEM 
(nar_mode, addr); + + target = gen_reg_rtx (mode); + + emit_move_insn (target, accum); + + if (! masked) + emit_insn (fcn (target, accum, wide_reg, mem)); + else + { + rtx merge, mask; + merge = expand_normal (CALL_EXPR_ARG (exp, 6)); + + mask = expand_normal (CALL_EXPR_ARG (exp, 7)); + + if (CONST_INT_P (mask)) + mask = fixup_modeless_constant (mask, HImode); + + mask = force_reg (HImode, mask); + + if (GET_MODE (mask) != HImode) + mask = gen_rtx_SUBREG (HImode, mask, 0); + + /* If merge is 0 then we're about to emit z-masked variant. */ + if (const0_operand (merge, mode)) + emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask)); + /* If merge is the same as accum then emit merge-masked variant. */ + else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4)) + { + merge = force_reg (mode, merge); + emit_insn (fcn_mask (target, wide_reg, mem, merge, mask)); + } + /* Merge with something unknown might happen if we z-mask w/ -O0. */ + else + { + target = gen_reg_rtx (mode); + emit_move_insn (target, merge); + emit_insn (fcn_mask (target, wide_reg, mem, target, mask)); + } + } + return target; + } + + case IX86_BUILTIN_4FNMASS: + fcn = gen_avx5124fmaddps_4fnmaddss; + masked = 0; + goto s4fma_expand; + + case IX86_BUILTIN_4FMASS: + fcn = gen_avx5124fmaddps_4fmaddss; + masked = 0; + goto s4fma_expand; + + case IX86_BUILTIN_4FNMASS_MASK: + fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask; + fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz; + goto s4fma_expand; + + case IX86_BUILTIN_4FMASS_MASK: + { + tree args[4]; + rtx ops[4]; + rtx wide_reg; + rtx accum; + rtx addr; + rtx mem; + + fcn_mask = gen_avx5124fmaddps_4fmaddss_mask; + fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz; + +s4fma_expand: + mode = V4SFmode; + wide_reg = gen_reg_rtx (V64SFmode); + for (i = 0; i < 4; i++) + { + rtx tmp; + args[i] = CALL_EXPR_ARG (exp, i); + ops[i] = expand_normal (args[i]); + + tmp = gen_reg_rtx (SFmode); + emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0)); + + emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64), + gen_rtx_SUBREG (V16SFmode, tmp, 0)); + } + + accum = expand_normal (CALL_EXPR_ARG (exp, 4)); + accum = force_reg (V4SFmode, accum); + + addr = expand_normal (CALL_EXPR_ARG (exp, 5)); + addr = force_reg (Pmode, addr); + + mem = gen_rtx_MEM (V4SFmode, addr); + + target = gen_reg_rtx (V4SFmode); + + emit_move_insn (target, accum); + + if (! masked) + emit_insn (fcn (target, accum, wide_reg, mem)); + else + { + rtx merge, mask; + merge = expand_normal (CALL_EXPR_ARG (exp, 6)); + + mask = expand_normal (CALL_EXPR_ARG (exp, 7)); + + if (CONST_INT_P (mask)) + mask = fixup_modeless_constant (mask, QImode); + + mask = force_reg (QImode, mask); + + if (GET_MODE (mask) != QImode) + mask = gen_rtx_SUBREG (QImode, mask, 0); + + /* If merge is 0 then we're about to emit z-masked variant. */ + if (const0_operand (merge, mode)) + emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask)); + /* If merge is the same as accum then emit merge-masked + variant. */ + else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4)) + { + merge = force_reg (mode, merge); + emit_insn (fcn_mask (target, wide_reg, mem, merge, mask)); + } + /* Merge with something unknown might happen if we z-mask + w/ -O0. 
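   (At -O0 the zero merge operand of a z-masked call may not have been
   propagated as a literal zero by this point, so it is handled as an
   ordinary merge value into a fresh register.)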
*/ + else + { + target = gen_reg_rtx (mode); + emit_move_insn (target, merge); + emit_insn (fcn_mask (target, wide_reg, mem, target, mask)); + } + } + return target; + } + case IX86_BUILTIN_RDPID: + return ix86_expand_special_args_builtin (bdesc_args + i, exp, + target); + case IX86_BUILTIN_FABSQ: + case IX86_BUILTIN_COPYSIGNQ: + if (!TARGET_SSE) + /* Emit a normal call if SSE isn't available. */ + return expand_call (exp, target, ignore); + /* FALLTHRU */ + default: + return ix86_expand_args_builtin (bdesc_args + i, exp, target); + } + } + + if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST + && fcode <= IX86_BUILTIN__BDESC_COMI_LAST) + { + i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST; + return ix86_expand_sse_comi (bdesc_comi + i, exp, target); + } + + if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST + && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST) + { + i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST; + return ix86_expand_round_builtin (bdesc_round_args + i, exp, target); + } + + if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST + && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST) + { + i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST; + return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target); + } + + if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST + && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST) + { + i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST; + return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target); + } + + if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST + && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST) + { + i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST; + const struct builtin_description *d = bdesc_multi_arg + i; + return ix86_expand_multi_arg_builtin (d->icode, exp, target, + (enum ix86_builtin_func_type) + d->flag, d->comparison); + } + + if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST + && fcode <= IX86_BUILTIN__BDESC_CET_LAST) + { + i = fcode - IX86_BUILTIN__BDESC_CET_FIRST; + return ix86_expand_special_args_builtin (bdesc_cet + i, exp, + target); + } + + if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST + && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST) + { + i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST; + return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp, + target); + } + + gcc_unreachable (); +} + +/* A subroutine of ix86_expand_vector_init_duplicate. Tries to + fill target with val via vec_duplicate. */ + +static bool +ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val) +{ + bool ok; + rtx_insn *insn; + rtx dup; + + /* First attempt to recognize VAL as-is. */ + dup = gen_vec_duplicate (mode, val); + insn = emit_insn (gen_rtx_SET (target, dup)); + if (recog_memoized (insn) < 0) + { + rtx_insn *seq; + machine_mode innermode = GET_MODE_INNER (mode); + rtx reg; + + /* If that fails, force VAL into a register. */ + + start_sequence (); + reg = force_reg (innermode, val); + if (GET_MODE (reg) != innermode) + reg = gen_lowpart (innermode, reg); + SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg); + seq = get_insns (); + end_sequence (); + if (seq) + emit_insn_before (seq, insn); + + ok = recog_memoized (insn) >= 0; + gcc_assert (ok); + } + return true; +} + +/* Get a vector mode of the same size as the original but with elements + twice as wide. This is only guaranteed to apply to integral vectors. */ + +static machine_mode +get_mode_wider_vector (machine_mode o) +{ + /* ??? Rely on the ordering that genmodes.c gives to vectors. 
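   For example, starting from V16QImode this is expected to return
   V8HImode: the same vector size, half as many elements, each twice
   as wide (the asserts below check exactly that).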
*/ + machine_mode n = GET_MODE_WIDER_MODE (o).require (); + gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2); + gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n)); + return n; +} + +static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d); +static bool expand_vec_perm_1 (struct expand_vec_perm_d *d); + +/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector + with all elements equal to VAR. Return true if successful. */ + +static bool +ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, + rtx target, rtx val) +{ + bool ok; + + switch (mode) + { + case E_V2SImode: + case E_V2SFmode: + if (!mmx_ok) + return false; + /* FALLTHRU */ + + case E_V4DFmode: + case E_V4DImode: + case E_V8SFmode: + case E_V8SImode: + case E_V2DFmode: + case E_V2DImode: + case E_V4SFmode: + case E_V4SImode: + case E_V16SImode: + case E_V8DImode: + case E_V16SFmode: + case E_V8DFmode: + return ix86_vector_duplicate_value (mode, target, val); + + case E_V4HImode: + if (!mmx_ok) + return false; + if (TARGET_SSE || TARGET_3DNOW_A) + { + rtx x; + + val = gen_lowpart (SImode, val); + x = gen_rtx_TRUNCATE (HImode, val); + x = gen_rtx_VEC_DUPLICATE (mode, x); + emit_insn (gen_rtx_SET (target, x)); + return true; + } + goto widen; + + case E_V8QImode: + if (!mmx_ok) + return false; + goto widen; + + case E_V8HImode: + if (TARGET_AVX2) + return ix86_vector_duplicate_value (mode, target, val); + + if (TARGET_SSE2) + { + struct expand_vec_perm_d dperm; + rtx tmp1, tmp2; + + permute: + memset (&dperm, 0, sizeof (dperm)); + dperm.target = target; + dperm.vmode = mode; + dperm.nelt = GET_MODE_NUNITS (mode); + dperm.op0 = dperm.op1 = gen_reg_rtx (mode); + dperm.one_operand_p = true; + + /* Extend to SImode using a paradoxical SUBREG. */ + tmp1 = gen_reg_rtx (SImode); + emit_move_insn (tmp1, gen_lowpart (SImode, val)); + + /* Insert the SImode value as low element of a V4SImode vector. */ + tmp2 = gen_reg_rtx (V4SImode); + emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1)); + emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2)); + + ok = (expand_vec_perm_1 (&dperm) + || expand_vec_perm_broadcast_1 (&dperm)); + gcc_assert (ok); + return ok; + } + goto widen; + + case E_V16QImode: + if (TARGET_AVX2) + return ix86_vector_duplicate_value (mode, target, val); + + if (TARGET_SSE2) + goto permute; + goto widen; + + widen: + /* Replicate the value once into the next wider mode and recurse. */ + { + machine_mode smode, wsmode, wvmode; + rtx x; + + smode = GET_MODE_INNER (mode); + wvmode = get_mode_wider_vector (mode); + wsmode = GET_MODE_INNER (wvmode); + + val = convert_modes (wsmode, smode, val, true); + x = expand_simple_binop (wsmode, ASHIFT, val, + GEN_INT (GET_MODE_BITSIZE (smode)), + NULL_RTX, 1, OPTAB_LIB_WIDEN); + val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN); + + x = gen_reg_rtx (wvmode); + ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val); + gcc_assert (ok); + emit_move_insn (target, gen_lowpart (GET_MODE (target), x)); + return ok; + } + + case E_V16HImode: + case E_V32QImode: + if (TARGET_AVX2) + return ix86_vector_duplicate_value (mode, target, val); + else + { + machine_mode hvmode = (mode == V16HImode ? 
V8HImode : V16QImode); + rtx x = gen_reg_rtx (hvmode); + + ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val); + gcc_assert (ok); + + x = gen_rtx_VEC_CONCAT (mode, x, x); + emit_insn (gen_rtx_SET (target, x)); + } + return true; + + case E_V64QImode: + case E_V32HImode: + if (TARGET_AVX512BW) + return ix86_vector_duplicate_value (mode, target, val); + else + { + machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode); + rtx x = gen_reg_rtx (hvmode); + + ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val); + gcc_assert (ok); + + x = gen_rtx_VEC_CONCAT (mode, x, x); + emit_insn (gen_rtx_SET (target, x)); + } + return true; + + default: + return false; + } +} + +/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector + whose ONE_VAR element is VAR, and other elements are zero. Return true + if successful. */ + +static bool +ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, + rtx target, rtx var, int one_var) +{ + machine_mode vsimode; + rtx new_target; + rtx x, tmp; + bool use_vector_set = false; + rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL; + + switch (mode) + { + case E_V2DImode: + /* For SSE4.1, we normally use vector set. But if the second + element is zero and inter-unit moves are OK, we use movq + instead. */ + use_vector_set = (TARGET_64BIT && TARGET_SSE4_1 + && !(TARGET_INTER_UNIT_MOVES_TO_VEC + && one_var == 0)); + break; + case E_V16QImode: + case E_V4SImode: + case E_V4SFmode: + use_vector_set = TARGET_SSE4_1; + break; + case E_V8HImode: + use_vector_set = TARGET_SSE2; + break; + case E_V4HImode: + use_vector_set = TARGET_SSE || TARGET_3DNOW_A; + break; + case E_V32QImode: + case E_V16HImode: + use_vector_set = TARGET_AVX; + break; + case E_V8SImode: + use_vector_set = TARGET_AVX; + gen_vec_set_0 = gen_vec_setv8si_0; + break; + case E_V8SFmode: + use_vector_set = TARGET_AVX; + gen_vec_set_0 = gen_vec_setv8sf_0; + break; + case E_V4DFmode: + use_vector_set = TARGET_AVX; + gen_vec_set_0 = gen_vec_setv4df_0; + break; + case E_V4DImode: + /* Use ix86_expand_vector_set in 64bit mode only. */ + use_vector_set = TARGET_AVX && TARGET_64BIT; + gen_vec_set_0 = gen_vec_setv4di_0; + break; + case E_V16SImode: + use_vector_set = TARGET_AVX512F && one_var == 0; + gen_vec_set_0 = gen_vec_setv16si_0; + break; + case E_V16SFmode: + use_vector_set = TARGET_AVX512F && one_var == 0; + gen_vec_set_0 = gen_vec_setv16sf_0; + break; + case E_V8DFmode: + use_vector_set = TARGET_AVX512F && one_var == 0; + gen_vec_set_0 = gen_vec_setv8df_0; + break; + case E_V8DImode: + /* Use ix86_expand_vector_set in 64bit mode only. 
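   (Presumably because inserting a DImode element straight from a
   general register needs 64-bit moves; without TARGET_64BIT the more
   general initialization code is used instead.)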
*/ + use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0; + gen_vec_set_0 = gen_vec_setv8di_0; + break; + default: + break; + } + + if (use_vector_set) + { + if (gen_vec_set_0 && one_var == 0) + { + var = force_reg (GET_MODE_INNER (mode), var); + emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var)); + return true; + } + emit_insn (gen_rtx_SET (target, CONST0_RTX (mode))); + var = force_reg (GET_MODE_INNER (mode), var); + ix86_expand_vector_set (mmx_ok, target, var, one_var); + return true; + } + + switch (mode) + { + case E_V2SFmode: + case E_V2SImode: + if (!mmx_ok) + return false; + /* FALLTHRU */ + + case E_V2DFmode: + case E_V2DImode: + if (one_var != 0) + return false; + var = force_reg (GET_MODE_INNER (mode), var); + x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode))); + emit_insn (gen_rtx_SET (target, x)); + return true; + + case E_V4SFmode: + case E_V4SImode: + if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER) + new_target = gen_reg_rtx (mode); + else + new_target = target; + var = force_reg (GET_MODE_INNER (mode), var); + x = gen_rtx_VEC_DUPLICATE (mode, var); + x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx); + emit_insn (gen_rtx_SET (new_target, x)); + if (one_var != 0) + { + /* We need to shuffle the value to the correct position, so + create a new pseudo to store the intermediate result. */ + + /* With SSE2, we can use the integer shuffle insns. */ + if (mode != V4SFmode && TARGET_SSE2) + { + emit_insn (gen_sse2_pshufd_1 (new_target, new_target, + const1_rtx, + GEN_INT (one_var == 1 ? 0 : 1), + GEN_INT (one_var == 2 ? 0 : 1), + GEN_INT (one_var == 3 ? 0 : 1))); + if (target != new_target) + emit_move_insn (target, new_target); + return true; + } + + /* Otherwise convert the intermediate result to V4SFmode and + use the SSE1 shuffle instructions. */ + if (mode != V4SFmode) + { + tmp = gen_reg_rtx (V4SFmode); + emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target)); + } + else + tmp = new_target; + + emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp, + const1_rtx, + GEN_INT (one_var == 1 ? 0 : 1), + GEN_INT (one_var == 2 ? 0+4 : 1+4), + GEN_INT (one_var == 3 ? 0+4 : 1+4))); + + if (mode != V4SFmode) + emit_move_insn (target, gen_lowpart (V4SImode, tmp)); + else if (tmp != target) + emit_move_insn (target, tmp); + } + else if (target != new_target) + emit_move_insn (target, new_target); + return true; + + case E_V8HImode: + case E_V16QImode: + vsimode = V4SImode; + goto widen; + case E_V4HImode: + case E_V8QImode: + if (!mmx_ok) + return false; + vsimode = V2SImode; + goto widen; + widen: + if (one_var != 0) + return false; + + /* Zero extend the variable element to SImode and recurse. */ + var = convert_modes (SImode, GET_MODE_INNER (mode), var, true); + + x = gen_reg_rtx (vsimode); + if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x, + var, one_var)) + gcc_unreachable (); + + emit_move_insn (target, gen_lowpart (mode, x)); + return true; + + default: + return false; + } +} + +/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector + consisting of the values in VALS. It is known that all elements + except ONE_VAR are constants. Return true if successful. 
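   For example, a V4SImode initializer such as { 1, 2, x, 4 } with a
   single variable element x arrives here with ONE_VAR == 2.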
*/ + +static bool +ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode, + rtx target, rtx vals, int one_var) +{ + rtx var = XVECEXP (vals, 0, one_var); + machine_mode wmode; + rtx const_vec, x; + + const_vec = copy_rtx (vals); + XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode)); + const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0)); + + switch (mode) + { + case E_V2DFmode: + case E_V2DImode: + case E_V2SFmode: + case E_V2SImode: + /* For the two element vectors, it's just as easy to use + the general case. */ + return false; + + case E_V4DImode: + /* Use ix86_expand_vector_set in 64bit mode only. */ + if (!TARGET_64BIT) + return false; + /* FALLTHRU */ + case E_V4DFmode: + case E_V8SFmode: + case E_V8SImode: + case E_V16HImode: + case E_V32QImode: + case E_V4SFmode: + case E_V4SImode: + case E_V8HImode: + case E_V4HImode: + break; + + case E_V16QImode: + if (TARGET_SSE4_1) + break; + wmode = V8HImode; + goto widen; + case E_V8QImode: + wmode = V4HImode; + goto widen; + widen: + /* There's no way to set one QImode entry easily. Combine + the variable value with its adjacent constant value, and + promote to an HImode set. */ + x = XVECEXP (vals, 0, one_var ^ 1); + if (one_var & 1) + { + var = convert_modes (HImode, QImode, var, true); + var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8), + NULL_RTX, 1, OPTAB_LIB_WIDEN); + x = GEN_INT (INTVAL (x) & 0xff); + } + else + { + var = convert_modes (HImode, QImode, var, true); + x = gen_int_mode (UINTVAL (x) << 8, HImode); + } + if (x != const0_rtx) + var = expand_simple_binop (HImode, IOR, var, x, var, + 1, OPTAB_LIB_WIDEN); + + x = gen_reg_rtx (wmode); + emit_move_insn (x, gen_lowpart (wmode, const_vec)); + ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1); + + emit_move_insn (target, gen_lowpart (mode, x)); + return true; + + default: + return false; + } + + emit_move_insn (target, const_vec); + ix86_expand_vector_set (mmx_ok, target, var, one_var); + return true; +} + +/* A subroutine of ix86_expand_vector_init_general. Use vector + concatenate to handle the most general case: all values variable, + and none identical. 
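   The construction is a recursive halving: a V8SImode vector, for
   instance, is concatenated from two V4SImode halves, each of which is
   concatenated from two V2SImode pairs built directly from the scalar
   operands.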
*/ + +static void +ix86_expand_vector_init_concat (machine_mode mode, + rtx target, rtx *ops, int n) +{ + machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode; + rtx first[16], second[8], third[4]; + rtvec v; + int i, j; + + switch (n) + { + case 2: + switch (mode) + { + case E_V16SImode: + cmode = V8SImode; + break; + case E_V16SFmode: + cmode = V8SFmode; + break; + case E_V8DImode: + cmode = V4DImode; + break; + case E_V8DFmode: + cmode = V4DFmode; + break; + case E_V8SImode: + cmode = V4SImode; + break; + case E_V8SFmode: + cmode = V4SFmode; + break; + case E_V4DImode: + cmode = V2DImode; + break; + case E_V4DFmode: + cmode = V2DFmode; + break; + case E_V4SImode: + cmode = V2SImode; + break; + case E_V4SFmode: + cmode = V2SFmode; + break; + case E_V2DImode: + cmode = DImode; + break; + case E_V2SImode: + cmode = SImode; + break; + case E_V2DFmode: + cmode = DFmode; + break; + case E_V2SFmode: + cmode = SFmode; + break; + default: + gcc_unreachable (); + } + + if (!register_operand (ops[1], cmode)) + ops[1] = force_reg (cmode, ops[1]); + if (!register_operand (ops[0], cmode)) + ops[0] = force_reg (cmode, ops[0]); + emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0], + ops[1]))); + break; + + case 4: + switch (mode) + { + case E_V4DImode: + cmode = V2DImode; + break; + case E_V4DFmode: + cmode = V2DFmode; + break; + case E_V4SImode: + cmode = V2SImode; + break; + case E_V4SFmode: + cmode = V2SFmode; + break; + default: + gcc_unreachable (); + } + goto half; + + case 8: + switch (mode) + { + case E_V8DImode: + cmode = V2DImode; + hmode = V4DImode; + break; + case E_V8DFmode: + cmode = V2DFmode; + hmode = V4DFmode; + break; + case E_V8SImode: + cmode = V2SImode; + hmode = V4SImode; + break; + case E_V8SFmode: + cmode = V2SFmode; + hmode = V4SFmode; + break; + default: + gcc_unreachable (); + } + goto half; + + case 16: + switch (mode) + { + case E_V16SImode: + cmode = V2SImode; + hmode = V4SImode; + gmode = V8SImode; + break; + case E_V16SFmode: + cmode = V2SFmode; + hmode = V4SFmode; + gmode = V8SFmode; + break; + default: + gcc_unreachable (); + } + goto half; + +half: + /* FIXME: We process inputs backward to help RA. PR 36222. */ + i = n - 1; + j = (n >> 1) - 1; + for (; i > 0; i -= 2, j--) + { + first[j] = gen_reg_rtx (cmode); + v = gen_rtvec (2, ops[i - 1], ops[i]); + ix86_expand_vector_init (false, first[j], + gen_rtx_PARALLEL (cmode, v)); + } + + n >>= 1; + if (n > 4) + { + gcc_assert (hmode != VOIDmode); + gcc_assert (gmode != VOIDmode); + for (i = j = 0; i < n; i += 2, j++) + { + second[j] = gen_reg_rtx (hmode); + ix86_expand_vector_init_concat (hmode, second [j], + &first [i], 2); + } + n >>= 1; + for (i = j = 0; i < n; i += 2, j++) + { + third[j] = gen_reg_rtx (gmode); + ix86_expand_vector_init_concat (gmode, third[j], + &second[i], 2); + } + n >>= 1; + ix86_expand_vector_init_concat (mode, target, third, n); + } + else if (n > 2) + { + gcc_assert (hmode != VOIDmode); + for (i = j = 0; i < n; i += 2, j++) + { + second[j] = gen_reg_rtx (hmode); + ix86_expand_vector_init_concat (hmode, second [j], + &first [i], 2); + } + n >>= 1; + ix86_expand_vector_init_concat (mode, target, second, n); + } + else + ix86_expand_vector_init_concat (mode, target, first, n); + break; + + default: + gcc_unreachable (); + } +} + +/* A subroutine of ix86_expand_vector_init_general. Use vector + interleave to handle the most general case: all values variable, + and none identical. 
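   Roughly: for V8HImode, each pair of HImode elements is first packed
   into the low SImode lane of a V4SImode temporary, and the temporaries
   are then combined with successive low interleaves (first at V4SImode,
   then at V2DImode granularity).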
*/ + +static void +ix86_expand_vector_init_interleave (machine_mode mode, + rtx target, rtx *ops, int n) +{ + machine_mode first_imode, second_imode, third_imode, inner_mode; + int i, j; + rtx op0, op1; + rtx (*gen_load_even) (rtx, rtx, rtx); + rtx (*gen_interleave_first_low) (rtx, rtx, rtx); + rtx (*gen_interleave_second_low) (rtx, rtx, rtx); + + switch (mode) + { + case E_V8HImode: + gen_load_even = gen_vec_setv8hi; + gen_interleave_first_low = gen_vec_interleave_lowv4si; + gen_interleave_second_low = gen_vec_interleave_lowv2di; + inner_mode = HImode; + first_imode = V4SImode; + second_imode = V2DImode; + third_imode = VOIDmode; + break; + case E_V16QImode: + gen_load_even = gen_vec_setv16qi; + gen_interleave_first_low = gen_vec_interleave_lowv8hi; + gen_interleave_second_low = gen_vec_interleave_lowv4si; + inner_mode = QImode; + first_imode = V8HImode; + second_imode = V4SImode; + third_imode = V2DImode; + break; + default: + gcc_unreachable (); + } + + for (i = 0; i < n; i++) + { + /* Extend the odd elment to SImode using a paradoxical SUBREG. */ + op0 = gen_reg_rtx (SImode); + emit_move_insn (op0, gen_lowpart (SImode, ops [i + i])); + + /* Insert the SImode value as low element of V4SImode vector. */ + op1 = gen_reg_rtx (V4SImode); + op0 = gen_rtx_VEC_MERGE (V4SImode, + gen_rtx_VEC_DUPLICATE (V4SImode, + op0), + CONST0_RTX (V4SImode), + const1_rtx); + emit_insn (gen_rtx_SET (op1, op0)); + + /* Cast the V4SImode vector back to a vector in orignal mode. */ + op0 = gen_reg_rtx (mode); + emit_move_insn (op0, gen_lowpart (mode, op1)); + + /* Load even elements into the second position. */ + emit_insn (gen_load_even (op0, + force_reg (inner_mode, + ops [i + i + 1]), + const1_rtx)); + + /* Cast vector to FIRST_IMODE vector. */ + ops[i] = gen_reg_rtx (first_imode); + emit_move_insn (ops[i], gen_lowpart (first_imode, op0)); + } + + /* Interleave low FIRST_IMODE vectors. */ + for (i = j = 0; i < n; i += 2, j++) + { + op0 = gen_reg_rtx (first_imode); + emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1])); + + /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */ + ops[j] = gen_reg_rtx (second_imode); + emit_move_insn (ops[j], gen_lowpart (second_imode, op0)); + } + + /* Interleave low SECOND_IMODE vectors. */ + switch (second_imode) + { + case E_V4SImode: + for (i = j = 0; i < n / 2; i += 2, j++) + { + op0 = gen_reg_rtx (second_imode); + emit_insn (gen_interleave_second_low (op0, ops[i], + ops[i + 1])); + + /* Cast the SECOND_IMODE vector to the THIRD_IMODE + vector. */ + ops[j] = gen_reg_rtx (third_imode); + emit_move_insn (ops[j], gen_lowpart (third_imode, op0)); + } + second_imode = V2DImode; + gen_interleave_second_low = gen_vec_interleave_lowv2di; + /* FALLTHRU */ + + case E_V2DImode: + op0 = gen_reg_rtx (second_imode); + emit_insn (gen_interleave_second_low (op0, ops[0], + ops[1])); + + /* Cast the SECOND_IMODE vector back to a vector on original + mode. */ + emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0))); + break; + + default: + gcc_unreachable (); + } +} + +/* A subroutine of ix86_expand_vector_init. Handle the most general case: + all values variable, and none identical. 
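   This is, for instance, where an all-variable initializer such as
     __m128i v = _mm_set_epi32 (a, b, c, d);
   with four distinct variable arguments ends up (assuming the usual
   <emmintrin.h> spelling of the intrinsic).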
*/ + +static void +ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode, + rtx target, rtx vals) +{ + rtx ops[64], op0, op1, op2, op3, op4, op5; + machine_mode half_mode = VOIDmode; + machine_mode quarter_mode = VOIDmode; + int n, i; + + switch (mode) + { + case E_V2SFmode: + case E_V2SImode: + if (!mmx_ok && !TARGET_SSE) + break; + /* FALLTHRU */ + + case E_V16SImode: + case E_V16SFmode: + case E_V8DFmode: + case E_V8DImode: + case E_V8SFmode: + case E_V8SImode: + case E_V4DFmode: + case E_V4DImode: + case E_V4SFmode: + case E_V4SImode: + case E_V2DFmode: + case E_V2DImode: + n = GET_MODE_NUNITS (mode); + for (i = 0; i < n; i++) + ops[i] = XVECEXP (vals, 0, i); + ix86_expand_vector_init_concat (mode, target, ops, n); + return; + + case E_V2TImode: + for (i = 0; i < 2; i++) + ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i)); + op0 = gen_reg_rtx (V4DImode); + ix86_expand_vector_init_concat (V4DImode, op0, ops, 2); + emit_move_insn (target, gen_lowpart (GET_MODE (target), op0)); + return; + + case E_V4TImode: + for (i = 0; i < 4; i++) + ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i)); + ops[4] = gen_reg_rtx (V4DImode); + ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2); + ops[5] = gen_reg_rtx (V4DImode); + ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2); + op0 = gen_reg_rtx (V8DImode); + ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2); + emit_move_insn (target, gen_lowpart (GET_MODE (target), op0)); + return; + + case E_V32QImode: + half_mode = V16QImode; + goto half; + + case E_V16HImode: + half_mode = V8HImode; + goto half; + +half: + n = GET_MODE_NUNITS (mode); + for (i = 0; i < n; i++) + ops[i] = XVECEXP (vals, 0, i); + op0 = gen_reg_rtx (half_mode); + op1 = gen_reg_rtx (half_mode); + ix86_expand_vector_init_interleave (half_mode, op0, ops, + n >> 2); + ix86_expand_vector_init_interleave (half_mode, op1, + &ops [n >> 1], n >> 2); + emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1))); + return; + + case E_V64QImode: + quarter_mode = V16QImode; + half_mode = V32QImode; + goto quarter; + + case E_V32HImode: + quarter_mode = V8HImode; + half_mode = V16HImode; + goto quarter; + +quarter: + n = GET_MODE_NUNITS (mode); + for (i = 0; i < n; i++) + ops[i] = XVECEXP (vals, 0, i); + op0 = gen_reg_rtx (quarter_mode); + op1 = gen_reg_rtx (quarter_mode); + op2 = gen_reg_rtx (quarter_mode); + op3 = gen_reg_rtx (quarter_mode); + op4 = gen_reg_rtx (half_mode); + op5 = gen_reg_rtx (half_mode); + ix86_expand_vector_init_interleave (quarter_mode, op0, ops, + n >> 3); + ix86_expand_vector_init_interleave (quarter_mode, op1, + &ops [n >> 2], n >> 3); + ix86_expand_vector_init_interleave (quarter_mode, op2, + &ops [n >> 1], n >> 3); + ix86_expand_vector_init_interleave (quarter_mode, op3, + &ops [(n >> 1) | (n >> 2)], n >> 3); + emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1))); + emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3))); + emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5))); + return; + + case E_V16QImode: + if (!TARGET_SSE4_1) + break; + /* FALLTHRU */ + + case E_V8HImode: + if (!TARGET_SSE2) + break; + + /* Don't use ix86_expand_vector_init_interleave if we can't + move from GPR to SSE register directly. 
*/ + if (!TARGET_INTER_UNIT_MOVES_TO_VEC) + break; + + n = GET_MODE_NUNITS (mode); + for (i = 0; i < n; i++) + ops[i] = XVECEXP (vals, 0, i); + ix86_expand_vector_init_interleave (mode, target, ops, n >> 1); + return; + + case E_V4HImode: + case E_V8QImode: + break; + + default: + gcc_unreachable (); + } + + { + int i, j, n_elts, n_words, n_elt_per_word; + machine_mode inner_mode; + rtx words[4], shift; + + inner_mode = GET_MODE_INNER (mode); + n_elts = GET_MODE_NUNITS (mode); + n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD; + n_elt_per_word = n_elts / n_words; + shift = GEN_INT (GET_MODE_BITSIZE (inner_mode)); + + for (i = 0; i < n_words; ++i) + { + rtx word = NULL_RTX; + + for (j = 0; j < n_elt_per_word; ++j) + { + rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1); + elt = convert_modes (word_mode, inner_mode, elt, true); + + if (j == 0) + word = elt; + else + { + word = expand_simple_binop (word_mode, ASHIFT, word, shift, + word, 1, OPTAB_LIB_WIDEN); + word = expand_simple_binop (word_mode, IOR, word, elt, + word, 1, OPTAB_LIB_WIDEN); + } + } + + words[i] = word; + } + + if (n_words == 1) + emit_move_insn (target, gen_lowpart (mode, words[0])); + else if (n_words == 2) + { + rtx tmp = gen_reg_rtx (mode); + emit_clobber (tmp); + emit_move_insn (gen_lowpart (word_mode, tmp), words[0]); + emit_move_insn (gen_highpart (word_mode, tmp), words[1]); + emit_move_insn (target, tmp); + } + else if (n_words == 4) + { + rtx tmp = gen_reg_rtx (V4SImode); + gcc_assert (word_mode == SImode); + vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words)); + ix86_expand_vector_init_general (false, V4SImode, tmp, vals); + emit_move_insn (target, gen_lowpart (mode, tmp)); + } + else + gcc_unreachable (); + } +} + +/* Initialize vector TARGET via VALS. Suppress the use of MMX + instructions unless MMX_OK is true. */ + +void +ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals) +{ + machine_mode mode = GET_MODE (target); + machine_mode inner_mode = GET_MODE_INNER (mode); + int n_elts = GET_MODE_NUNITS (mode); + int n_var = 0, one_var = -1; + bool all_same = true, all_const_zero = true; + int i; + rtx x; + + /* Handle first initialization from vector elts. */ + if (n_elts != XVECLEN (vals, 0)) + { + rtx subtarget = target; + x = XVECEXP (vals, 0, 0); + gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode); + if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts) + { + rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) }; + if (inner_mode == QImode || inner_mode == HImode) + { + unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode); + mode = mode_for_vector (SImode, n_bits / 4).require (); + inner_mode = mode_for_vector (SImode, n_bits / 8).require (); + ops[0] = gen_lowpart (inner_mode, ops[0]); + ops[1] = gen_lowpart (inner_mode, ops[1]); + subtarget = gen_reg_rtx (mode); + } + ix86_expand_vector_init_concat (mode, subtarget, ops, 2); + if (subtarget != target) + emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget)); + return; + } + gcc_unreachable (); + } + + for (i = 0; i < n_elts; ++i) + { + x = XVECEXP (vals, 0, i); + if (!(CONST_SCALAR_INT_P (x) + || CONST_DOUBLE_P (x) + || CONST_FIXED_P (x))) + n_var++, one_var = i; + else if (x != CONST0_RTX (inner_mode)) + all_const_zero = false; + if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0))) + all_same = false; + } + + /* Constants are best loaded from the constant pool. 
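The general fallback at the end of ix86_expand_vector_init_general packs groups of narrow elements into word-sized integers with shift-and-IOR before moving them into the vector register. A standalone sketch for four 16-bit lanes going into one 64-bit word, with lane 0 ending up in the low bits as on a little-endian target:

#include <stdint.h>

static uint64_t
pack_word (const uint16_t elt[4])
{
  uint64_t word = 0;

  /* Walk from the highest lane down, shifting earlier lanes up,
     matching the (i + 1) * n_elt_per_word - j - 1 indexing above.  */
  for (int j = 0; j < 4; j++)
    word = (word << 16) | elt[3 - j];
  return word;
}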
*/ + if (n_var == 0) + { + emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0))); + return; + } + + /* If all values are identical, broadcast the value. */ + if (all_same + && ix86_expand_vector_init_duplicate (mmx_ok, mode, target, + XVECEXP (vals, 0, 0))) + return; + + /* Values where only one field is non-constant are best loaded from + the pool and overwritten via move later. */ + if (n_var == 1) + { + if (all_const_zero + && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target, + XVECEXP (vals, 0, one_var), + one_var)) + return; + + if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var)) + return; + } + + ix86_expand_vector_init_general (mmx_ok, mode, target, vals); +} + +void +ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) +{ + machine_mode mode = GET_MODE (target); + machine_mode inner_mode = GET_MODE_INNER (mode); + machine_mode half_mode; + bool use_vec_merge = false; + rtx tmp; + static rtx (*gen_extract[6][2]) (rtx, rtx) + = { + { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi }, + { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi }, + { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si }, + { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di }, + { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf }, + { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df } + }; + static rtx (*gen_insert[6][2]) (rtx, rtx, rtx) + = { + { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi }, + { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi }, + { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si }, + { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di }, + { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf }, + { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df } + }; + int i, j, n; + machine_mode mmode = VOIDmode; + rtx (*gen_blendm) (rtx, rtx, rtx, rtx); + + switch (mode) + { + case E_V2SFmode: + case E_V2SImode: + if (mmx_ok) + { + tmp = gen_reg_rtx (GET_MODE_INNER (mode)); + ix86_expand_vector_extract (true, tmp, target, 1 - elt); + if (elt == 0) + tmp = gen_rtx_VEC_CONCAT (mode, val, tmp); + else + tmp = gen_rtx_VEC_CONCAT (mode, tmp, val); + emit_insn (gen_rtx_SET (target, tmp)); + return; + } + break; + + case E_V2DImode: + use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT; + if (use_vec_merge) + break; + + tmp = gen_reg_rtx (GET_MODE_INNER (mode)); + ix86_expand_vector_extract (false, tmp, target, 1 - elt); + if (elt == 0) + tmp = gen_rtx_VEC_CONCAT (mode, val, tmp); + else + tmp = gen_rtx_VEC_CONCAT (mode, tmp, val); + emit_insn (gen_rtx_SET (target, tmp)); + return; + + case E_V2DFmode: + { + rtx op0, op1; + + /* For the two element vectors, we implement a VEC_CONCAT with + the extraction of the other element. 
*/ + + tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt))); + tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp); + + if (elt == 0) + op0 = val, op1 = tmp; + else + op0 = tmp, op1 = val; + + tmp = gen_rtx_VEC_CONCAT (mode, op0, op1); + emit_insn (gen_rtx_SET (target, tmp)); + } + return; + + case E_V4SFmode: + use_vec_merge = TARGET_SSE4_1; + if (use_vec_merge) + break; + + switch (elt) + { + case 0: + use_vec_merge = true; + break; + + case 1: + /* tmp = target = A B C D */ + tmp = copy_to_reg (target); + /* target = A A B B */ + emit_insn (gen_vec_interleave_lowv4sf (target, target, target)); + /* target = X A B B */ + ix86_expand_vector_set (false, target, val, 0); + /* target = A X C D */ + emit_insn (gen_sse_shufps_v4sf (target, target, tmp, + const1_rtx, const0_rtx, + GEN_INT (2+4), GEN_INT (3+4))); + return; + + case 2: + /* tmp = target = A B C D */ + tmp = copy_to_reg (target); + /* tmp = X B C D */ + ix86_expand_vector_set (false, tmp, val, 0); + /* target = A B X D */ + emit_insn (gen_sse_shufps_v4sf (target, target, tmp, + const0_rtx, const1_rtx, + GEN_INT (0+4), GEN_INT (3+4))); + return; + + case 3: + /* tmp = target = A B C D */ + tmp = copy_to_reg (target); + /* tmp = X B C D */ + ix86_expand_vector_set (false, tmp, val, 0); + /* target = A B X D */ + emit_insn (gen_sse_shufps_v4sf (target, target, tmp, + const0_rtx, const1_rtx, + GEN_INT (2+4), GEN_INT (0+4))); + return; + + default: + gcc_unreachable (); + } + break; + + case E_V4SImode: + use_vec_merge = TARGET_SSE4_1; + if (use_vec_merge) + break; + + /* Element 0 handled by vec_merge below. */ + if (elt == 0) + { + use_vec_merge = true; + break; + } + + if (TARGET_SSE2) + { + /* With SSE2, use integer shuffles to swap element 0 and ELT, + store into element 0, then shuffle them back. */ + + rtx order[4]; + + order[0] = GEN_INT (elt); + order[1] = const1_rtx; + order[2] = const2_rtx; + order[3] = GEN_INT (3); + order[elt] = const0_rtx; + + emit_insn (gen_sse2_pshufd_1 (target, target, order[0], + order[1], order[2], order[3])); + + ix86_expand_vector_set (false, target, val, 0); + + emit_insn (gen_sse2_pshufd_1 (target, target, order[0], + order[1], order[2], order[3])); + } + else + { + /* For SSE1, we have to reuse the V4SF code. */ + rtx t = gen_reg_rtx (V4SFmode); + emit_move_insn (t, gen_lowpart (V4SFmode, target)); + ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt); + emit_move_insn (target, gen_lowpart (mode, t)); + } + return; + + case E_V8HImode: + use_vec_merge = TARGET_SSE2; + break; + case E_V4HImode: + use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A); + break; + + case E_V16QImode: + use_vec_merge = TARGET_SSE4_1; + break; + + case E_V8QImode: + break; + + case E_V32QImode: + half_mode = V16QImode; + j = 0; + n = 16; + goto half; + + case E_V16HImode: + half_mode = V8HImode; + j = 1; + n = 8; + goto half; + + case E_V8SImode: + half_mode = V4SImode; + j = 2; + n = 4; + goto half; + + case E_V4DImode: + half_mode = V2DImode; + j = 3; + n = 2; + goto half; + + case E_V8SFmode: + half_mode = V4SFmode; + j = 4; + n = 4; + goto half; + + case E_V4DFmode: + half_mode = V2DFmode; + j = 5; + n = 2; + goto half; + +half: + /* Compute offset. */ + i = elt / n; + elt %= n; + + gcc_assert (i <= 1); + + /* Extract the half. */ + tmp = gen_reg_rtx (half_mode); + emit_insn (gen_extract[j][i] (tmp, target)); + + /* Put val in tmp at elt. */ + ix86_expand_vector_set (false, tmp, val, elt); + + /* Put it back. 
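For V4SImode on plain SSE2, ix86_expand_vector_set above swaps lane 0 with lane ELT using pshufd, stores the new value into lane 0, and then applies the same shuffle again; since the shuffle is a transposition it is its own inverse. The same idea on a scalar array, as an illustration:

static void
set_lane4 (int v[4], int val, int elt)
{
  int t;

  t = v[0], v[0] = v[elt], v[elt] = t;   /* first pshufd: swap lanes 0 and ELT */
  v[0] = val;                            /* insert at lane 0 */
  t = v[0], v[0] = v[elt], v[elt] = t;   /* second pshufd: swap back */
}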
*/ + emit_insn (gen_insert[j][i] (target, target, tmp)); + return; + + case E_V8DFmode: + if (TARGET_AVX512F) + { + mmode = QImode; + gen_blendm = gen_avx512f_blendmv8df; + } + break; + + case E_V8DImode: + if (TARGET_AVX512F) + { + mmode = QImode; + gen_blendm = gen_avx512f_blendmv8di; + } + break; + + case E_V16SFmode: + if (TARGET_AVX512F) + { + mmode = HImode; + gen_blendm = gen_avx512f_blendmv16sf; + } + break; + + case E_V16SImode: + if (TARGET_AVX512F) + { + mmode = HImode; + gen_blendm = gen_avx512f_blendmv16si; + } + break; + + case E_V32HImode: + if (TARGET_AVX512BW) + { + mmode = SImode; + gen_blendm = gen_avx512bw_blendmv32hi; + } + else if (TARGET_AVX512F) + { + half_mode = E_V8HImode; + n = 8; + goto quarter; + } + break; + + case E_V64QImode: + if (TARGET_AVX512BW) + { + mmode = DImode; + gen_blendm = gen_avx512bw_blendmv64qi; + } + else if (TARGET_AVX512F) + { + half_mode = E_V16QImode; + n = 16; + goto quarter; + } + break; + +quarter: + /* Compute offset. */ + i = elt / n; + elt %= n; + + gcc_assert (i <= 3); + + { + /* Extract the quarter. */ + tmp = gen_reg_rtx (V4SImode); + rtx tmp2 = gen_lowpart (V16SImode, target); + rtx mask = gen_reg_rtx (QImode); + + emit_move_insn (mask, constm1_rtx); + emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i), + tmp, mask)); + + tmp2 = gen_reg_rtx (half_mode); + emit_move_insn (tmp2, gen_lowpart (half_mode, tmp)); + tmp = tmp2; + + /* Put val in tmp at elt. */ + ix86_expand_vector_set (false, tmp, val, elt); + + /* Put it back. */ + tmp2 = gen_reg_rtx (V16SImode); + rtx tmp3 = gen_lowpart (V16SImode, target); + mask = gen_reg_rtx (HImode); + emit_move_insn (mask, constm1_rtx); + tmp = gen_lowpart (V4SImode, tmp); + emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i), + tmp3, mask)); + emit_move_insn (target, gen_lowpart (mode, tmp2)); + } + return; + + default: + break; + } + + if (mmode != VOIDmode) + { + tmp = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val))); + /* The avx512*_blendm expanders have different operand order + from VEC_MERGE. In VEC_MERGE, the first input operand is used for + elements where the mask is set and second input operand otherwise, + in {sse,avx}*_*blend* the first input operand is used for elements + where the mask is clear and second input operand otherwise. 
*/ + emit_insn (gen_blendm (target, target, tmp, + force_reg (mmode, + gen_int_mode (HOST_WIDE_INT_1U << elt, + mmode)))); + } + else if (use_vec_merge) + { + tmp = gen_rtx_VEC_DUPLICATE (mode, val); + tmp = gen_rtx_VEC_MERGE (mode, tmp, target, + GEN_INT (HOST_WIDE_INT_1U << elt)); + emit_insn (gen_rtx_SET (target, tmp)); + } + else + { + rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode)); + + emit_move_insn (mem, target); + + tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode)); + emit_move_insn (tmp, val); + + emit_move_insn (target, mem); + } +} + +void +ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt) +{ + machine_mode mode = GET_MODE (vec); + machine_mode inner_mode = GET_MODE_INNER (mode); + bool use_vec_extr = false; + rtx tmp; + + switch (mode) + { + case E_V2SImode: + case E_V2SFmode: + if (!mmx_ok) + break; + /* FALLTHRU */ + + case E_V2DFmode: + case E_V2DImode: + case E_V2TImode: + case E_V4TImode: + use_vec_extr = true; + break; + + case E_V4SFmode: + use_vec_extr = TARGET_SSE4_1; + if (use_vec_extr) + break; + + switch (elt) + { + case 0: + tmp = vec; + break; + + case 1: + case 3: + tmp = gen_reg_rtx (mode); + emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec, + GEN_INT (elt), GEN_INT (elt), + GEN_INT (elt+4), GEN_INT (elt+4))); + break; + + case 2: + tmp = gen_reg_rtx (mode); + emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec)); + break; + + default: + gcc_unreachable (); + } + vec = tmp; + use_vec_extr = true; + elt = 0; + break; + + case E_V4SImode: + use_vec_extr = TARGET_SSE4_1; + if (use_vec_extr) + break; + + if (TARGET_SSE2) + { + switch (elt) + { + case 0: + tmp = vec; + break; + + case 1: + case 3: + tmp = gen_reg_rtx (mode); + emit_insn (gen_sse2_pshufd_1 (tmp, vec, + GEN_INT (elt), GEN_INT (elt), + GEN_INT (elt), GEN_INT (elt))); + break; + + case 2: + tmp = gen_reg_rtx (mode); + emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec)); + break; + + default: + gcc_unreachable (); + } + vec = tmp; + use_vec_extr = true; + elt = 0; + } + else + { + /* For SSE1, we have to reuse the V4SF code. 
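Both the AVX-512 blendm path and the VEC_MERGE path above come down to the same operation: broadcast the new value and keep it only in the lane selected by the one-bit mask 1 << elt. A scalar model, illustrative only (the operand-order difference between VEC_MERGE and the blendm expanders is exactly the one described in the comment above):

/* target = vec_merge (vec_duplicate (val), target, 1 << elt)  */
static void
merge_set8 (float target[8], float val, unsigned mask)
{
  for (int i = 0; i < 8; i++)
    if (mask & (1u << i))
      target[i] = val;   /* lane taken from the duplicated value */
}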
*/ + ix86_expand_vector_extract (false, gen_lowpart (SFmode, target), + gen_lowpart (V4SFmode, vec), elt); + return; + } + break; + + case E_V8HImode: + use_vec_extr = TARGET_SSE2; + break; + case E_V4HImode: + use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A); + break; + + case E_V16QImode: + use_vec_extr = TARGET_SSE4_1; + break; + + case E_V8SFmode: + if (TARGET_AVX) + { + tmp = gen_reg_rtx (V4SFmode); + if (elt < 4) + emit_insn (gen_vec_extract_lo_v8sf (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v8sf (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 3); + return; + } + break; + + case E_V4DFmode: + if (TARGET_AVX) + { + tmp = gen_reg_rtx (V2DFmode); + if (elt < 2) + emit_insn (gen_vec_extract_lo_v4df (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v4df (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 1); + return; + } + break; + + case E_V32QImode: + if (TARGET_AVX) + { + tmp = gen_reg_rtx (V16QImode); + if (elt < 16) + emit_insn (gen_vec_extract_lo_v32qi (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v32qi (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 15); + return; + } + break; + + case E_V16HImode: + if (TARGET_AVX) + { + tmp = gen_reg_rtx (V8HImode); + if (elt < 8) + emit_insn (gen_vec_extract_lo_v16hi (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v16hi (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 7); + return; + } + break; + + case E_V8SImode: + if (TARGET_AVX) + { + tmp = gen_reg_rtx (V4SImode); + if (elt < 4) + emit_insn (gen_vec_extract_lo_v8si (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v8si (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 3); + return; + } + break; + + case E_V4DImode: + if (TARGET_AVX) + { + tmp = gen_reg_rtx (V2DImode); + if (elt < 2) + emit_insn (gen_vec_extract_lo_v4di (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v4di (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 1); + return; + } + break; + + case E_V32HImode: + if (TARGET_AVX512BW) + { + tmp = gen_reg_rtx (V16HImode); + if (elt < 16) + emit_insn (gen_vec_extract_lo_v32hi (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v32hi (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 15); + return; + } + break; + + case E_V64QImode: + if (TARGET_AVX512BW) + { + tmp = gen_reg_rtx (V32QImode); + if (elt < 32) + emit_insn (gen_vec_extract_lo_v64qi (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v64qi (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 31); + return; + } + break; + + case E_V16SFmode: + tmp = gen_reg_rtx (V8SFmode); + if (elt < 8) + emit_insn (gen_vec_extract_lo_v16sf (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v16sf (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 7); + return; + + case E_V8DFmode: + tmp = gen_reg_rtx (V4DFmode); + if (elt < 4) + emit_insn (gen_vec_extract_lo_v8df (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v8df (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 3); + return; + + case E_V16SImode: + tmp = gen_reg_rtx (V8SImode); + if (elt < 8) + emit_insn (gen_vec_extract_lo_v16si (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v16si (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 7); + return; + + case E_V8DImode: + tmp = gen_reg_rtx (V4DImode); + if (elt < 4) + emit_insn (gen_vec_extract_lo_v8di (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v8di (tmp, vec)); + 
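Every wide-mode case in ix86_expand_vector_extract follows the same pattern: extract the half that contains the requested lane, then recurse with the lane index masked down (elt & 7, elt & 3, and so on). In scalar form:

/* Lane ELT of a 16-lane vector: narrow to the half holding it, then
   index within that half.  */
static float
extract16 (const float v[16], int elt)
{
  const float *half = elt < 8 ? v : v + 8;
  return half[elt & 7];
}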
ix86_expand_vector_extract (false, target, tmp, elt & 3); + return; + + case E_V8QImode: + /* ??? Could extract the appropriate HImode element and shift. */ + default: + break; + } + + if (use_vec_extr) + { + tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt))); + tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp); + + /* Let the rtl optimizers know about the zero extension performed. */ + if (inner_mode == QImode || inner_mode == HImode) + { + tmp = gen_rtx_ZERO_EXTEND (SImode, tmp); + target = gen_lowpart (SImode, target); + } + + emit_insn (gen_rtx_SET (target, tmp)); + } + else + { + rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode)); + + emit_move_insn (mem, vec); + + tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode)); + emit_move_insn (target, tmp); + } +} + +/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC + to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode. + The upper bits of DEST are undefined, though they shouldn't cause + exceptions (some bits from src or all zeros are ok). */ + +static void +emit_reduc_half (rtx dest, rtx src, int i) +{ + rtx tem, d = dest; + switch (GET_MODE (src)) + { + case E_V4SFmode: + if (i == 128) + tem = gen_sse_movhlps (dest, src, src); + else + tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx, + GEN_INT (1 + 4), GEN_INT (1 + 4)); + break; + case E_V2DFmode: + tem = gen_vec_interleave_highv2df (dest, src, src); + break; + case E_V16QImode: + case E_V8HImode: + case E_V4SImode: + case E_V2DImode: + d = gen_reg_rtx (V1TImode); + tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src), + GEN_INT (i / 2)); + break; + case E_V8SFmode: + if (i == 256) + tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx); + else + tem = gen_avx_shufps256 (dest, src, src, + GEN_INT (i == 128 ? 2 + (3 << 2) : 1)); + break; + case E_V4DFmode: + if (i == 256) + tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx); + else + tem = gen_avx_shufpd256 (dest, src, src, const1_rtx); + break; + case E_V32QImode: + case E_V16HImode: + case E_V8SImode: + case E_V4DImode: + if (i == 256) + { + if (GET_MODE (dest) != V4DImode) + d = gen_reg_rtx (V4DImode); + tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src), + gen_lowpart (V4DImode, src), + const1_rtx); + } + else + { + d = gen_reg_rtx (V2TImode); + tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src), + GEN_INT (i / 2)); + } + break; + case E_V64QImode: + case E_V32HImode: + case E_V16SImode: + case E_V16SFmode: + case E_V8DImode: + case E_V8DFmode: + if (i > 128) + tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest), + gen_lowpart (V16SImode, src), + gen_lowpart (V16SImode, src), + GEN_INT (0x4 + (i == 512 ? 4 : 0)), + GEN_INT (0x5 + (i == 512 ? 4 : 0)), + GEN_INT (0x6 + (i == 512 ? 4 : 0)), + GEN_INT (0x7 + (i == 512 ? 4 : 0)), + GEN_INT (0xC), GEN_INT (0xD), + GEN_INT (0xE), GEN_INT (0xF), + GEN_INT (0x10), GEN_INT (0x11), + GEN_INT (0x12), GEN_INT (0x13), + GEN_INT (0x14), GEN_INT (0x15), + GEN_INT (0x16), GEN_INT (0x17)); + else + tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest), + gen_lowpart (V16SImode, src), + GEN_INT (i == 128 ? 0x2 : 0x1), + GEN_INT (0x3), + GEN_INT (0x3), + GEN_INT (0x3), + GEN_INT (i == 128 ? 0x6 : 0x5), + GEN_INT (0x7), + GEN_INT (0x7), + GEN_INT (0x7), + GEN_INT (i == 128 ? 0xA : 0x9), + GEN_INT (0xB), + GEN_INT (0xB), + GEN_INT (0xB), + GEN_INT (i == 128 ? 
0xE : 0xD), + GEN_INT (0xF), + GEN_INT (0xF), + GEN_INT (0xF)); + break; + default: + gcc_unreachable (); + } + emit_insn (tem); + if (d != dest) + emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d)); +} + +/* Expand a vector reduction. FN is the binary pattern to reduce; + DEST is the destination; IN is the input vector. */ + +void +ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in) +{ + rtx half, dst, vec = in; + machine_mode mode = GET_MODE (in); + int i; + + /* SSE4 has a special instruction for V8HImode UMIN reduction. */ + if (TARGET_SSE4_1 + && mode == V8HImode + && fn == gen_uminv8hi3) + { + emit_insn (gen_sse4_1_phminposuw (dest, in)); + return; + } + + for (i = GET_MODE_BITSIZE (mode); + i > GET_MODE_UNIT_BITSIZE (mode); + i >>= 1) + { + half = gen_reg_rtx (mode); + emit_reduc_half (half, vec, i); + if (i == GET_MODE_UNIT_BITSIZE (mode) * 2) + dst = dest; + else + dst = gen_reg_rtx (mode); + emit_insn (fn (dst, half, vec)); + vec = dst; + } +} + +/* Output code to perform a conditional jump to LABEL, if C2 flag in + FP status register is set. */ + +void +ix86_emit_fp_unordered_jump (rtx label) +{ + rtx reg = gen_reg_rtx (HImode); + rtx_insn *insn; + rtx temp; + + emit_insn (gen_x86_fnstsw_1 (reg)); + + if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ())) + { + emit_insn (gen_x86_sahf_1 (reg)); + + temp = gen_rtx_REG (CCmode, FLAGS_REG); + temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx); + } + else + { + emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04))); + + temp = gen_rtx_REG (CCNOmode, FLAGS_REG); + temp = gen_rtx_NE (VOIDmode, temp, const0_rtx); + } + + temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp, + gen_rtx_LABEL_REF (VOIDmode, label), + pc_rtx); + insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp)); + predict_jump (REG_BR_PROB_BASE * 10 / 100); + JUMP_LABEL (insn) = label; +} + +/* Output code to perform an sinh XFmode calculation. */ + +void ix86_emit_i387_sinh (rtx op0, rtx op1) +{ + rtx e1 = gen_reg_rtx (XFmode); + rtx e2 = gen_reg_rtx (XFmode); + rtx scratch = gen_reg_rtx (HImode); + rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); + rtx half = const_double_from_real_value (dconsthalf, XFmode); + rtx cst1, tmp; + rtx_code_label *jump_label = gen_label_rtx (); + rtx_insn *insn; + + /* scratch = fxam (op1) */ + emit_insn (gen_fxamxf2_i387 (scratch, op1)); + + /* e1 = expm1 (|op1|) */ + emit_insn (gen_absxf2 (e2, op1)); + emit_insn (gen_expm1xf2 (e1, e2)); + + /* e2 = e1 / (e1 + 1.0) + e1 */ + cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); + emit_insn (gen_addxf3 (e2, e1, cst1)); + emit_insn (gen_divxf3 (e2, e1, e2)); + emit_insn (gen_addxf3 (e2, e2, e1)); + + /* flags = signbit (op1) */ + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); + + /* if (flags) then e2 = -e2 */ + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, + gen_rtx_EQ (VOIDmode, flags, const0_rtx), + gen_rtx_LABEL_REF (VOIDmode, jump_label), + pc_rtx); + insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); + predict_jump (REG_BR_PROB_BASE * 50 / 100); + JUMP_LABEL (insn) = jump_label; + + emit_insn (gen_negxf2 (e2, e2)); + + emit_label (jump_label); + LABEL_NUSES (jump_label) = 1; + + /* op0 = 0.5 * e2 */ + half = force_reg (XFmode, half); + emit_insn (gen_mulxf3 (op0, e2, half)); +} + +/* Output code to perform an cosh XFmode calculation. 
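ix86_expand_reduc above performs a horizontal reduction in log2(N) steps: emit_reduc_half brings the upper half of the vector down into the lower lanes, the binary operation is reapplied, and the problem halves each time. A plain-C model of that loop:

/* After the loop, lane 0 of BUF holds the reduction of all eight
   inputs; the upper lanes are don't-care, as in the vector version.  */
static float
reduce8 (float (*fn) (float, float), const float in[8])
{
  float buf[8];

  for (int i = 0; i < 8; i++)
    buf[i] = in[i];
  for (int half = 4; half >= 1; half >>= 1)
    for (int i = 0; i < half; i++)
      buf[i] = fn (buf[i], buf[i + half]);  /* fold upper half onto lower */
  return buf[0];
}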
*/ + +void ix86_emit_i387_cosh (rtx op0, rtx op1) +{ + rtx e1 = gen_reg_rtx (XFmode); + rtx e2 = gen_reg_rtx (XFmode); + rtx half = const_double_from_real_value (dconsthalf, XFmode); + rtx cst1; + + /* e1 = exp (op1) */ + emit_insn (gen_expxf2 (e1, op1)); + + /* e2 = e1 + 1.0 / e1 */ + cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); + emit_insn (gen_divxf3 (e2, cst1, e1)); + emit_insn (gen_addxf3 (e2, e1, e2)); + + /* op0 = 0.5 * e2 */ + half = force_reg (XFmode, half); + emit_insn (gen_mulxf3 (op0, e2, half)); +} + +/* Output code to perform an tanh XFmode calculation. */ + +void ix86_emit_i387_tanh (rtx op0, rtx op1) +{ + rtx e1 = gen_reg_rtx (XFmode); + rtx e2 = gen_reg_rtx (XFmode); + rtx scratch = gen_reg_rtx (HImode); + rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); + rtx cst2, tmp; + rtx_code_label *jump_label = gen_label_rtx (); + rtx_insn *insn; + + /* scratch = fxam (op1) */ + emit_insn (gen_fxamxf2_i387 (scratch, op1)); + + /* e1 = expm1 (-|2 * op1|) */ + emit_insn (gen_addxf3 (e2, op1, op1)); + emit_insn (gen_absxf2 (e2, e2)); + emit_insn (gen_negxf2 (e2, e2)); + emit_insn (gen_expm1xf2 (e1, e2)); + + /* e2 = e1 / (e1 + 2.0) */ + cst2 = force_reg (XFmode, CONST2_RTX (XFmode)); + emit_insn (gen_addxf3 (e2, e1, cst2)); + emit_insn (gen_divxf3 (e2, e1, e2)); + + /* flags = signbit (op1) */ + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); + + /* if (!flags) then e2 = -e2 */ + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, + gen_rtx_NE (VOIDmode, flags, const0_rtx), + gen_rtx_LABEL_REF (VOIDmode, jump_label), + pc_rtx); + insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); + predict_jump (REG_BR_PROB_BASE * 50 / 100); + JUMP_LABEL (insn) = jump_label; + + emit_insn (gen_negxf2 (e2, e2)); + + emit_label (jump_label); + LABEL_NUSES (jump_label) = 1; + + emit_move_insn (op0, e2); +} + +/* Output code to perform an asinh XFmode calculation. */ + +void ix86_emit_i387_asinh (rtx op0, rtx op1) +{ + rtx e1 = gen_reg_rtx (XFmode); + rtx e2 = gen_reg_rtx (XFmode); + rtx scratch = gen_reg_rtx (HImode); + rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); + rtx cst1, tmp; + rtx_code_label *jump_label = gen_label_rtx (); + rtx_insn *insn; + + /* e2 = sqrt (op1^2 + 1.0) + 1.0 */ + emit_insn (gen_mulxf3 (e1, op1, op1)); + cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); + emit_insn (gen_addxf3 (e2, e1, cst1)); + emit_insn (gen_sqrtxf2 (e2, e2)); + emit_insn (gen_addxf3 (e2, e2, cst1)); + + /* e1 = e1 / e2 */ + emit_insn (gen_divxf3 (e1, e1, e2)); + + /* scratch = fxam (op1) */ + emit_insn (gen_fxamxf2_i387 (scratch, op1)); + + /* e1 = e1 + |op1| */ + emit_insn (gen_absxf2 (e2, op1)); + emit_insn (gen_addxf3 (e1, e1, e2)); + + /* e2 = log1p (e1) */ + ix86_emit_i387_log1p (e2, e1); + + /* flags = signbit (op1) */ + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); + + /* if (flags) then e2 = -e2 */ + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, + gen_rtx_EQ (VOIDmode, flags, const0_rtx), + gen_rtx_LABEL_REF (VOIDmode, jump_label), + pc_rtx); + insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); + predict_jump (REG_BR_PROB_BASE * 50 / 100); + JUMP_LABEL (insn) = jump_label; + + emit_insn (gen_negxf2 (e2, e2)); + + emit_label (jump_label); + LABEL_NUSES (jump_label) = 1; + + emit_move_insn (op0, e2); +} + +/* Output code to perform an acosh XFmode calculation. 
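The asinh sequence above avoids cancellation for small arguments by rewriting sqrt(x*x + 1) - 1 as x*x / (sqrt(x*x + 1) + 1) and feeding the sum through log1p; the fxam/signbit test only restores the sign at the end. The same identity in plain C, as a sketch:

#include <math.h>

static double
asinh_model (double x)
{
  double x2 = x * x;
  double t = x2 / (sqrt (x2 + 1.0) + 1.0);     /* == sqrt (x2 + 1) - 1 */

  return copysign (log1p (t + fabs (x)), x);   /* log (|x| + sqrt (x2 + 1)) */
}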
*/ + +void ix86_emit_i387_acosh (rtx op0, rtx op1) +{ + rtx e1 = gen_reg_rtx (XFmode); + rtx e2 = gen_reg_rtx (XFmode); + rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); + + /* e2 = sqrt (op1 + 1.0) */ + emit_insn (gen_addxf3 (e2, op1, cst1)); + emit_insn (gen_sqrtxf2 (e2, e2)); + + /* e1 = sqrt (op1 - 1.0) */ + emit_insn (gen_subxf3 (e1, op1, cst1)); + emit_insn (gen_sqrtxf2 (e1, e1)); + + /* e1 = e1 * e2 */ + emit_insn (gen_mulxf3 (e1, e1, e2)); + + /* e1 = e1 + op1 */ + emit_insn (gen_addxf3 (e1, e1, op1)); + + /* op0 = log (e1) */ + emit_insn (gen_logxf2 (op0, e1)); +} + +/* Output code to perform an atanh XFmode calculation. */ + +void ix86_emit_i387_atanh (rtx op0, rtx op1) +{ + rtx e1 = gen_reg_rtx (XFmode); + rtx e2 = gen_reg_rtx (XFmode); + rtx scratch = gen_reg_rtx (HImode); + rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); + rtx half = const_double_from_real_value (dconsthalf, XFmode); + rtx cst1, tmp; + rtx_code_label *jump_label = gen_label_rtx (); + rtx_insn *insn; + + /* scratch = fxam (op1) */ + emit_insn (gen_fxamxf2_i387 (scratch, op1)); + + /* e2 = |op1| */ + emit_insn (gen_absxf2 (e2, op1)); + + /* e1 = -(e2 + e2) / (e2 + 1.0) */ + cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); + emit_insn (gen_addxf3 (e1, e2, cst1)); + emit_insn (gen_addxf3 (e2, e2, e2)); + emit_insn (gen_negxf2 (e2, e2)); + emit_insn (gen_divxf3 (e1, e2, e1)); + + /* e2 = log1p (e1) */ + ix86_emit_i387_log1p (e2, e1); + + /* flags = signbit (op1) */ + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); + + /* if (!flags) then e2 = -e2 */ + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, + gen_rtx_NE (VOIDmode, flags, const0_rtx), + gen_rtx_LABEL_REF (VOIDmode, jump_label), + pc_rtx); + insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); + predict_jump (REG_BR_PROB_BASE * 50 / 100); + JUMP_LABEL (insn) = jump_label; + + emit_insn (gen_negxf2 (e2, e2)); + + emit_label (jump_label); + LABEL_NUSES (jump_label) = 1; + + /* op0 = 0.5 * e2 */ + half = force_reg (XFmode, half); + emit_insn (gen_mulxf3 (op0, e2, half)); +} + +/* Output code to perform a log1p XFmode calculation. */ + +void ix86_emit_i387_log1p (rtx op0, rtx op1) +{ + rtx_code_label *label1 = gen_label_rtx (); + rtx_code_label *label2 = gen_label_rtx (); + + rtx tmp = gen_reg_rtx (XFmode); + rtx res = gen_reg_rtx (XFmode); + rtx cst, cstln2, cst1; + rtx_insn *insn; + + cst = const_double_from_real_value + (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode); + cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */ + + emit_insn (gen_absxf2 (tmp, op1)); + + cst = force_reg (XFmode, cst); + ix86_expand_branch (GE, tmp, cst, label1); + predict_jump (REG_BR_PROB_BASE * 10 / 100); + insn = get_last_insn (); + JUMP_LABEL (insn) = label1; + + emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2)); + emit_jump (label2); + + emit_label (label1); + LABEL_NUSES (label1) = 1; + + cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); + emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1))); + emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2)); + + emit_label (label2); + LABEL_NUSES (label2) = 1; + + emit_move_insn (op0, res); +} + +/* Emit code for round calculation. 
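ix86_emit_i387_log1p splits on the constant 0.29289... because fyl2xp1 is only specified for arguments smaller in magnitude than 1 - sqrt(2)/2; larger inputs take the ordinary fyl2x route on x + 1.0. The branch structure in plain C, using libm calls as stand-ins for the two i387 paths:

#include <math.h>

static double
log1p_model (double x)
{
  const double lim = 0.29289321881345247561810596348408353; /* 1 - sqrt(2)/2 */

  if (fabs (x) < lim)
    return log1p (x);      /* fyl2xp1 path: keeps precision near zero */
  return log (1.0 + x);    /* fyl2x path on tmp = x + 1.0 */
}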
*/ +void ix86_emit_i387_round (rtx op0, rtx op1) +{ + machine_mode inmode = GET_MODE (op1); + machine_mode outmode = GET_MODE (op0); + rtx e1 = gen_reg_rtx (XFmode); + rtx e2 = gen_reg_rtx (XFmode); + rtx scratch = gen_reg_rtx (HImode); + rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); + rtx half = const_double_from_real_value (dconsthalf, XFmode); + rtx res = gen_reg_rtx (outmode); + rtx_code_label *jump_label = gen_label_rtx (); + rtx (*floor_insn) (rtx, rtx); + rtx (*neg_insn) (rtx, rtx); + rtx_insn *insn; + rtx tmp; + + switch (inmode) + { + case E_SFmode: + case E_DFmode: + tmp = gen_reg_rtx (XFmode); + + emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1))); + op1 = tmp; + break; + case E_XFmode: + break; + default: + gcc_unreachable (); + } + + switch (outmode) + { + case E_SFmode: + floor_insn = gen_frndintxf2_floor; + neg_insn = gen_negsf2; + break; + case E_DFmode: + floor_insn = gen_frndintxf2_floor; + neg_insn = gen_negdf2; + break; + case E_XFmode: + floor_insn = gen_frndintxf2_floor; + neg_insn = gen_negxf2; + break; + case E_HImode: + floor_insn = gen_lfloorxfhi2; + neg_insn = gen_neghi2; + break; + case E_SImode: + floor_insn = gen_lfloorxfsi2; + neg_insn = gen_negsi2; + break; + case E_DImode: + floor_insn = gen_lfloorxfdi2; + neg_insn = gen_negdi2; + break; + default: + gcc_unreachable (); + } + + /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */ + + /* scratch = fxam(op1) */ + emit_insn (gen_fxamxf2_i387 (scratch, op1)); + + /* e1 = fabs(op1) */ + emit_insn (gen_absxf2 (e1, op1)); + + /* e2 = e1 + 0.5 */ + half = force_reg (XFmode, half); + emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half))); + + /* res = floor(e2) */ + switch (outmode) + { + case E_SFmode: + case E_DFmode: + { + tmp = gen_reg_rtx (XFmode); + + emit_insn (floor_insn (tmp, e2)); + emit_insn (gen_rtx_SET (res, + gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp), + UNSPEC_TRUNC_NOOP))); + } + break; + default: + emit_insn (floor_insn (res, e2)); + } + + /* flags = signbit(a) */ + emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); + + /* if (flags) then res = -res */ + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, + gen_rtx_EQ (VOIDmode, flags, const0_rtx), + gen_rtx_LABEL_REF (VOIDmode, jump_label), + pc_rtx); + insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); + predict_jump (REG_BR_PROB_BASE * 50 / 100); + JUMP_LABEL (insn) = jump_label; + + emit_insn (neg_insn (res, res)); + + emit_label (jump_label); + LABEL_NUSES (jump_label) = 1; + + emit_move_insn (op0, res); +} + +/* Output code to perform a Newton-Rhapson approximation of a single precision + floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. 
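ix86_emit_i387_round implements the identity spelled out in its comment, round-half-away-from-zero as a floor on the magnitude with the sign restored afterwards; fxam supplies the sign bit and frndintxf2_floor/lfloor supply the floor. In plain C:

#include <math.h>

static double
round_model (double a)
{
  return copysign (floor (fabs (a) + 0.5), a);  /* sgn(a) * floor (|a| + 0.5) */
}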
*/ + +void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode) +{ + rtx x0, x1, e0, e1; + + x0 = gen_reg_rtx (mode); + e0 = gen_reg_rtx (mode); + e1 = gen_reg_rtx (mode); + x1 = gen_reg_rtx (mode); + + /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */ + + b = force_reg (mode, b); + + /* x0 = rcp(b) estimate */ + if (mode == V16SFmode || mode == V8DFmode) + { + if (TARGET_AVX512ER) + { + emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), + UNSPEC_RCP28))); + /* res = a * x0 */ + emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0))); + return; + } + else + emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), + UNSPEC_RCP14))); + } + else + emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), + UNSPEC_RCP))); + + /* e0 = x0 * b */ + emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b))); + + /* e0 = x0 * e0 */ + emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0))); + + /* e1 = x0 + x0 */ + emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0))); + + /* x1 = e1 - e0 */ + emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0))); + + /* res = a * x1 */ + emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1))); +} + +/* Output code to perform a Newton-Rhapson approximation of a + single precision floating point [reciprocal] square root. */ + +void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip) +{ + rtx x0, e0, e1, e2, e3, mthree, mhalf; + REAL_VALUE_TYPE r; + int unspec; + + x0 = gen_reg_rtx (mode); + e0 = gen_reg_rtx (mode); + e1 = gen_reg_rtx (mode); + e2 = gen_reg_rtx (mode); + e3 = gen_reg_rtx (mode); + + if (TARGET_AVX512ER && mode == V16SFmode) + { + if (recip) + /* res = rsqrt28(a) estimate */ + emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), + UNSPEC_RSQRT28))); + else + { + /* x0 = rsqrt28(a) estimate */ + emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), + UNSPEC_RSQRT28))); + /* res = rcp28(x0) estimate */ + emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0), + UNSPEC_RCP28))); + } + return; + } + + real_from_integer (&r, VOIDmode, -3, SIGNED); + mthree = const_double_from_real_value (r, SFmode); + + real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL); + mhalf = const_double_from_real_value (r, SFmode); + unspec = UNSPEC_RSQRT; + + if (VECTOR_MODE_P (mode)) + { + mthree = ix86_build_const_vector (mode, true, mthree); + mhalf = ix86_build_const_vector (mode, true, mhalf); + /* There is no 512-bit rsqrt. There is however rsqrt14. */ + if (GET_MODE_SIZE (mode) == 64) + unspec = UNSPEC_RSQRT14; + } + + /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) + rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */ + + a = force_reg (mode, a); + + /* x0 = rsqrt(a) estimate */ + emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), + unspec))); + + /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */ + if (!recip) + { + rtx zero = force_reg (mode, CONST0_RTX(mode)); + rtx mask; + + /* Handle masked compare. */ + if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64) + { + mask = gen_reg_rtx (HImode); + /* Imm value 0x4 corresponds to not-equal comparison. 
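ix86_emit_swdivsf refines a hardware reciprocal estimate with one Newton-Raphson step, x1 = x0 * (2 - b * x0), written as (x0 + x0) - b * x0 * x0 so that it maps directly onto the emitted multiplies and adds. A plain-C sketch; rcp_estimate is a hypothetical stand-in for the rcpps / rcp14 estimate:

static float
swdiv_model (float a, float b, float (*rcp_estimate) (float))
{
  float x0 = rcp_estimate (b);   /* x0 = rcp (b) estimate */
  float e0 = (x0 * b) * x0;      /* e0 = b * x0 * x0 */
  float e1 = x0 + x0;            /* e1 = 2 * x0 */
  float x1 = e1 - e0;            /* x1 = x0 * (2 - b * x0) */

  return a * x1;                 /* a / b ~= a * x1 */
}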
*/ + emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4))); + emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask)); + } + else + { + mask = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a))); + emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask))); + } + } + + /* e0 = x0 * a */ + emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a))); + /* e1 = e0 * x0 */ + emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0))); + + /* e2 = e1 - 3. */ + mthree = force_reg (mode, mthree); + emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree))); + + mhalf = force_reg (mode, mhalf); + if (recip) + /* e3 = -.5 * x0 */ + emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf))); + else + /* e3 = -.5 * e0 */ + emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf))); + /* ret = e2 * e3 */ + emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3))); +} + +/* Expand fabs (OP0) and return a new rtx that holds the result. The + mask for masking out the sign-bit is stored in *SMASK, if that is + non-null. */ + +static rtx +ix86_expand_sse_fabs (rtx op0, rtx *smask) +{ + machine_mode vmode, mode = GET_MODE (op0); + rtx xa, mask; + + xa = gen_reg_rtx (mode); + if (mode == SFmode) + vmode = V4SFmode; + else if (mode == DFmode) + vmode = V2DFmode; + else + vmode = mode; + mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true); + if (!VECTOR_MODE_P (mode)) + { + /* We need to generate a scalar mode mask in this case. */ + rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); + tmp = gen_rtx_VEC_SELECT (mode, mask, tmp); + mask = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (mask, tmp)); + } + emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask))); + + if (smask) + *smask = mask; + + return xa; +} + +/* Expands a comparison of OP0 with OP1 using comparison code CODE, + swapping the operands if SWAP_OPERANDS is true. The expanded + code is a forward jump to a newly created label in case the + comparison is true. The generated label rtx is returned. */ +static rtx_code_label * +ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1, + bool swap_operands) +{ + bool unordered_compare = ix86_unordered_fp_compare (code); + rtx_code_label *label; + rtx tmp, reg; + + if (swap_operands) + std::swap (op0, op1); + + label = gen_label_rtx (); + tmp = gen_rtx_COMPARE (CCFPmode, op0, op1); + if (unordered_compare) + tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP); + reg = gen_rtx_REG (CCFPmode, FLAGS_REG); + emit_insn (gen_rtx_SET (reg, tmp)); + tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx); + tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, + gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx); + tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); + JUMP_LABEL (tmp) = label; + + return label; +} + +/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1 + using comparison code CODE. Operands are swapped for the comparison if + SWAP_OPERANDS is true. Returns a rtx for the generated mask. */ +static rtx +ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1, + bool swap_operands) +{ + rtx (*insn)(rtx, rtx, rtx, rtx); + machine_mode mode = GET_MODE (op0); + rtx mask = gen_reg_rtx (mode); + + if (swap_operands) + std::swap (op0, op1); + + insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse; + + emit_insn (insn (mask, op0, op1, + gen_rtx_fmt_ee (code, mode, op0, op1))); + return mask; +} + +/* Expand copysign from SIGN to the positive value ABS_VALUE + storing in RESULT. 
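ix86_emit_swsqrtsf uses the matching Newton-Raphson step for the reciprocal square root: with x0 close to 1/sqrt(a), rsqrt(a) is roughly -0.5 * x0 * (a*x0*x0 - 3) and sqrt(a) is roughly -0.5 * (a*x0) * (a*x0*x0 - 3); the zero mask above keeps sqrt(0.0) from turning into 0 * inf = NaN. A plain-C sketch with a hypothetical rsqrt_estimate standing in for rsqrtps / rsqrt14 (the zero filter is omitted here):

static float
swsqrt_model (float a, int recip, float (*rsqrt_estimate) (float))
{
  float x0 = rsqrt_estimate (a);
  float e0 = x0 * a;                     /* a * x0 */
  float e1 = e0 * x0;                    /* a * x0 * x0 */
  float e2 = e1 - 3.0f;                  /* e1 + mthree */
  float e3 = (recip ? x0 : e0) * -0.5f;  /* scaled by mhalf */

  return e2 * e3;
}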
If MASK is non-null, it shall be a mask to mask out + the sign-bit. */ + +static void +ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask) +{ + machine_mode mode = GET_MODE (sign); + rtx sgn = gen_reg_rtx (mode); + if (mask == NULL_RTX) + { + machine_mode vmode; + + if (mode == SFmode) + vmode = V4SFmode; + else if (mode == DFmode) + vmode = V2DFmode; + else + vmode = mode; + + mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false); + if (!VECTOR_MODE_P (mode)) + { + /* We need to generate a scalar mode mask in this case. */ + rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); + tmp = gen_rtx_VEC_SELECT (mode, mask, tmp); + mask = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (mask, tmp)); + } + } + else + mask = gen_rtx_NOT (mode, mask); + emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign))); + emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn))); +} + +/* Expand SSE sequence for computing lround from OP1 storing + into OP0. */ + +void +ix86_expand_lround (rtx op0, rtx op1) +{ + /* C code for the stuff we're doing below: + tmp = op1 + copysign (nextafter (0.5, 0.0), op1) + return (long)tmp; + */ + machine_mode mode = GET_MODE (op1); + const struct real_format *fmt; + REAL_VALUE_TYPE pred_half, half_minus_pred_half; + rtx adj; + + /* load nextafter (0.5, 0.0) */ + fmt = REAL_MODE_FORMAT (mode); + real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); + real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half); + + /* adj = copysign (0.5, op1) */ + adj = force_reg (mode, const_double_from_real_value (pred_half, mode)); + ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX); + + /* adj = op1 + adj */ + adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT); + + /* op0 = (imode)adj */ + expand_fix (op0, adj, 0); +} + +/* Expand SSE2 sequence for computing lround from OPERAND1 storing + into OPERAND0. */ + +void +ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor) +{ + /* C code for the stuff we're doing below (for do_floor): + xi = (long)op1; + xi -= (double)xi > op1 ? 1 : 0; + return xi; + */ + machine_mode fmode = GET_MODE (op1); + machine_mode imode = GET_MODE (op0); + rtx ireg, freg, tmp; + rtx_code_label *label; + + /* reg = (long)op1 */ + ireg = gen_reg_rtx (imode); + expand_fix (ireg, op1, 0); + + /* freg = (double)reg */ + freg = gen_reg_rtx (fmode); + expand_float (freg, ireg, 0); + + /* ireg = (freg > op1) ? ireg - 1 : ireg */ + label = ix86_expand_sse_compare_and_jump (UNLE, + freg, op1, !do_floor); + tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS, + ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT); + emit_move_insn (ireg, tmp); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (op0, ireg); +} + +/* Generate and return a rtx of mode MODE for 2**n where n is the number + of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */ + +static rtx +ix86_gen_TWO52 (machine_mode mode) +{ + REAL_VALUE_TYPE TWO52r; + rtx TWO52; + + real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23); + TWO52 = const_double_from_real_value (TWO52r, mode); + TWO52 = force_reg (mode, TWO52); + + return TWO52; +} + +/* Expand rint rounding OPERAND1 and storing the result in OPERAND0. 
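ix86_expand_lround adds copysign (nextafter (0.5, 0.0), x) rather than an exact 0.5 before truncating: using the value just below one half keeps inputs such as 0.49999999999999994 from being pushed up to 1.0 by the rounding of the addition itself. The same trick in plain C:

#include <math.h>

static long
lround_model (double x)
{
  double adj = copysign (nextafter (0.5, 0.0), x);

  return (long) (x + adj);   /* truncation finishes the rounding */
}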
*/ + +void +ix86_expand_rint (rtx operand0, rtx operand1) +{ + /* C code for the stuff we're doing below: + xa = fabs (operand1); + if (!isless (xa, 2**52)) + return operand1; + two52 = 2**52; + if (flag_rounding_math) + { + two52 = copysign (two52, operand1); + xa = operand1; + } + xa = xa + two52 - two52; + return copysign (xa, operand1); + */ + machine_mode mode = GET_MODE (operand0); + rtx res, xa, TWO52, two52, mask; + rtx_code_label *label; + + res = gen_reg_rtx (mode); + emit_move_insn (res, operand1); + + /* xa = abs (operand1) */ + xa = ix86_expand_sse_fabs (res, &mask); + + /* if (!isless (xa, TWO52)) goto label; */ + TWO52 = ix86_gen_TWO52 (mode); + label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + + two52 = TWO52; + if (flag_rounding_math) + { + two52 = gen_reg_rtx (mode); + ix86_sse_copysign_to_positive (two52, TWO52, res, mask); + xa = res; + } + + xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT); + xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT); + + ix86_sse_copysign_to_positive (res, xa, res, mask); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operand0, res); +} + +/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing + into OPERAND0. */ +void +ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor) +{ + /* C code for the stuff we expand below. + double xa = fabs (x), x2; + if (!isless (xa, TWO52)) + return x; + xa = xa + TWO52 - TWO52; + x2 = copysign (xa, x); + Compensate. Floor: + if (x2 > x) + x2 -= 1; + Compensate. Ceil: + if (x2 < x) + x2 -= -1; + return x2; + */ + machine_mode mode = GET_MODE (operand0); + rtx xa, TWO52, tmp, one, res, mask; + rtx_code_label *label; + + TWO52 = ix86_gen_TWO52 (mode); + + /* Temporary for holding the result, initialized to the input + operand to ease control flow. */ + res = gen_reg_rtx (mode); + emit_move_insn (res, operand1); + + /* xa = abs (operand1) */ + xa = ix86_expand_sse_fabs (res, &mask); + + /* if (!isless (xa, TWO52)) goto label; */ + label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + + /* xa = xa + TWO52 - TWO52; */ + xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); + xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT); + + /* xa = copysign (xa, operand1) */ + ix86_sse_copysign_to_positive (xa, xa, res, mask); + + /* generate 1.0 or -1.0 */ + one = force_reg (mode, + const_double_from_real_value (do_floor + ? dconst1 : dconstm1, mode)); + + /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */ + tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor); + emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); + /* We always need to subtract here to preserve signed zero. */ + tmp = expand_simple_binop (mode, MINUS, + xa, tmp, NULL_RTX, 0, OPTAB_DIRECT); + emit_move_insn (res, tmp); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operand0, res); +} + +/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing + into OPERAND0. */ +void +ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor) +{ + /* C code for the stuff we expand below. + double xa = fabs (x), x2; + if (!isless (xa, TWO52)) + return x; + x2 = (double)(long)x; + Compensate. Floor: + if (x2 > x) + x2 -= 1; + Compensate. 
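ix86_expand_rint relies on the classic 2^52 trick: for |x| < 2^52 the sum x + 2^52 has no fraction bits left, so adding and then subtracting TWO52 rounds x to an integer in the current rounding mode, while larger magnitudes (and NaNs) are already integral and skip the computation. A plain-C model of the default path (the flag_rounding_math variant copysigns TWO52 instead of working on the absolute value); it assumes the arithmetic really is carried out in double precision, as it is in the SSE sequence:

#include <math.h>

static double
rint_model (double x)
{
  const double two52 = 4503599627370496.0;  /* 2**52 */
  double xa = fabs (x);

  if (!(xa < two52))          /* also catches NaN */
    return x;
  xa = (xa + two52) - two52;  /* rounds to integer, nearest-even by default */
  return copysign (xa, x);    /* restore the sign, preserving -0.0 */
}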
Ceil: + if (x2 < x) + x2 += 1; + if (HONOR_SIGNED_ZEROS (mode)) + return copysign (x2, x); + return x2; + */ + machine_mode mode = GET_MODE (operand0); + rtx xa, xi, TWO52, tmp, one, res, mask; + rtx_code_label *label; + + TWO52 = ix86_gen_TWO52 (mode); + + /* Temporary for holding the result, initialized to the input + operand to ease control flow. */ + res = gen_reg_rtx (mode); + emit_move_insn (res, operand1); + + /* xa = abs (operand1) */ + xa = ix86_expand_sse_fabs (res, &mask); + + /* if (!isless (xa, TWO52)) goto label; */ + label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + + /* xa = (double)(long)x */ + xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); + expand_fix (xi, res, 0); + expand_float (xa, xi, 0); + + /* generate 1.0 */ + one = force_reg (mode, const_double_from_real_value (dconst1, mode)); + + /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */ + tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor); + emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); + tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS, + xa, tmp, NULL_RTX, 0, OPTAB_DIRECT); + emit_move_insn (res, tmp); + + if (HONOR_SIGNED_ZEROS (mode)) + ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operand0, res); +} + +/* Expand SSE sequence for computing round from OPERAND1 storing + into OPERAND0. Sequence that works without relying on DImode truncation + via cvttsd2siq that is only available on 64bit targets. */ +void +ix86_expand_rounddf_32 (rtx operand0, rtx operand1) +{ + /* C code for the stuff we expand below. + double xa = fabs (x), xa2, x2; + if (!isless (xa, TWO52)) + return x; + Using the absolute value and copying back sign makes + -0.0 -> -0.0 correct. + xa2 = xa + TWO52 - TWO52; + Compensate. + dxa = xa2 - xa; + if (dxa <= -0.5) + xa2 += 1; + else if (dxa > 0.5) + xa2 -= 1; + x2 = copysign (xa2, x); + return x2; + */ + machine_mode mode = GET_MODE (operand0); + rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask; + rtx_code_label *label; + + TWO52 = ix86_gen_TWO52 (mode); + + /* Temporary for holding the result, initialized to the input + operand to ease control flow. */ + res = gen_reg_rtx (mode); + emit_move_insn (res, operand1); + + /* xa = abs (operand1) */ + xa = ix86_expand_sse_fabs (res, &mask); + + /* if (!isless (xa, TWO52)) goto label; */ + label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + + /* xa2 = xa + TWO52 - TWO52; */ + xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); + xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT); + + /* dxa = xa2 - xa; */ + dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT); + + /* generate 0.5, 1.0 and -0.5 */ + half = force_reg (mode, const_double_from_real_value (dconsthalf, mode)); + one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT); + mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX, + 0, OPTAB_DIRECT); + + /* Compensate. */ + tmp = gen_reg_rtx (mode); + /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */ + tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false); + emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); + xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT); + /* xa2 = xa2 + (dxa <= -0.5 ? 
1 : 0) */ + tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false); + emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); + xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT); + + /* res = copysign (xa2, operand1) */ + ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operand0, res); +} + +/* Expand SSE sequence for computing trunc from OPERAND1 storing + into OPERAND0. */ +void +ix86_expand_trunc (rtx operand0, rtx operand1) +{ + /* C code for SSE variant we expand below. + double xa = fabs (x), x2; + if (!isless (xa, TWO52)) + return x; + x2 = (double)(long)x; + if (HONOR_SIGNED_ZEROS (mode)) + return copysign (x2, x); + return x2; + */ + machine_mode mode = GET_MODE (operand0); + rtx xa, xi, TWO52, res, mask; + rtx_code_label *label; + + TWO52 = ix86_gen_TWO52 (mode); + + /* Temporary for holding the result, initialized to the input + operand to ease control flow. */ + res = gen_reg_rtx (mode); + emit_move_insn (res, operand1); + + /* xa = abs (operand1) */ + xa = ix86_expand_sse_fabs (res, &mask); + + /* if (!isless (xa, TWO52)) goto label; */ + label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + + /* x = (double)(long)x */ + xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); + expand_fix (xi, res, 0); + expand_float (res, xi, 0); + + if (HONOR_SIGNED_ZEROS (mode)) + ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operand0, res); +} + +/* Expand SSE sequence for computing trunc from OPERAND1 storing + into OPERAND0. */ +void +ix86_expand_truncdf_32 (rtx operand0, rtx operand1) +{ + machine_mode mode = GET_MODE (operand0); + rtx xa, mask, TWO52, one, res, smask, tmp; + rtx_code_label *label; + + /* C code for SSE variant we expand below. + double xa = fabs (x), x2; + if (!isless (xa, TWO52)) + return x; + xa2 = xa + TWO52 - TWO52; + Compensate: + if (xa2 > xa) + xa2 -= 1.0; + x2 = copysign (xa2, x); + return x2; + */ + + TWO52 = ix86_gen_TWO52 (mode); + + /* Temporary for holding the result, initialized to the input + operand to ease control flow. */ + res = gen_reg_rtx (mode); + emit_move_insn (res, operand1); + + /* xa = abs (operand1) */ + xa = ix86_expand_sse_fabs (res, &smask); + + /* if (!isless (xa, TWO52)) goto label; */ + label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + + /* res = xa + TWO52 - TWO52; */ + tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); + tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT); + emit_move_insn (res, tmp); + + /* generate 1.0 */ + one = force_reg (mode, const_double_from_real_value (dconst1, mode)); + + /* Compensate: res = xa2 - (res > xa ? 1 : 0) */ + mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false); + emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one))); + tmp = expand_simple_binop (mode, MINUS, + res, mask, NULL_RTX, 0, OPTAB_DIRECT); + emit_move_insn (res, tmp); + + /* res = copysign (res, operand1) */ + ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operand0, res); +} + +/* Expand SSE sequence for computing round from OPERAND1 storing + into OPERAND0. 
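ix86_expand_rounddf_32 starts from the same 2^52 rounding and then repairs the halfway cases so that they go away from zero instead of to even: if the rounded value overshot by more than 0.5 subtract one, and if it undershot by half or more add one. A plain-C model, again assuming double-precision evaluation:

#include <math.h>

static double
round32_model (double x)
{
  const double two52 = 4503599627370496.0;  /* 2**52 */
  double xa = fabs (x);

  if (!(xa < two52))
    return x;

  double xa2 = (xa + two52) - two52;        /* nearest-even rounding */
  double dxa = xa2 - xa;

  if (dxa > 0.5)
    xa2 -= 1.0;                             /* rounded up too far */
  else if (dxa <= -0.5)
    xa2 += 1.0;                             /* halfway case: go away from zero */
  return copysign (xa2, x);
}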
*/ +void +ix86_expand_round (rtx operand0, rtx operand1) +{ + /* C code for the stuff we're doing below: + double xa = fabs (x); + if (!isless (xa, TWO52)) + return x; + xa = (double)(long)(xa + nextafter (0.5, 0.0)); + return copysign (xa, x); + */ + machine_mode mode = GET_MODE (operand0); + rtx res, TWO52, xa, xi, half, mask; + rtx_code_label *label; + const struct real_format *fmt; + REAL_VALUE_TYPE pred_half, half_minus_pred_half; + + /* Temporary for holding the result, initialized to the input + operand to ease control flow. */ + res = gen_reg_rtx (mode); + emit_move_insn (res, operand1); + + TWO52 = ix86_gen_TWO52 (mode); + xa = ix86_expand_sse_fabs (res, &mask); + label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + + /* load nextafter (0.5, 0.0) */ + fmt = REAL_MODE_FORMAT (mode); + real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); + real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half); + + /* xa = xa + 0.5 */ + half = force_reg (mode, const_double_from_real_value (pred_half, mode)); + xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT); + + /* xa = (double)(int64_t)xa */ + xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); + expand_fix (xi, xa, 0); + expand_float (xa, xi, 0); + + /* res = copysign (xa, operand1) */ + ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operand0, res); +} + +/* Expand SSE sequence for computing round + from OP1 storing into OP0 using sse4 round insn. */ +void +ix86_expand_round_sse4 (rtx op0, rtx op1) +{ + machine_mode mode = GET_MODE (op0); + rtx e1, e2, res, half; + const struct real_format *fmt; + REAL_VALUE_TYPE pred_half, half_minus_pred_half; + rtx (*gen_copysign) (rtx, rtx, rtx); + rtx (*gen_round) (rtx, rtx, rtx); + + switch (mode) + { + case E_SFmode: + gen_copysign = gen_copysignsf3; + gen_round = gen_sse4_1_roundsf2; + break; + case E_DFmode: + gen_copysign = gen_copysigndf3; + gen_round = gen_sse4_1_rounddf2; + break; + default: + gcc_unreachable (); + } + + /* round (a) = trunc (a + copysign (0.5, a)) */ + + /* load nextafter (0.5, 0.0) */ + fmt = REAL_MODE_FORMAT (mode); + real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); + real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half); + half = const_double_from_real_value (pred_half, mode); + + /* e1 = copysign (0.5, op1) */ + e1 = gen_reg_rtx (mode); + emit_insn (gen_copysign (e1, half, op1)); + + /* e2 = op1 + e1 */ + e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT); + + /* res = trunc (e2) */ + res = gen_reg_rtx (mode); + emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC))); + + emit_move_insn (op0, res); +} + +/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel []))) + insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh + insn every time. */ + +static GTY(()) rtx_insn *vselect_insn; + +/* Initialize vselect_insn. 
*/ + +static void +init_vselect_insn (void) +{ + unsigned i; + rtx x; + + x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN)); + for (i = 0; i < MAX_VECT_LEN; ++i) + XVECEXP (x, 0, i) = const0_rtx; + x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx, + const0_rtx), x); + x = gen_rtx_SET (const0_rtx, x); + start_sequence (); + vselect_insn = emit_insn (x); + end_sequence (); +} + +/* Construct (set target (vec_select op0 (parallel perm))) and + return true if that's a valid instruction in the active ISA. */ + +static bool +expand_vselect (rtx target, rtx op0, const unsigned char *perm, + unsigned nelt, bool testing_p) +{ + unsigned int i; + rtx x, save_vconcat; + int icode; + + if (vselect_insn == NULL_RTX) + init_vselect_insn (); + + x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1); + PUT_NUM_ELEM (XVEC (x, 0), nelt); + for (i = 0; i < nelt; ++i) + XVECEXP (x, 0, i) = GEN_INT (perm[i]); + save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0); + XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0; + PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target)); + SET_DEST (PATTERN (vselect_insn)) = target; + icode = recog_memoized (vselect_insn); + + if (icode >= 0 && !testing_p) + emit_insn (copy_rtx (PATTERN (vselect_insn))); + + SET_DEST (PATTERN (vselect_insn)) = const0_rtx; + XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat; + INSN_CODE (vselect_insn) = -1; + + return icode >= 0; +} + +/* Similar, but generate a vec_concat from op0 and op1 as well. */ + +static bool +expand_vselect_vconcat (rtx target, rtx op0, rtx op1, + const unsigned char *perm, unsigned nelt, + bool testing_p) +{ + machine_mode v2mode; + rtx x; + bool ok; + + if (vselect_insn == NULL_RTX) + init_vselect_insn (); + + if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode)) + return false; + x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0); + PUT_MODE (x, v2mode); + XEXP (x, 0) = op0; + XEXP (x, 1) = op1; + ok = expand_vselect (target, x, perm, nelt, testing_p); + XEXP (x, 0) = const0_rtx; + XEXP (x, 1) = const0_rtx; + return ok; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D + using movss or movsd. */ +static bool +expand_vec_perm_movs (struct expand_vec_perm_d *d) +{ + machine_mode vmode = d->vmode; + unsigned i, nelt = d->nelt; + rtx x; + + if (d->one_operand_p) + return false; + + if (!(TARGET_SSE && vmode == V4SFmode) + && !(TARGET_SSE2 && vmode == V2DFmode)) + return false; + + /* Only the first element is changed. */ + if (d->perm[0] != nelt && d->perm[0] != 0) + return false; + for (i = 1; i < nelt; ++i) + if (d->perm[i] != i + nelt - d->perm[0]) + return false; + + if (d->testing_p) + return true; + + if (d->perm[0] == nelt) + x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1)); + else + x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1)); + + emit_insn (gen_rtx_SET (d->target, x)); + + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D + in terms of blendp[sd] / pblendw / pblendvb / vpblendd. 
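+   As an illustrative sketch with a hypothetical permutation: for V8HImode the
+   lane-preserving permutation { 0, 9, 2, 11, 4, 13, 6, 15 } takes the odd
+   elements from the second operand; bit i of the blend immediate is set
+   exactly when perm[i] >= nelt, so this expands to pblendw with mask 0xaa.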
*/ + +static bool +expand_vec_perm_blend (struct expand_vec_perm_d *d) +{ + machine_mode mmode, vmode = d->vmode; + unsigned i, mask, nelt = d->nelt; + rtx target, op0, op1, maskop, x; + rtx rperm[32], vperm; + + if (d->one_operand_p) + return false; + if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 + && (TARGET_AVX512BW + || GET_MODE_UNIT_SIZE (vmode) >= 4)) + ; + else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) + ; + else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) + ; + else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16) + ; + else + return false; + + /* This is a blend, not a permute. Elements must stay in their + respective lanes. */ + for (i = 0; i < nelt; ++i) + { + unsigned e = d->perm[i]; + if (!(e == i || e == i + nelt)) + return false; + } + + if (d->testing_p) + return true; + + /* ??? Without SSE4.1, we could implement this with and/andn/or. This + decision should be extracted elsewhere, so that we only try that + sequence once all budget==3 options have been tried. */ + target = d->target; + op0 = d->op0; + op1 = d->op1; + mask = 0; + + switch (vmode) + { + case E_V8DFmode: + case E_V16SFmode: + case E_V4DFmode: + case E_V8SFmode: + case E_V2DFmode: + case E_V4SFmode: + case E_V8HImode: + case E_V8SImode: + case E_V32HImode: + case E_V64QImode: + case E_V16SImode: + case E_V8DImode: + for (i = 0; i < nelt; ++i) + mask |= (d->perm[i] >= nelt) << i; + break; + + case E_V2DImode: + for (i = 0; i < 2; ++i) + mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4); + vmode = V8HImode; + goto do_subreg; + + case E_V4SImode: + for (i = 0; i < 4; ++i) + mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2); + vmode = V8HImode; + goto do_subreg; + + case E_V16QImode: + /* See if bytes move in pairs so we can use pblendw with + an immediate argument, rather than pblendvb with a vector + argument. */ + for (i = 0; i < 16; i += 2) + if (d->perm[i] + 1 != d->perm[i + 1]) + { + use_pblendvb: + for (i = 0; i < nelt; ++i) + rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx); + + finish_pblendvb: + vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm)); + vperm = force_reg (vmode, vperm); + + if (GET_MODE_SIZE (vmode) == 16) + emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm)); + else + emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm)); + if (target != d->target) + emit_move_insn (d->target, gen_lowpart (d->vmode, target)); + return true; + } + + for (i = 0; i < 8; ++i) + mask |= (d->perm[i * 2] >= 16) << i; + vmode = V8HImode; + /* FALLTHRU */ + + do_subreg: + target = gen_reg_rtx (vmode); + op0 = gen_lowpart (vmode, op0); + op1 = gen_lowpart (vmode, op1); + break; + + case E_V32QImode: + /* See if bytes move in pairs. If not, vpblendvb must be used. */ + for (i = 0; i < 32; i += 2) + if (d->perm[i] + 1 != d->perm[i + 1]) + goto use_pblendvb; + /* See if bytes move in quadruplets. If yes, vpblendd + with immediate can be used. */ + for (i = 0; i < 32; i += 4) + if (d->perm[i] + 2 != d->perm[i + 2]) + break; + if (i < 32) + { + /* See if bytes move the same in both lanes. If yes, + vpblendw with immediate can be used. */ + for (i = 0; i < 16; i += 2) + if (d->perm[i] + 16 != d->perm[i + 16]) + goto use_pblendvb; + + /* Use vpblendw. */ + for (i = 0; i < 16; ++i) + mask |= (d->perm[i * 2] >= 32) << i; + vmode = V16HImode; + goto do_subreg; + } + + /* Use vpblendd. */ + for (i = 0; i < 8; ++i) + mask |= (d->perm[i * 4] >= 32) << i; + vmode = V8SImode; + goto do_subreg; + + case E_V16HImode: + /* See if words move in pairs. If yes, vpblendd can be used. 
*/ + for (i = 0; i < 16; i += 2) + if (d->perm[i] + 1 != d->perm[i + 1]) + break; + if (i < 16) + { + /* See if words move the same in both lanes. If not, + vpblendvb must be used. */ + for (i = 0; i < 8; i++) + if (d->perm[i] + 8 != d->perm[i + 8]) + { + /* Use vpblendvb. */ + for (i = 0; i < 32; ++i) + rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx); + + vmode = V32QImode; + nelt = 32; + target = gen_reg_rtx (vmode); + op0 = gen_lowpart (vmode, op0); + op1 = gen_lowpart (vmode, op1); + goto finish_pblendvb; + } + + /* Use vpblendw. */ + for (i = 0; i < 16; ++i) + mask |= (d->perm[i] >= 16) << i; + break; + } + + /* Use vpblendd. */ + for (i = 0; i < 8; ++i) + mask |= (d->perm[i * 2] >= 16) << i; + vmode = V8SImode; + goto do_subreg; + + case E_V4DImode: + /* Use vpblendd. */ + for (i = 0; i < 4; ++i) + mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2); + vmode = V8SImode; + goto do_subreg; + + default: + gcc_unreachable (); + } + + switch (vmode) + { + case E_V8DFmode: + case E_V8DImode: + mmode = QImode; + break; + case E_V16SFmode: + case E_V16SImode: + mmode = HImode; + break; + case E_V32HImode: + mmode = SImode; + break; + case E_V64QImode: + mmode = DImode; + break; + default: + mmode = VOIDmode; + } + + if (mmode != VOIDmode) + maskop = force_reg (mmode, gen_int_mode (mask, mmode)); + else + maskop = GEN_INT (mask); + + /* This matches five different patterns with the different modes. */ + x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop); + x = gen_rtx_SET (target, x); + emit_insn (x); + if (target != d->target) + emit_move_insn (d->target, gen_lowpart (d->vmode, target)); + + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D + in terms of the variable form of vpermilps. + + Note that we will have already failed the immediate input vpermilps, + which requires that the high and low part shuffle be identical; the + variable form doesn't require that. */ + +static bool +expand_vec_perm_vpermil (struct expand_vec_perm_d *d) +{ + rtx rperm[8], vperm; + unsigned i; + + if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p) + return false; + + /* We can only permute within the 128-bit lane. */ + for (i = 0; i < 8; ++i) + { + unsigned e = d->perm[i]; + if (i < 4 ? e >= 4 : e < 4) + return false; + } + + if (d->testing_p) + return true; + + for (i = 0; i < 8; ++i) + { + unsigned e = d->perm[i]; + + /* Within each 128-bit lane, the elements of op0 are numbered + from 0 and the elements of op1 are numbered from 4. */ + if (e >= 8 + 4) + e -= 8; + else if (e >= 4) + e -= 4; + + rperm[i] = GEN_INT (e); + } + + vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm)); + vperm = force_reg (V8SImode, vperm); + emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm)); + + return true; +} + +/* Return true if permutation D can be performed as VMODE permutation + instead. */ + +static bool +valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d) +{ + unsigned int i, j, chunk; + + if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT + || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT + || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode)) + return false; + + if (GET_MODE_NUNITS (vmode) >= d->nelt) + return true; + + chunk = d->nelt / GET_MODE_NUNITS (vmode); + for (i = 0; i < d->nelt; i += chunk) + if (d->perm[i] & (chunk - 1)) + return false; + else + for (j = 1; j < chunk; ++j) + if (d->perm[i] + j != d->perm[i + j]) + return false; + + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. 
Try to implement D + in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */ + +static bool +expand_vec_perm_pshufb (struct expand_vec_perm_d *d) +{ + unsigned i, nelt, eltsz, mask; + unsigned char perm[64]; + machine_mode vmode = V16QImode; + rtx rperm[64], vperm, target, op0, op1; + + nelt = d->nelt; + + if (!d->one_operand_p) + { + if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16) + { + if (TARGET_AVX2 + && valid_perm_using_mode_p (V2TImode, d)) + { + if (d->testing_p) + return true; + + /* Use vperm2i128 insn. The pattern uses + V4DImode instead of V2TImode. */ + target = d->target; + if (d->vmode != V4DImode) + target = gen_reg_rtx (V4DImode); + op0 = gen_lowpart (V4DImode, d->op0); + op1 = gen_lowpart (V4DImode, d->op1); + rperm[0] + = GEN_INT ((d->perm[0] / (nelt / 2)) + | ((d->perm[nelt / 2] / (nelt / 2)) * 16)); + emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0])); + if (target != d->target) + emit_move_insn (d->target, gen_lowpart (d->vmode, target)); + return true; + } + return false; + } + } + else + { + if (GET_MODE_SIZE (d->vmode) == 16) + { + if (!TARGET_SSSE3) + return false; + } + else if (GET_MODE_SIZE (d->vmode) == 32) + { + if (!TARGET_AVX2) + return false; + + /* V4DImode should be already handled through + expand_vselect by vpermq instruction. */ + gcc_assert (d->vmode != V4DImode); + + vmode = V32QImode; + if (d->vmode == V8SImode + || d->vmode == V16HImode + || d->vmode == V32QImode) + { + /* First see if vpermq can be used for + V8SImode/V16HImode/V32QImode. */ + if (valid_perm_using_mode_p (V4DImode, d)) + { + for (i = 0; i < 4; i++) + perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3; + if (d->testing_p) + return true; + target = gen_reg_rtx (V4DImode); + if (expand_vselect (target, gen_lowpart (V4DImode, d->op0), + perm, 4, false)) + { + emit_move_insn (d->target, + gen_lowpart (d->vmode, target)); + return true; + } + return false; + } + + /* Next see if vpermd can be used. */ + if (valid_perm_using_mode_p (V8SImode, d)) + vmode = V8SImode; + } + /* Or if vpermps can be used. */ + else if (d->vmode == V8SFmode) + vmode = V8SImode; + + if (vmode == V32QImode) + { + /* vpshufb only works intra lanes, it is not + possible to shuffle bytes in between the lanes. */ + for (i = 0; i < nelt; ++i) + if ((d->perm[i] ^ i) & (nelt / 2)) + return false; + } + } + else if (GET_MODE_SIZE (d->vmode) == 64) + { + if (!TARGET_AVX512BW) + return false; + + /* If vpermq didn't work, vpshufb won't work either. */ + if (d->vmode == V8DFmode || d->vmode == V8DImode) + return false; + + vmode = V64QImode; + if (d->vmode == V16SImode + || d->vmode == V32HImode + || d->vmode == V64QImode) + { + /* First see if vpermq can be used for + V16SImode/V32HImode/V64QImode. */ + if (valid_perm_using_mode_p (V8DImode, d)) + { + for (i = 0; i < 8; i++) + perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7; + if (d->testing_p) + return true; + target = gen_reg_rtx (V8DImode); + if (expand_vselect (target, gen_lowpart (V8DImode, d->op0), + perm, 8, false)) + { + emit_move_insn (d->target, + gen_lowpart (d->vmode, target)); + return true; + } + return false; + } + + /* Next see if vpermd can be used. */ + if (valid_perm_using_mode_p (V16SImode, d)) + vmode = V16SImode; + } + /* Or if vpermps can be used. */ + else if (d->vmode == V16SFmode) + vmode = V16SImode; + if (vmode == V64QImode) + { + /* vpshufb only works intra lanes, it is not + possible to shuffle bytes in between the lanes. 
*/ + for (i = 0; i < nelt; ++i) + if ((d->perm[i] ^ i) & (nelt / 4)) + return false; + } + } + else + return false; + } + + if (d->testing_p) + return true; + + if (vmode == V8SImode) + for (i = 0; i < 8; ++i) + rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7); + else if (vmode == V16SImode) + for (i = 0; i < 16; ++i) + rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15); + else + { + eltsz = GET_MODE_UNIT_SIZE (d->vmode); + if (!d->one_operand_p) + mask = 2 * nelt - 1; + else if (vmode == V16QImode) + mask = nelt - 1; + else if (vmode == V64QImode) + mask = nelt / 4 - 1; + else + mask = nelt / 2 - 1; + + for (i = 0; i < nelt; ++i) + { + unsigned j, e = d->perm[i] & mask; + for (j = 0; j < eltsz; ++j) + rperm[i * eltsz + j] = GEN_INT (e * eltsz + j); + } + } + + vperm = gen_rtx_CONST_VECTOR (vmode, + gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm)); + vperm = force_reg (vmode, vperm); + + target = d->target; + if (d->vmode != vmode) + target = gen_reg_rtx (vmode); + op0 = gen_lowpart (vmode, d->op0); + if (d->one_operand_p) + { + if (vmode == V16QImode) + emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm)); + else if (vmode == V32QImode) + emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm)); + else if (vmode == V64QImode) + emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm)); + else if (vmode == V8SFmode) + emit_insn (gen_avx2_permvarv8sf (target, op0, vperm)); + else if (vmode == V8SImode) + emit_insn (gen_avx2_permvarv8si (target, op0, vperm)); + else if (vmode == V16SFmode) + emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm)); + else if (vmode == V16SImode) + emit_insn (gen_avx512f_permvarv16si (target, op0, vperm)); + else + gcc_unreachable (); + } + else + { + op1 = gen_lowpart (vmode, d->op1); + emit_insn (gen_xop_pperm (target, op0, op1, vperm)); + } + if (target != d->target) + emit_move_insn (d->target, gen_lowpart (d->vmode, target)); + + return true; +} + +/* For V*[QHS]Imode permutations, check if the same permutation + can't be performed in a 2x, 4x or 8x wider inner mode. */ + +static bool +canonicalize_vector_int_perm (const struct expand_vec_perm_d *d, + struct expand_vec_perm_d *nd) +{ + int i; + machine_mode mode = VOIDmode; + + switch (d->vmode) + { + case E_V16QImode: mode = V8HImode; break; + case E_V32QImode: mode = V16HImode; break; + case E_V64QImode: mode = V32HImode; break; + case E_V8HImode: mode = V4SImode; break; + case E_V16HImode: mode = V8SImode; break; + case E_V32HImode: mode = V16SImode; break; + case E_V4SImode: mode = V2DImode; break; + case E_V8SImode: mode = V4DImode; break; + case E_V16SImode: mode = V8DImode; break; + default: return false; + } + for (i = 0; i < d->nelt; i += 2) + if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1) + return false; + nd->vmode = mode; + nd->nelt = d->nelt / 2; + for (i = 0; i < nd->nelt; i++) + nd->perm[i] = d->perm[2 * i] / 2; + if (GET_MODE_INNER (mode) != DImode) + canonicalize_vector_int_perm (nd, nd); + if (nd != d) + { + nd->one_operand_p = d->one_operand_p; + nd->testing_p = d->testing_p; + if (d->op0 == d->op1) + nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0); + else + { + nd->op0 = gen_lowpart (nd->vmode, d->op0); + nd->op1 = gen_lowpart (nd->vmode, d->op1); + } + if (d->testing_p) + nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1); + else + nd->target = gen_reg_rtx (nd->vmode); + } + return true; +} + +/* Try to expand one-operand permutation with constant mask. 
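+   For illustration (hypothetical input): reversing a V8DFmode vector with
+   perm = { 7, 6, 5, 4, 3, 2, 1, 0 } materializes those indices as a V8DImode
+   constant vector, forces it into a register and feeds it to the
+   avx512f_permvarv8df pattern, i.e. a single variable-index vpermpd.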
*/ + +static bool +ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d) +{ + machine_mode mode = GET_MODE (d->op0); + machine_mode maskmode = mode; + rtx (*gen) (rtx, rtx, rtx) = NULL; + rtx target, op0, mask; + rtx vec[64]; + + if (!rtx_equal_p (d->op0, d->op1)) + return false; + + if (!TARGET_AVX512F) + return false; + + switch (mode) + { + case E_V16SImode: + gen = gen_avx512f_permvarv16si; + break; + case E_V16SFmode: + gen = gen_avx512f_permvarv16sf; + maskmode = V16SImode; + break; + case E_V8DImode: + gen = gen_avx512f_permvarv8di; + break; + case E_V8DFmode: + gen = gen_avx512f_permvarv8df; + maskmode = V8DImode; + break; + default: + return false; + } + + target = d->target; + op0 = d->op0; + for (int i = 0; i < d->nelt; ++i) + vec[i] = GEN_INT (d->perm[i]); + mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); + emit_insn (gen (target, op0, force_reg (maskmode, mask))); + return true; +} + +static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool); + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D + in a single instruction. */ + +static bool +expand_vec_perm_1 (struct expand_vec_perm_d *d) +{ + unsigned i, nelt = d->nelt; + struct expand_vec_perm_d nd; + + /* Check plain VEC_SELECT first, because AVX has instructions that could + match both SEL and SEL+CONCAT, but the plain SEL will allow a memory + input where SEL+CONCAT may not. */ + if (d->one_operand_p) + { + int mask = nelt - 1; + bool identity_perm = true; + bool broadcast_perm = true; + + for (i = 0; i < nelt; i++) + { + nd.perm[i] = d->perm[i] & mask; + if (nd.perm[i] != i) + identity_perm = false; + if (nd.perm[i]) + broadcast_perm = false; + } + + if (identity_perm) + { + if (!d->testing_p) + emit_move_insn (d->target, d->op0); + return true; + } + else if (broadcast_perm && TARGET_AVX2) + { + /* Use vpbroadcast{b,w,d}. */ + rtx (*gen) (rtx, rtx) = NULL; + switch (d->vmode) + { + case E_V64QImode: + if (TARGET_AVX512BW) + gen = gen_avx512bw_vec_dupv64qi_1; + break; + case E_V32QImode: + gen = gen_avx2_pbroadcastv32qi_1; + break; + case E_V32HImode: + if (TARGET_AVX512BW) + gen = gen_avx512bw_vec_dupv32hi_1; + break; + case E_V16HImode: + gen = gen_avx2_pbroadcastv16hi_1; + break; + case E_V16SImode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv16si_1; + break; + case E_V8SImode: + gen = gen_avx2_pbroadcastv8si_1; + break; + case E_V16QImode: + gen = gen_avx2_pbroadcastv16qi; + break; + case E_V8HImode: + gen = gen_avx2_pbroadcastv8hi; + break; + case E_V16SFmode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv16sf_1; + break; + case E_V8SFmode: + gen = gen_avx2_vec_dupv8sf_1; + break; + case E_V8DFmode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv8df_1; + break; + case E_V8DImode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv8di_1; + break; + /* For other modes prefer other shuffles this function creates. */ + default: break; + } + if (gen != NULL) + { + if (!d->testing_p) + emit_insn (gen (d->target, d->op0)); + return true; + } + } + + if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p)) + return true; + + /* There are plenty of patterns in sse.md that are written for + SEL+CONCAT and are not replicated for a single op. Perhaps + that should be changed, to avoid the nastiness here. */ + + /* Recognize interleave style patterns, which means incrementing + every other permutation operand. 
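+     For example, with a hypothetical V4SImode one-operand permutation
+     { 0, 0, 1, 1 } the loop below produces { 0, 4, 1, 5 } on a concatenation
+     of the operand with itself, which expand_vselect_vconcat can match as a
+     plain punpckldq.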
*/ + for (i = 0; i < nelt; i += 2) + { + nd.perm[i] = d->perm[i] & mask; + nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt; + } + if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt, + d->testing_p)) + return true; + + /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */ + if (nelt >= 4) + { + for (i = 0; i < nelt; i += 4) + { + nd.perm[i + 0] = d->perm[i + 0] & mask; + nd.perm[i + 1] = d->perm[i + 1] & mask; + nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt; + nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt; + } + + if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt, + d->testing_p)) + return true; + } + } + + /* Try movss/movsd instructions. */ + if (expand_vec_perm_movs (d)) + return true; + + /* Finally, try the fully general two operand permute. */ + if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt, + d->testing_p)) + return true; + + /* Recognize interleave style patterns with reversed operands. */ + if (!d->one_operand_p) + { + for (i = 0; i < nelt; ++i) + { + unsigned e = d->perm[i]; + if (e >= nelt) + e -= nelt; + else + e += nelt; + nd.perm[i] = e; + } + + if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt, + d->testing_p)) + return true; + } + + /* Try the SSE4.1 blend variable merge instructions. */ + if (expand_vec_perm_blend (d)) + return true; + + /* Try one of the AVX vpermil variable permutations. */ + if (expand_vec_perm_vpermil (d)) + return true; + + /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128, + vpshufb, vpermd, vpermps or vpermq variable permutation. */ + if (expand_vec_perm_pshufb (d)) + return true; + + /* Try the AVX2 vpalignr instruction. */ + if (expand_vec_perm_palignr (d, true)) + return true; + + /* Try the AVX512F vperm{s,d} instructions. */ + if (ix86_expand_vec_one_operand_perm_avx512 (d)) + return true; + + /* Try the AVX512F vpermt2/vpermi2 instructions. */ + if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d)) + return true; + + /* See if we can get the same permutation in different vector integer + mode. */ + if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) + { + if (!d->testing_p) + emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target)); + return true; + } + return false; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D + in terms of a pair of pshuflw + pshufhw instructions. */ + +static bool +expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d) +{ + unsigned char perm2[MAX_VECT_LEN]; + unsigned i; + bool ok; + + if (d->vmode != V8HImode || !d->one_operand_p) + return false; + + /* The two permutations only operate in 64-bit lanes. */ + for (i = 0; i < 4; ++i) + if (d->perm[i] >= 4) + return false; + for (i = 4; i < 8; ++i) + if (d->perm[i] < 4) + return false; + + if (d->testing_p) + return true; + + /* Emit the pshuflw. */ + memcpy (perm2, d->perm, 4); + for (i = 4; i < 8; ++i) + perm2[i] = i; + ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p); + gcc_assert (ok); + + /* Emit the pshufhw. */ + memcpy (perm2 + 4, d->perm + 4, 4); + for (i = 0; i < 4; ++i) + perm2[i] = i; + ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p); + gcc_assert (ok); + + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify + the permutation using the SSSE3 palignr instruction. 
This succeeds + when all of the elements in PERM fit within one vector and we merely + need to shift them down so that a single vector permutation has a + chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only + the vpalignr instruction itself can perform the requested permutation. */ + +static bool +expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p) +{ + unsigned i, nelt = d->nelt; + unsigned min, max, minswap, maxswap; + bool in_order, ok, swap = false; + rtx shift, target; + struct expand_vec_perm_d dcopy; + + /* Even with AVX, palignr only operates on 128-bit vectors, + in AVX2 palignr operates on both 128-bit lanes. */ + if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) + && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32)) + return false; + + min = 2 * nelt; + max = 0; + minswap = 2 * nelt; + maxswap = 0; + for (i = 0; i < nelt; ++i) + { + unsigned e = d->perm[i]; + unsigned eswap = d->perm[i] ^ nelt; + if (GET_MODE_SIZE (d->vmode) == 32) + { + e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1); + eswap = e ^ (nelt / 2); + } + if (e < min) + min = e; + if (e > max) + max = e; + if (eswap < minswap) + minswap = eswap; + if (eswap > maxswap) + maxswap = eswap; + } + if (min == 0 + || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt)) + { + if (d->one_operand_p + || minswap == 0 + || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32 + ? nelt / 2 : nelt)) + return false; + swap = true; + min = minswap; + max = maxswap; + } + + /* Given that we have SSSE3, we know we'll be able to implement the + single operand permutation after the palignr with pshufb for + 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed + first. */ + if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p) + return true; + + dcopy = *d; + if (swap) + { + dcopy.op0 = d->op1; + dcopy.op1 = d->op0; + for (i = 0; i < nelt; ++i) + dcopy.perm[i] ^= nelt; + } + + in_order = true; + for (i = 0; i < nelt; ++i) + { + unsigned e = dcopy.perm[i]; + if (GET_MODE_SIZE (d->vmode) == 32 + && e >= nelt + && (e & (nelt / 2 - 1)) < min) + e = e - min - (nelt / 2); + else + e = e - min; + if (e != i) + in_order = false; + dcopy.perm[i] = e; + } + dcopy.one_operand_p = true; + + if (single_insn_only_p && !in_order) + return false; + + /* For AVX2, test whether we can permute the result in one instruction. */ + if (d->testing_p) + { + if (in_order) + return true; + dcopy.op1 = dcopy.op0; + return expand_vec_perm_1 (&dcopy); + } + + shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode)); + if (GET_MODE_SIZE (d->vmode) == 16) + { + target = gen_reg_rtx (TImode); + emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1), + gen_lowpart (TImode, dcopy.op0), shift)); + } + else + { + target = gen_reg_rtx (V2TImode); + emit_insn (gen_avx2_palignrv2ti (target, + gen_lowpart (V2TImode, dcopy.op1), + gen_lowpart (V2TImode, dcopy.op0), + shift)); + } + + dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target); + + /* Test for the degenerate case where the alignment by itself + produces the desired permutation. */ + if (in_order) + { + emit_move_insn (d->target, dcopy.op0); + return true; + } + + ok = expand_vec_perm_1 (&dcopy); + gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32); + + return ok; +} + +/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify + the permutation using the SSE4_1 pblendv instruction. Potentially + reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. 
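+   A worked sketch with a hypothetical V4SImode permutation: { 0, 1, 7, 3 }
+   needs only element 2 from the second operand, so op1 is first shuffled on
+   its own to bring its element 3 into position 2, and a blend selecting just
+   position 2 from that intermediate then yields the final result.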
*/ + +static bool +expand_vec_perm_pblendv (struct expand_vec_perm_d *d) +{ + unsigned i, which, nelt = d->nelt; + struct expand_vec_perm_d dcopy, dcopy1; + machine_mode vmode = d->vmode; + bool ok; + + /* Use the same checks as in expand_vec_perm_blend. */ + if (d->one_operand_p) + return false; + if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) + ; + else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) + ; + else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16) + ; + else + return false; + + /* Figure out where permutation elements stay not in their + respective lanes. */ + for (i = 0, which = 0; i < nelt; ++i) + { + unsigned e = d->perm[i]; + if (e != i) + which |= (e < nelt ? 1 : 2); + } + /* We can pblend the part where elements stay not in their + respective lanes only when these elements are all in one + half of a permutation. + {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective + lanes, but both 8 and 9 >= 8 + {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their + respective lanes and 8 >= 8, but 2 not. */ + if (which != 1 && which != 2) + return false; + if (d->testing_p && GET_MODE_SIZE (vmode) == 16) + return true; + + /* First we apply one operand permutation to the part where + elements stay not in their respective lanes. */ + dcopy = *d; + if (which == 2) + dcopy.op0 = dcopy.op1 = d->op1; + else + dcopy.op0 = dcopy.op1 = d->op0; + if (!d->testing_p) + dcopy.target = gen_reg_rtx (vmode); + dcopy.one_operand_p = true; + + for (i = 0; i < nelt; ++i) + dcopy.perm[i] = d->perm[i] & (nelt - 1); + + ok = expand_vec_perm_1 (&dcopy); + if (GET_MODE_SIZE (vmode) != 16 && !ok) + return false; + else + gcc_assert (ok); + if (d->testing_p) + return true; + + /* Next we put permuted elements into their positions. */ + dcopy1 = *d; + if (which == 2) + dcopy1.op1 = dcopy.target; + else + dcopy1.op0 = dcopy.target; + + for (i = 0; i < nelt; ++i) + dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i); + + ok = expand_vec_perm_blend (&dcopy1); + gcc_assert (ok); + + return true; +} + +static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d); + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify + a two vector permutation into a single vector permutation by using + an interleave operation to merge the vectors. */ + +static bool +expand_vec_perm_interleave2 (struct expand_vec_perm_d *d) +{ + struct expand_vec_perm_d dremap, dfinal; + unsigned i, nelt = d->nelt, nelt2 = nelt / 2; + unsigned HOST_WIDE_INT contents; + unsigned char remap[2 * MAX_VECT_LEN]; + rtx_insn *seq; + bool ok, same_halves = false; + + if (GET_MODE_SIZE (d->vmode) == 16) + { + if (d->one_operand_p) + return false; + } + else if (GET_MODE_SIZE (d->vmode) == 32) + { + if (!TARGET_AVX) + return false; + /* For 32-byte modes allow even d->one_operand_p. + The lack of cross-lane shuffling in some instructions + might prevent a single insn shuffle. */ + dfinal = *d; + dfinal.testing_p = true; + /* If expand_vec_perm_interleave3 can expand this into + a 3 insn sequence, give up and let it be expanded as + 3 insn sequence. While that is one insn longer, + it doesn't need a memory operand and in the common + case that both interleave low and high permutations + with the same operands are adjacent needs 4 insns + for both after CSE. */ + if (expand_vec_perm_interleave3 (&dfinal)) + return false; + } + else + return false; + + /* Examine from whence the elements come. 
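+     E.g. for a hypothetical V8HImode permutation { 0, 8, 1, 9, 2, 10, 3, 11 }
+     the contents mask gets only bits 0-3 and 8-11 set, i.e. every element
+     comes from the two low halves, so the punpckl* case below applies.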
*/ + contents = 0; + for (i = 0; i < nelt; ++i) + contents |= HOST_WIDE_INT_1U << d->perm[i]; + + memset (remap, 0xff, sizeof (remap)); + dremap = *d; + + if (GET_MODE_SIZE (d->vmode) == 16) + { + unsigned HOST_WIDE_INT h1, h2, h3, h4; + + /* Split the two input vectors into 4 halves. */ + h1 = (HOST_WIDE_INT_1U << nelt2) - 1; + h2 = h1 << nelt2; + h3 = h2 << nelt2; + h4 = h3 << nelt2; + + /* If the elements from the low halves use interleave low, and similarly + for interleave high. If the elements are from mis-matched halves, we + can use shufps for V4SF/V4SI or do a DImode shuffle. */ + if ((contents & (h1 | h3)) == contents) + { + /* punpckl* */ + for (i = 0; i < nelt2; ++i) + { + remap[i] = i * 2; + remap[i + nelt] = i * 2 + 1; + dremap.perm[i * 2] = i; + dremap.perm[i * 2 + 1] = i + nelt; + } + if (!TARGET_SSE2 && d->vmode == V4SImode) + dremap.vmode = V4SFmode; + } + else if ((contents & (h2 | h4)) == contents) + { + /* punpckh* */ + for (i = 0; i < nelt2; ++i) + { + remap[i + nelt2] = i * 2; + remap[i + nelt + nelt2] = i * 2 + 1; + dremap.perm[i * 2] = i + nelt2; + dremap.perm[i * 2 + 1] = i + nelt + nelt2; + } + if (!TARGET_SSE2 && d->vmode == V4SImode) + dremap.vmode = V4SFmode; + } + else if ((contents & (h1 | h4)) == contents) + { + /* shufps */ + for (i = 0; i < nelt2; ++i) + { + remap[i] = i; + remap[i + nelt + nelt2] = i + nelt2; + dremap.perm[i] = i; + dremap.perm[i + nelt2] = i + nelt + nelt2; + } + if (nelt != 4) + { + /* shufpd */ + dremap.vmode = V2DImode; + dremap.nelt = 2; + dremap.perm[0] = 0; + dremap.perm[1] = 3; + } + } + else if ((contents & (h2 | h3)) == contents) + { + /* shufps */ + for (i = 0; i < nelt2; ++i) + { + remap[i + nelt2] = i; + remap[i + nelt] = i + nelt2; + dremap.perm[i] = i + nelt2; + dremap.perm[i + nelt2] = i + nelt; + } + if (nelt != 4) + { + /* shufpd */ + dremap.vmode = V2DImode; + dremap.nelt = 2; + dremap.perm[0] = 1; + dremap.perm[1] = 2; + } + } + else + return false; + } + else + { + unsigned int nelt4 = nelt / 4, nzcnt = 0; + unsigned HOST_WIDE_INT q[8]; + unsigned int nonzero_halves[4]; + + /* Split the two input vectors into 8 quarters. */ + q[0] = (HOST_WIDE_INT_1U << nelt4) - 1; + for (i = 1; i < 8; ++i) + q[i] = q[0] << (nelt4 * i); + for (i = 0; i < 4; ++i) + if (((q[2 * i] | q[2 * i + 1]) & contents) != 0) + { + nonzero_halves[nzcnt] = i; + ++nzcnt; + } + + if (nzcnt == 1) + { + gcc_assert (d->one_operand_p); + nonzero_halves[1] = nonzero_halves[0]; + same_halves = true; + } + else if (d->one_operand_p) + { + gcc_assert (nonzero_halves[0] == 0); + gcc_assert (nonzero_halves[1] == 1); + } + + if (nzcnt <= 2) + { + if (d->perm[0] / nelt2 == nonzero_halves[1]) + { + /* Attempt to increase the likelihood that dfinal + shuffle will be intra-lane. */ + std::swap (nonzero_halves[0], nonzero_halves[1]); + } + + /* vperm2f128 or vperm2i128. 
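+	     E.g. when all elements come from half 0 of the first operand and
+	     half 1 of the second (a hypothetical case), a single vperm2f128
+	     gathers those two halves into one register, and the remaining
+	     dfinal shuffle is purely intra-lane.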
*/ + for (i = 0; i < nelt2; ++i) + { + remap[i + nonzero_halves[1] * nelt2] = i + nelt2; + remap[i + nonzero_halves[0] * nelt2] = i; + dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2; + dremap.perm[i] = i + nonzero_halves[0] * nelt2; + } + + if (d->vmode != V8SFmode + && d->vmode != V4DFmode + && d->vmode != V8SImode) + { + dremap.vmode = V8SImode; + dremap.nelt = 8; + for (i = 0; i < 4; ++i) + { + dremap.perm[i] = i + nonzero_halves[0] * 4; + dremap.perm[i + 4] = i + nonzero_halves[1] * 4; + } + } + } + else if (d->one_operand_p) + return false; + else if (TARGET_AVX2 + && (contents & (q[0] | q[2] | q[4] | q[6])) == contents) + { + /* vpunpckl* */ + for (i = 0; i < nelt4; ++i) + { + remap[i] = i * 2; + remap[i + nelt] = i * 2 + 1; + remap[i + nelt2] = i * 2 + nelt2; + remap[i + nelt + nelt2] = i * 2 + nelt2 + 1; + dremap.perm[i * 2] = i; + dremap.perm[i * 2 + 1] = i + nelt; + dremap.perm[i * 2 + nelt2] = i + nelt2; + dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2; + } + } + else if (TARGET_AVX2 + && (contents & (q[1] | q[3] | q[5] | q[7])) == contents) + { + /* vpunpckh* */ + for (i = 0; i < nelt4; ++i) + { + remap[i + nelt4] = i * 2; + remap[i + nelt + nelt4] = i * 2 + 1; + remap[i + nelt2 + nelt4] = i * 2 + nelt2; + remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1; + dremap.perm[i * 2] = i + nelt4; + dremap.perm[i * 2 + 1] = i + nelt + nelt4; + dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4; + dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4; + } + } + else + return false; + } + + /* Use the remapping array set up above to move the elements from their + swizzled locations into their final destinations. */ + dfinal = *d; + for (i = 0; i < nelt; ++i) + { + unsigned e = remap[d->perm[i]]; + gcc_assert (e < nelt); + /* If same_halves is true, both halves of the remapped vector are the + same. Avoid cross-lane accesses if possible. */ + if (same_halves && i >= nelt2) + { + gcc_assert (e < nelt2); + dfinal.perm[i] = e + nelt2; + } + else + dfinal.perm[i] = e; + } + if (!d->testing_p) + { + dremap.target = gen_reg_rtx (dremap.vmode); + dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target); + } + dfinal.op1 = dfinal.op0; + dfinal.one_operand_p = true; + + /* Test if the final remap can be done with a single insn. For V4SFmode or + V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */ + start_sequence (); + ok = expand_vec_perm_1 (&dfinal); + seq = get_insns (); + end_sequence (); + + if (!ok) + return false; + + if (d->testing_p) + return true; + + if (dremap.vmode != dfinal.vmode) + { + dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0); + dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1); + } + + ok = expand_vec_perm_1 (&dremap); + gcc_assert (ok); + + emit_insn (seq); + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify + a single vector cross-lane permutation into vpermq followed + by any of the single insn permutations. 
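+   Sketch of the idea: each half of the result may draw from at most two of
+   the four 64-bit quarters of the input, so a vpermq first moves those
+   quarters into the right 128-bit lanes and a single intra-lane shuffle
+   (e.g. vpshufb) then finishes the permutation.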
*/ + +static bool +expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d) +{ + struct expand_vec_perm_d dremap, dfinal; + unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4; + unsigned contents[2]; + bool ok; + + if (!(TARGET_AVX2 + && (d->vmode == V32QImode || d->vmode == V16HImode) + && d->one_operand_p)) + return false; + + contents[0] = 0; + contents[1] = 0; + for (i = 0; i < nelt2; ++i) + { + contents[0] |= 1u << (d->perm[i] / nelt4); + contents[1] |= 1u << (d->perm[i + nelt2] / nelt4); + } + + for (i = 0; i < 2; ++i) + { + unsigned int cnt = 0; + for (j = 0; j < 4; ++j) + if ((contents[i] & (1u << j)) != 0 && ++cnt > 2) + return false; + } + + if (d->testing_p) + return true; + + dremap = *d; + dremap.vmode = V4DImode; + dremap.nelt = 4; + dremap.target = gen_reg_rtx (V4DImode); + dremap.op0 = gen_lowpart (V4DImode, d->op0); + dremap.op1 = dremap.op0; + dremap.one_operand_p = true; + for (i = 0; i < 2; ++i) + { + unsigned int cnt = 0; + for (j = 0; j < 4; ++j) + if ((contents[i] & (1u << j)) != 0) + dremap.perm[2 * i + cnt++] = j; + for (; cnt < 2; ++cnt) + dremap.perm[2 * i + cnt] = 0; + } + + dfinal = *d; + dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target); + dfinal.op1 = dfinal.op0; + dfinal.one_operand_p = true; + for (i = 0, j = 0; i < nelt; ++i) + { + if (i == nelt2) + j = 2; + dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0); + if ((d->perm[i] / nelt4) == dremap.perm[j]) + ; + else if ((d->perm[i] / nelt4) == dremap.perm[j + 1]) + dfinal.perm[i] |= nelt4; + else + gcc_unreachable (); + } + + ok = expand_vec_perm_1 (&dremap); + gcc_assert (ok); + + ok = expand_vec_perm_1 (&dfinal); + gcc_assert (ok); + + return true; +} + +static bool canonicalize_perm (struct expand_vec_perm_d *d); + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand + a vector permutation using two instructions, vperm2f128 resp. + vperm2i128 followed by any single in-lane permutation. */ + +static bool +expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d) +{ + struct expand_vec_perm_d dfirst, dsecond; + unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm; + bool ok; + + if (!TARGET_AVX + || GET_MODE_SIZE (d->vmode) != 32 + || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)) + return false; + + dsecond = *d; + dsecond.one_operand_p = false; + dsecond.testing_p = true; + + /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128 + immediate. For perm < 16 the second permutation uses + d->op0 as first operand, for perm >= 16 it uses d->op1 + as first operand. The second operand is the result of + vperm2[fi]128. */ + for (perm = 0; perm < 32; perm++) + { + /* Ignore permutations which do not move anything cross-lane. */ + if (perm < 16) + { + /* The second shuffle for e.g. V4DFmode has + 0123 and ABCD operands. + Ignore AB23, as 23 is already in the second lane + of the first operand. */ + if ((perm & 0xc) == (1 << 2)) continue; + /* And 01CD, as 01 is in the first lane of the first + operand. */ + if ((perm & 3) == 0) continue; + /* And 4567, as then the vperm2[fi]128 doesn't change + anything on the original 4567 second operand. */ + if ((perm & 0xf) == ((3 << 2) | 2)) continue; + } + else + { + /* The second shuffle for e.g. V4DFmode has + 4567 and ABCD operands. + Ignore AB67, as 67 is already in the second lane + of the first operand. */ + if ((perm & 0xc) == (3 << 2)) continue; + /* And 45CD, as 45 is in the first lane of the first + operand. 
*/ + if ((perm & 3) == 2) continue; + /* And 0123, as then the vperm2[fi]128 doesn't change + anything on the original 0123 first operand. */ + if ((perm & 0xf) == (1 << 2)) continue; + } + + for (i = 0; i < nelt; i++) + { + j = d->perm[i] / nelt2; + if (j == ((perm >> (2 * (i >= nelt2))) & 3)) + dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1)); + else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16)) + dsecond.perm[i] = d->perm[i] & (nelt - 1); + else + break; + } + + if (i == nelt) + { + start_sequence (); + ok = expand_vec_perm_1 (&dsecond); + end_sequence (); + } + else + ok = false; + + if (ok) + { + if (d->testing_p) + return true; + + /* Found a usable second shuffle. dfirst will be + vperm2f128 on d->op0 and d->op1. */ + dsecond.testing_p = false; + dfirst = *d; + dfirst.target = gen_reg_rtx (d->vmode); + for (i = 0; i < nelt; i++) + dfirst.perm[i] = (i & (nelt2 - 1)) + + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2; + + canonicalize_perm (&dfirst); + ok = expand_vec_perm_1 (&dfirst); + gcc_assert (ok); + + /* And dsecond is some single insn shuffle, taking + d->op0 and result of vperm2f128 (if perm < 16) or + d->op1 and result of vperm2f128 (otherwise). */ + if (perm >= 16) + dsecond.op0 = dsecond.op1; + dsecond.op1 = dfirst.target; + + ok = expand_vec_perm_1 (&dsecond); + gcc_assert (ok); + + return true; + } + + /* For one operand, the only useful vperm2f128 permutation is 0x01 + aka lanes swap. */ + if (d->one_operand_p) + return false; + } + + return false; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify + a two vector permutation using 2 intra-lane interleave insns + and cross-lane shuffle for 32-byte vectors. */ + +static bool +expand_vec_perm_interleave3 (struct expand_vec_perm_d *d) +{ + unsigned i, nelt; + rtx (*gen) (rtx, rtx, rtx); + + if (d->one_operand_p) + return false; + if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32) + ; + else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode)) + ; + else + return false; + + nelt = d->nelt; + if (d->perm[0] != 0 && d->perm[0] != nelt / 2) + return false; + for (i = 0; i < nelt; i += 2) + if (d->perm[i] != d->perm[0] + i / 2 + || d->perm[i + 1] != d->perm[0] + i / 2 + nelt) + return false; + + if (d->testing_p) + return true; + + switch (d->vmode) + { + case E_V32QImode: + if (d->perm[0]) + gen = gen_vec_interleave_highv32qi; + else + gen = gen_vec_interleave_lowv32qi; + break; + case E_V16HImode: + if (d->perm[0]) + gen = gen_vec_interleave_highv16hi; + else + gen = gen_vec_interleave_lowv16hi; + break; + case E_V8SImode: + if (d->perm[0]) + gen = gen_vec_interleave_highv8si; + else + gen = gen_vec_interleave_lowv8si; + break; + case E_V4DImode: + if (d->perm[0]) + gen = gen_vec_interleave_highv4di; + else + gen = gen_vec_interleave_lowv4di; + break; + case E_V8SFmode: + if (d->perm[0]) + gen = gen_vec_interleave_highv8sf; + else + gen = gen_vec_interleave_lowv8sf; + break; + case E_V4DFmode: + if (d->perm[0]) + gen = gen_vec_interleave_highv4df; + else + gen = gen_vec_interleave_lowv4df; + break; + default: + gcc_unreachable (); + } + + emit_insn (gen (d->target, d->op0, d->op1)); + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement + a single vector permutation using a single intra-lane vector + permutation, vperm2f128 swapping the lanes and vblend* insn blending + the non-swapped and swapped vectors together. 
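+   A worked example with a hypothetical V4DFmode permutation: { 2, 1, 0, 3 }
+   keeps elements 1 and 3 in place while elements 0 and 2 come from the other
+   lane; the intra-lane shuffle is then the identity, the vperm2f128 lane swap
+   supplies the crossed elements, and vblendpd with mask 0x5 merges the two.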
*/ + +static bool +expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d) +{ + struct expand_vec_perm_d dfirst, dsecond; + unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2; + rtx_insn *seq; + bool ok; + rtx (*blend) (rtx, rtx, rtx, rtx) = NULL; + + if (!TARGET_AVX + || TARGET_AVX2 + || (d->vmode != V8SFmode && d->vmode != V4DFmode) + || !d->one_operand_p) + return false; + + dfirst = *d; + for (i = 0; i < nelt; i++) + dfirst.perm[i] = 0xff; + for (i = 0, msk = 0; i < nelt; i++) + { + j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2; + if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i]) + return false; + dfirst.perm[j] = d->perm[i]; + if (j != i) + msk |= (1 << i); + } + for (i = 0; i < nelt; i++) + if (dfirst.perm[i] == 0xff) + dfirst.perm[i] = i; + + if (!d->testing_p) + dfirst.target = gen_reg_rtx (dfirst.vmode); + + start_sequence (); + ok = expand_vec_perm_1 (&dfirst); + seq = get_insns (); + end_sequence (); + + if (!ok) + return false; + + if (d->testing_p) + return true; + + emit_insn (seq); + + dsecond = *d; + dsecond.op0 = dfirst.target; + dsecond.op1 = dfirst.target; + dsecond.one_operand_p = true; + dsecond.target = gen_reg_rtx (dsecond.vmode); + for (i = 0; i < nelt; i++) + dsecond.perm[i] = i ^ nelt2; + + ok = expand_vec_perm_1 (&dsecond); + gcc_assert (ok); + + blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256; + emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk))); + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF + permutation using two vperm2f128, followed by a vshufpd insn blending + the two vectors together. */ + +static bool +expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d) +{ + struct expand_vec_perm_d dfirst, dsecond, dthird; + bool ok; + + if (!TARGET_AVX || (d->vmode != V4DFmode)) + return false; + + if (d->testing_p) + return true; + + dfirst = *d; + dsecond = *d; + dthird = *d; + + dfirst.perm[0] = (d->perm[0] & ~1); + dfirst.perm[1] = (d->perm[0] & ~1) + 1; + dfirst.perm[2] = (d->perm[2] & ~1); + dfirst.perm[3] = (d->perm[2] & ~1) + 1; + dsecond.perm[0] = (d->perm[1] & ~1); + dsecond.perm[1] = (d->perm[1] & ~1) + 1; + dsecond.perm[2] = (d->perm[3] & ~1); + dsecond.perm[3] = (d->perm[3] & ~1) + 1; + dthird.perm[0] = (d->perm[0] % 2); + dthird.perm[1] = (d->perm[1] % 2) + 4; + dthird.perm[2] = (d->perm[2] % 2) + 2; + dthird.perm[3] = (d->perm[3] % 2) + 6; + + dfirst.target = gen_reg_rtx (dfirst.vmode); + dsecond.target = gen_reg_rtx (dsecond.vmode); + dthird.op0 = dfirst.target; + dthird.op1 = dsecond.target; + dthird.one_operand_p = false; + + canonicalize_perm (&dfirst); + canonicalize_perm (&dsecond); + + ok = expand_vec_perm_1 (&dfirst) + && expand_vec_perm_1 (&dsecond) + && expand_vec_perm_1 (&dthird); + + gcc_assert (ok); + + return true; +} + +/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word + permutation with two pshufb insns and an ior. We should have already + failed all two instruction sequences. */ + +static bool +expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d) +{ + rtx rperm[2][16], vperm, l, h, op, m128; + unsigned int i, nelt, eltsz; + + if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) + return false; + gcc_assert (!d->one_operand_p); + + if (d->testing_p) + return true; + + nelt = d->nelt; + eltsz = GET_MODE_UNIT_SIZE (d->vmode); + + /* Generate two permutation masks. If the required element is within + the given vector it is shuffled into the proper lane. 
If the required + element is in the other vector, force a zero into the lane by setting + bit 7 in the permutation mask. */ + m128 = GEN_INT (-128); + for (i = 0; i < nelt; ++i) + { + unsigned j, e = d->perm[i]; + unsigned which = (e >= nelt); + if (e >= nelt) + e -= nelt; + + for (j = 0; j < eltsz; ++j) + { + rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j); + rperm[1-which][i*eltsz + j] = m128; + } + } + + vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0])); + vperm = force_reg (V16QImode, vperm); + + l = gen_reg_rtx (V16QImode); + op = gen_lowpart (V16QImode, d->op0); + emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm)); + + vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1])); + vperm = force_reg (V16QImode, vperm); + + h = gen_reg_rtx (V16QImode); + op = gen_lowpart (V16QImode, d->op1); + emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm)); + + op = d->target; + if (d->vmode != V16QImode) + op = gen_reg_rtx (V16QImode); + emit_insn (gen_iorv16qi3 (op, l, h)); + if (op != d->target) + emit_move_insn (d->target, gen_lowpart (d->vmode, op)); + + return true; +} + +/* Implement arbitrary permutation of one V32QImode and V16QImode operand + with two vpshufb insns, vpermq and vpor. We should have already failed + all two or three instruction sequences. */ + +static bool +expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d) +{ + rtx rperm[2][32], vperm, l, h, hp, op, m128; + unsigned int i, nelt, eltsz; + + if (!TARGET_AVX2 + || !d->one_operand_p + || (d->vmode != V32QImode && d->vmode != V16HImode)) + return false; + + if (d->testing_p) + return true; + + nelt = d->nelt; + eltsz = GET_MODE_UNIT_SIZE (d->vmode); + + /* Generate two permutation masks. If the required element is within + the same lane, it is shuffled in. If the required element from the + other lane, force a zero by setting bit 7 in the permutation mask. + In the other mask the mask has non-negative elements if element + is requested from the other lane, but also moved to the other lane, + so that the result of vpshufb can have the two V2TImode halves + swapped. */ + m128 = GEN_INT (-128); + for (i = 0; i < nelt; ++i) + { + unsigned j, e = d->perm[i] & (nelt / 2 - 1); + unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz; + + for (j = 0; j < eltsz; ++j) + { + rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j); + rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128; + } + } + + vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1])); + vperm = force_reg (V32QImode, vperm); + + h = gen_reg_rtx (V32QImode); + op = gen_lowpart (V32QImode, d->op0); + emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm)); + + /* Swap the 128-byte lanes of h into hp. */ + hp = gen_reg_rtx (V4DImode); + op = gen_lowpart (V4DImode, h); + emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx, + const1_rtx)); + + vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0])); + vperm = force_reg (V32QImode, vperm); + + l = gen_reg_rtx (V32QImode); + op = gen_lowpart (V32QImode, d->op0); + emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm)); + + op = d->target; + if (d->vmode != V32QImode) + op = gen_reg_rtx (V32QImode); + emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp))); + if (op != d->target) + emit_move_insn (d->target, gen_lowpart (d->vmode, op)); + + return true; +} + +/* A subroutine of expand_vec_perm_even_odd_1. 
Implement extract-even + and extract-odd permutations of two V32QImode and V16QImode operand + with two vpshufb insns, vpor and vpermq. We should have already + failed all two or three instruction sequences. */ + +static bool +expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d) +{ + rtx rperm[2][32], vperm, l, h, ior, op, m128; + unsigned int i, nelt, eltsz; + + if (!TARGET_AVX2 + || d->one_operand_p + || (d->vmode != V32QImode && d->vmode != V16HImode)) + return false; + + for (i = 0; i < d->nelt; ++i) + if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2)) + return false; + + if (d->testing_p) + return true; + + nelt = d->nelt; + eltsz = GET_MODE_UNIT_SIZE (d->vmode); + + /* Generate two permutation masks. In the first permutation mask + the first quarter will contain indexes for the first half + of the op0, the second quarter will contain bit 7 set, third quarter + will contain indexes for the second half of the op0 and the + last quarter bit 7 set. In the second permutation mask + the first quarter will contain bit 7 set, the second quarter + indexes for the first half of the op1, the third quarter bit 7 set + and last quarter indexes for the second half of the op1. + I.e. the first mask e.g. for V32QImode extract even will be: + 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128 + (all values masked with 0xf except for -128) and second mask + for extract even will be + -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */ + m128 = GEN_INT (-128); + for (i = 0; i < nelt; ++i) + { + unsigned j, e = d->perm[i] & (nelt / 2 - 1); + unsigned which = d->perm[i] >= nelt; + unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0; + + for (j = 0; j < eltsz; ++j) + { + rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j); + rperm[1 - which][(i * eltsz + j) ^ xorv] = m128; + } + } + + vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0])); + vperm = force_reg (V32QImode, vperm); + + l = gen_reg_rtx (V32QImode); + op = gen_lowpart (V32QImode, d->op0); + emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm)); + + vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1])); + vperm = force_reg (V32QImode, vperm); + + h = gen_reg_rtx (V32QImode); + op = gen_lowpart (V32QImode, d->op1); + emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm)); + + ior = gen_reg_rtx (V32QImode); + emit_insn (gen_iorv32qi3 (ior, l, h)); + + /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */ + op = gen_reg_rtx (V4DImode); + ior = gen_lowpart (V4DImode, ior); + emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx, + const1_rtx, GEN_INT (3))); + emit_move_insn (d->target, gen_lowpart (d->vmode, op)); + + return true; +} + +/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even + and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands + with two "and" and "pack" or two "shift" and "pack" insns. We should + have already failed all two instruction sequences. */ + +static bool +expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d) +{ + rtx op, dop0, dop1, t; + unsigned i, odd, c, s, nelt = d->nelt; + bool end_perm = false; + machine_mode half_mode; + rtx (*gen_and) (rtx, rtx, rtx); + rtx (*gen_pack) (rtx, rtx, rtx); + rtx (*gen_shift) (rtx, rtx, rtx); + + if (d->one_operand_p) + return false; + + switch (d->vmode) + { + case E_V8HImode: + /* Required for "pack". 
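+	 packusdw, the dword-to-word pack used for this mode, is an SSE4.1
+	 instruction, whereas the V16QImode case below can use the plain
+	 SSE2 packuswb.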
*/ + if (!TARGET_SSE4_1) + return false; + c = 0xffff; + s = 16; + half_mode = V4SImode; + gen_and = gen_andv4si3; + gen_pack = gen_sse4_1_packusdw; + gen_shift = gen_lshrv4si3; + break; + case E_V16QImode: + /* No check as all instructions are SSE2. */ + c = 0xff; + s = 8; + half_mode = V8HImode; + gen_and = gen_andv8hi3; + gen_pack = gen_sse2_packuswb; + gen_shift = gen_lshrv8hi3; + break; + case E_V16HImode: + if (!TARGET_AVX2) + return false; + c = 0xffff; + s = 16; + half_mode = V8SImode; + gen_and = gen_andv8si3; + gen_pack = gen_avx2_packusdw; + gen_shift = gen_lshrv8si3; + end_perm = true; + break; + case E_V32QImode: + if (!TARGET_AVX2) + return false; + c = 0xff; + s = 8; + half_mode = V16HImode; + gen_and = gen_andv16hi3; + gen_pack = gen_avx2_packuswb; + gen_shift = gen_lshrv16hi3; + end_perm = true; + break; + default: + /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than + general shuffles. */ + return false; + } + + /* Check that permutation is even or odd. */ + odd = d->perm[0]; + if (odd > 1) + return false; + + for (i = 1; i < nelt; ++i) + if (d->perm[i] != 2 * i + odd) + return false; + + if (d->testing_p) + return true; + + dop0 = gen_reg_rtx (half_mode); + dop1 = gen_reg_rtx (half_mode); + if (odd == 0) + { + t = gen_const_vec_duplicate (half_mode, GEN_INT (c)); + t = force_reg (half_mode, t); + emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0))); + emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1))); + } + else + { + emit_insn (gen_shift (dop0, + gen_lowpart (half_mode, d->op0), + GEN_INT (s))); + emit_insn (gen_shift (dop1, + gen_lowpart (half_mode, d->op1), + GEN_INT (s))); + } + /* In AVX2 for 256 bit case we need to permute pack result. */ + if (TARGET_AVX2 && end_perm) + { + op = gen_reg_rtx (d->vmode); + t = gen_reg_rtx (V4DImode); + emit_insn (gen_pack (op, dop0, dop1)); + emit_insn (gen_avx2_permv4di_1 (t, + gen_lowpart (V4DImode, op), + const0_rtx, + const2_rtx, + const1_rtx, + GEN_INT (3))); + emit_move_insn (d->target, gen_lowpart (d->vmode, t)); + } + else + emit_insn (gen_pack (d->target, dop0, dop1)); + + return true; +} + +/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even + and extract-odd permutations of two V64QI operands + with two "shifts", two "truncs" and one "concat" insns for "odd" + and two "truncs" and one concat insn for "even." + Have already failed all two instruction sequences. */ + +static bool +expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d) +{ + rtx t1, t2, t3, t4; + unsigned i, odd, nelt = d->nelt; + + if (!TARGET_AVX512BW + || d->one_operand_p + || d->vmode != V64QImode) + return false; + + /* Check that permutation is even or odd. */ + odd = d->perm[0]; + if (odd > 1) + return false; + + for (i = 1; i < nelt; ++i) + if (d->perm[i] != 2 * i + odd) + return false; + + if (d->testing_p) + return true; + + + if (odd) + { + t1 = gen_reg_rtx (V32HImode); + t2 = gen_reg_rtx (V32HImode); + emit_insn (gen_lshrv32hi3 (t1, + gen_lowpart (V32HImode, d->op0), + GEN_INT (8))); + emit_insn (gen_lshrv32hi3 (t2, + gen_lowpart (V32HImode, d->op1), + GEN_INT (8))); + } + else + { + t1 = gen_lowpart (V32HImode, d->op0); + t2 = gen_lowpart (V32HImode, d->op1); + } + + t3 = gen_reg_rtx (V32QImode); + t4 = gen_reg_rtx (V32QImode); + emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1)); + emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2)); + emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4)); + + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. 
Implement extract-even + and extract-odd permutations. */ + +static bool +expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) +{ + rtx t1, t2, t3, t4, t5; + + switch (d->vmode) + { + case E_V4DFmode: + if (d->testing_p) + break; + t1 = gen_reg_rtx (V4DFmode); + t2 = gen_reg_rtx (V4DFmode); + + /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */ + emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20))); + emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31))); + + /* Now an unpck[lh]pd will produce the result required. */ + if (odd) + t3 = gen_avx_unpckhpd256 (d->target, t1, t2); + else + t3 = gen_avx_unpcklpd256 (d->target, t1, t2); + emit_insn (t3); + break; + + case E_V8SFmode: + { + int mask = odd ? 0xdd : 0x88; + + if (d->testing_p) + break; + t1 = gen_reg_rtx (V8SFmode); + t2 = gen_reg_rtx (V8SFmode); + t3 = gen_reg_rtx (V8SFmode); + + /* Shuffle within the 128-bit lanes to produce: + { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */ + emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1, + GEN_INT (mask))); + + /* Shuffle the lanes around to produce: + { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */ + emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1, + GEN_INT (0x3))); + + /* Shuffle within the 128-bit lanes to produce: + { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */ + emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44))); + + /* Shuffle within the 128-bit lanes to produce: + { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */ + emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee))); + + /* Shuffle the lanes around to produce: + { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */ + emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2, + GEN_INT (0x20))); + } + break; + + case E_V2DFmode: + case E_V4SFmode: + case E_V2DImode: + case E_V4SImode: + /* These are always directly implementable by expand_vec_perm_1. */ + gcc_unreachable (); + + case E_V8HImode: + if (TARGET_SSE4_1) + return expand_vec_perm_even_odd_pack (d); + else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB) + return expand_vec_perm_pshufb2 (d); + else + { + if (d->testing_p) + break; + /* We need 2*log2(N)-1 operations to achieve odd/even + with interleave. 
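+	     For N == 8 that is five punpcks; e.g. extracting the even
+	     elements of { a0 ... a7 } and { b0 ... b7 }:
+	       lo  = { a0 b0 a1 b1 a2 b2 a3 b3 }
+	       hi  = { a4 b4 a5 b5 a6 b6 a7 b7 }
+	       lo' = low (lo, hi)  = { a0 a4 b0 b4 a1 a5 b1 b5 }
+	       hi' = high (lo, hi) = { a2 a6 b2 b6 a3 a7 b3 b7 }
+	       low (lo', hi')      = { a0 a2 a4 a6 b0 b2 b4 b6 }
+	     while the final interleave-high gives the odd elements
+	     instead.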
*/ + t1 = gen_reg_rtx (V8HImode); + t2 = gen_reg_rtx (V8HImode); + emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1)); + emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1)); + emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1)); + emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1)); + if (odd) + t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2); + else + t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2); + emit_insn (t3); + } + break; + + case E_V16QImode: + return expand_vec_perm_even_odd_pack (d); + + case E_V16HImode: + case E_V32QImode: + return expand_vec_perm_even_odd_pack (d); + + case E_V64QImode: + return expand_vec_perm_even_odd_trunc (d); + + case E_V4DImode: + if (!TARGET_AVX2) + { + struct expand_vec_perm_d d_copy = *d; + d_copy.vmode = V4DFmode; + if (d->testing_p) + d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1); + else + d_copy.target = gen_reg_rtx (V4DFmode); + d_copy.op0 = gen_lowpart (V4DFmode, d->op0); + d_copy.op1 = gen_lowpart (V4DFmode, d->op1); + if (expand_vec_perm_even_odd_1 (&d_copy, odd)) + { + if (!d->testing_p) + emit_move_insn (d->target, + gen_lowpart (V4DImode, d_copy.target)); + return true; + } + return false; + } + + if (d->testing_p) + break; + + t1 = gen_reg_rtx (V4DImode); + t2 = gen_reg_rtx (V4DImode); + + /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */ + emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20))); + emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31))); + + /* Now an vpunpck[lh]qdq will produce the result required. */ + if (odd) + t3 = gen_avx2_interleave_highv4di (d->target, t1, t2); + else + t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2); + emit_insn (t3); + break; + + case E_V8SImode: + if (!TARGET_AVX2) + { + struct expand_vec_perm_d d_copy = *d; + d_copy.vmode = V8SFmode; + if (d->testing_p) + d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1); + else + d_copy.target = gen_reg_rtx (V8SFmode); + d_copy.op0 = gen_lowpart (V8SFmode, d->op0); + d_copy.op1 = gen_lowpart (V8SFmode, d->op1); + if (expand_vec_perm_even_odd_1 (&d_copy, odd)) + { + if (!d->testing_p) + emit_move_insn (d->target, + gen_lowpart (V8SImode, d_copy.target)); + return true; + } + return false; + } + + if (d->testing_p) + break; + + t1 = gen_reg_rtx (V8SImode); + t2 = gen_reg_rtx (V8SImode); + t3 = gen_reg_rtx (V4DImode); + t4 = gen_reg_rtx (V4DImode); + t5 = gen_reg_rtx (V4DImode); + + /* Shuffle the lanes around into + { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */ + emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0), + gen_lowpart (V4DImode, d->op1), + GEN_INT (0x20))); + emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0), + gen_lowpart (V4DImode, d->op1), + GEN_INT (0x31))); + + /* Swap the 2nd and 3rd position in each lane into + { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */ + emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3), + GEN_INT (2 * 4 + 1 * 16 + 3 * 64))); + emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4), + GEN_INT (2 * 4 + 1 * 16 + 3 * 64))); + + /* Now an vpunpck[lh]qdq will produce + { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. 
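+	 (Viewed as V4DImode, t1 is { (0 2) (1 3) (8 a) (9 b) } and t2 is
+	 { (4 6) (5 7) (c e) (d f) }; interleaving their low resp. high
+	 quadwords within each 128-bit lane pairs them up as above.)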
*/ + if (odd) + t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1), + gen_lowpart (V4DImode, t2)); + else + t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1), + gen_lowpart (V4DImode, t2)); + emit_insn (t3); + emit_move_insn (d->target, gen_lowpart (V8SImode, t5)); + break; + + default: + gcc_unreachable (); + } + + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match + extract-even and extract-odd permutations. */ + +static bool +expand_vec_perm_even_odd (struct expand_vec_perm_d *d) +{ + unsigned i, odd, nelt = d->nelt; + + odd = d->perm[0]; + if (odd != 0 && odd != 1) + return false; + + for (i = 1; i < nelt; ++i) + if (d->perm[i] != 2 * i + odd) + return false; + + return expand_vec_perm_even_odd_1 (d, odd); +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast + permutations. We assume that expand_vec_perm_1 has already failed. */ + +static bool +expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) +{ + unsigned elt = d->perm[0], nelt2 = d->nelt / 2; + machine_mode vmode = d->vmode; + unsigned char perm2[4]; + rtx op0 = d->op0, dest; + bool ok; + + switch (vmode) + { + case E_V4DFmode: + case E_V8SFmode: + /* These are special-cased in sse.md so that we can optionally + use the vbroadcast instruction. They expand to two insns + if the input happens to be in a register. */ + gcc_unreachable (); + + case E_V2DFmode: + case E_V2DImode: + case E_V4SFmode: + case E_V4SImode: + /* These are always implementable using standard shuffle patterns. */ + gcc_unreachable (); + + case E_V8HImode: + case E_V16QImode: + /* These can be implemented via interleave. We save one insn by + stopping once we have promoted to V4SImode and then use pshufd. */ + if (d->testing_p) + return true; + do + { + rtx dest; + rtx (*gen) (rtx, rtx, rtx) + = vmode == V16QImode ? gen_vec_interleave_lowv16qi + : gen_vec_interleave_lowv8hi; + + if (elt >= nelt2) + { + gen = vmode == V16QImode ? gen_vec_interleave_highv16qi + : gen_vec_interleave_highv8hi; + elt -= nelt2; + } + nelt2 /= 2; + + dest = gen_reg_rtx (vmode); + emit_insn (gen (dest, op0, op0)); + vmode = get_mode_wider_vector (vmode); + op0 = gen_lowpart (vmode, dest); + } + while (vmode != V4SImode); + + memset (perm2, elt, 4); + dest = gen_reg_rtx (V4SImode); + ok = expand_vselect (dest, op0, perm2, 4, d->testing_p); + gcc_assert (ok); + if (!d->testing_p) + emit_move_insn (d->target, gen_lowpart (d->vmode, dest)); + return true; + + case E_V64QImode: + case E_V32QImode: + case E_V16HImode: + case E_V8SImode: + case E_V4DImode: + /* For AVX2 broadcasts of the first element vpbroadcast* or + vpermq should be used by expand_vec_perm_1. */ + gcc_assert (!TARGET_AVX2 || d->perm[0]); + return false; + + default: + gcc_unreachable (); + } +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match + broadcast permutations. */ + +static bool +expand_vec_perm_broadcast (struct expand_vec_perm_d *d) +{ + unsigned i, elt, nelt = d->nelt; + + if (!d->one_operand_p) + return false; + + elt = d->perm[0]; + for (i = 1; i < nelt; ++i) + if (d->perm[i] != elt) + return false; + + return expand_vec_perm_broadcast_1 (d); +} + +/* Implement arbitrary permutations of two V64QImode operands + with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. 
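+   Roughly: every wanted byte sits in some 16-bit word of the operand
+   concatenation, so a V32HImode vperm[it]2w can move that word into the
+   right destination word; one such permutation serves the even
+   destination bytes, the other the odd ones.  A vpshufb on each result
+   then keeps only the wanted byte of every word and zeroes the rest,
+   and the final vpor merges the two halves.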
*/ +static bool +expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d) +{ + if (!TARGET_AVX512BW || !(d->vmode == V64QImode)) + return false; + + if (d->testing_p) + return true; + + struct expand_vec_perm_d ds[2]; + rtx rperm[128], vperm, target0, target1; + unsigned int i, nelt; + machine_mode vmode; + + nelt = d->nelt; + vmode = V64QImode; + + for (i = 0; i < 2; i++) + { + ds[i] = *d; + ds[i].vmode = V32HImode; + ds[i].nelt = 32; + ds[i].target = gen_reg_rtx (V32HImode); + ds[i].op0 = gen_lowpart (V32HImode, d->op0); + ds[i].op1 = gen_lowpart (V32HImode, d->op1); + } + + /* Prepare permutations such that the first one takes care of + putting the even bytes into the right positions or one higher + positions (ds[0]) and the second one takes care of + putting the odd bytes into the right positions or one below + (ds[1]). */ + + for (i = 0; i < nelt; i++) + { + ds[i & 1].perm[i / 2] = d->perm[i] / 2; + if (i & 1) + { + rperm[i] = constm1_rtx; + rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1)); + } + else + { + rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1)); + rperm[i + 64] = constm1_rtx; + } + } + + bool ok = expand_vec_perm_1 (&ds[0]); + gcc_assert (ok); + ds[0].target = gen_lowpart (V64QImode, ds[0].target); + + ok = expand_vec_perm_1 (&ds[1]); + gcc_assert (ok); + ds[1].target = gen_lowpart (V64QImode, ds[1].target); + + vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm)); + vperm = force_reg (vmode, vperm); + target0 = gen_reg_rtx (V64QImode); + emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm)); + + vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64)); + vperm = force_reg (vmode, vperm); + target1 = gen_reg_rtx (V64QImode); + emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm)); + + emit_insn (gen_iorv64qi3 (d->target, target0, target1)); + return true; +} + +/* Implement arbitrary permutation of two V32QImode and V16QImode operands + with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed + all the shorter instruction sequences. */ + +static bool +expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d) +{ + rtx rperm[4][32], vperm, l[2], h[2], op, m128; + unsigned int i, nelt, eltsz; + bool used[4]; + + if (!TARGET_AVX2 + || d->one_operand_p + || (d->vmode != V32QImode && d->vmode != V16HImode)) + return false; + + if (d->testing_p) + return true; + + nelt = d->nelt; + eltsz = GET_MODE_UNIT_SIZE (d->vmode); + + /* Generate 4 permutation masks. If the required element is within + the same lane, it is shuffled in. If the required element from the + other lane, force a zero by setting bit 7 in the permutation mask. + In the other mask the mask has non-negative elements if element + is requested from the other lane, but also moved to the other lane, + so that the result of vpshufb can have the two V2TImode halves + swapped. */ + m128 = GEN_INT (-128); + for (i = 0; i < 32; ++i) + { + rperm[0][i] = m128; + rperm[1][i] = m128; + rperm[2][i] = m128; + rperm[3][i] = m128; + } + used[0] = false; + used[1] = false; + used[2] = false; + used[3] = false; + for (i = 0; i < nelt; ++i) + { + unsigned j, e = d->perm[i] & (nelt / 2 - 1); + unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz; + unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 
1 : 0); + + for (j = 0; j < eltsz; ++j) + rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j); + used[which] = true; + } + + for (i = 0; i < 2; ++i) + { + if (!used[2 * i + 1]) + { + h[i] = NULL_RTX; + continue; + } + vperm = gen_rtx_CONST_VECTOR (V32QImode, + gen_rtvec_v (32, rperm[2 * i + 1])); + vperm = force_reg (V32QImode, vperm); + h[i] = gen_reg_rtx (V32QImode); + op = gen_lowpart (V32QImode, i ? d->op1 : d->op0); + emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm)); + } + + /* Swap the 128-byte lanes of h[X]. */ + for (i = 0; i < 2; ++i) + { + if (h[i] == NULL_RTX) + continue; + op = gen_reg_rtx (V4DImode); + emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]), + const2_rtx, GEN_INT (3), const0_rtx, + const1_rtx)); + h[i] = gen_lowpart (V32QImode, op); + } + + for (i = 0; i < 2; ++i) + { + if (!used[2 * i]) + { + l[i] = NULL_RTX; + continue; + } + vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i])); + vperm = force_reg (V32QImode, vperm); + l[i] = gen_reg_rtx (V32QImode); + op = gen_lowpart (V32QImode, i ? d->op1 : d->op0); + emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm)); + } + + for (i = 0; i < 2; ++i) + { + if (h[i] && l[i]) + { + op = gen_reg_rtx (V32QImode); + emit_insn (gen_iorv32qi3 (op, l[i], h[i])); + l[i] = op; + } + else if (h[i]) + l[i] = h[i]; + } + + gcc_assert (l[0] && l[1]); + op = d->target; + if (d->vmode != V32QImode) + op = gen_reg_rtx (V32QImode); + emit_insn (gen_iorv32qi3 (op, l[0], l[1])); + if (op != d->target) + emit_move_insn (d->target, gen_lowpart (d->vmode, op)); + return true; +} + +/* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits + taken care of, perform the expansion in D and return true on success. */ + +static bool +ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) +{ + /* Try a single instruction expansion. */ + if (expand_vec_perm_1 (d)) + return true; + + /* Try sequences of two instructions. */ + + if (expand_vec_perm_pshuflw_pshufhw (d)) + return true; + + if (expand_vec_perm_palignr (d, false)) + return true; + + if (expand_vec_perm_interleave2 (d)) + return true; + + if (expand_vec_perm_broadcast (d)) + return true; + + if (expand_vec_perm_vpermq_perm_1 (d)) + return true; + + if (expand_vec_perm_vperm2f128 (d)) + return true; + + if (expand_vec_perm_pblendv (d)) + return true; + + /* Try sequences of three instructions. */ + + if (expand_vec_perm_even_odd_pack (d)) + return true; + + if (expand_vec_perm_2vperm2f128_vshuf (d)) + return true; + + if (expand_vec_perm_pshufb2 (d)) + return true; + + if (expand_vec_perm_interleave3 (d)) + return true; + + if (expand_vec_perm_vperm2f128_vblend (d)) + return true; + + /* Try sequences of four instructions. */ + + if (expand_vec_perm_even_odd_trunc (d)) + return true; + if (expand_vec_perm_vpshufb2_vpermq (d)) + return true; + + if (expand_vec_perm_vpshufb2_vpermq_even_odd (d)) + return true; + + if (expand_vec_perm_vpermt2_vpshub2 (d)) + return true; + + /* ??? Look for narrow permutations whose element orderings would + allow the promotion to a wider mode. */ + + /* ??? Look for sequences of interleave or a wider permute that place + the data into the correct lanes for a half-vector shuffle like + pshuf[lh]w or vpermilps. */ + + /* ??? Look for sequences of interleave that produce the desired results. + The combinatorics of punpck[lh] get pretty ugly... */ + + if (expand_vec_perm_even_odd (d)) + return true; + + /* Even longer sequences. 
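+     (expand_vec_perm_vpshufb4_vpermq2 may need up to nine insns:
+     4 vpshufb, 2 vpermq and 3 vpor.)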
*/ + if (expand_vec_perm_vpshufb4_vpermq2 (d)) + return true; + + /* See if we can get the same permutation in different vector integer + mode. */ + struct expand_vec_perm_d nd; + if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) + { + if (!d->testing_p) + emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target)); + return true; + } + + return false; +} + +/* If a permutation only uses one operand, make it clear. Returns true + if the permutation references both operands. */ + +static bool +canonicalize_perm (struct expand_vec_perm_d *d) +{ + int i, which, nelt = d->nelt; + + for (i = which = 0; i < nelt; ++i) + which |= (d->perm[i] < nelt ? 1 : 2); + + d->one_operand_p = true; + switch (which) + { + default: + gcc_unreachable(); + + case 3: + if (!rtx_equal_p (d->op0, d->op1)) + { + d->one_operand_p = false; + break; + } + /* The elements of PERM do not suggest that only the first operand + is used, but both operands are identical. Allow easier matching + of the permutation by folding the permutation into the single + input vector. */ + /* FALLTHRU */ + + case 2: + for (i = 0; i < nelt; ++i) + d->perm[i] &= nelt - 1; + d->op0 = d->op1; + break; + + case 1: + d->op1 = d->op0; + break; + } + + return (which == 3); +} + +/* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */ + +bool +ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, + rtx op1, const vec_perm_indices &sel) +{ + struct expand_vec_perm_d d; + unsigned char perm[MAX_VECT_LEN]; + unsigned int i, nelt, which; + bool two_args; + + d.target = target; + d.op0 = op0; + d.op1 = op1; + + d.vmode = vmode; + gcc_assert (VECTOR_MODE_P (d.vmode)); + d.nelt = nelt = GET_MODE_NUNITS (d.vmode); + d.testing_p = !target; + + gcc_assert (sel.length () == nelt); + gcc_checking_assert (sizeof (d.perm) == sizeof (perm)); + + /* Given sufficient ISA support we can just return true here + for selected vector modes. */ + switch (d.vmode) + { + case E_V16SFmode: + case E_V16SImode: + case E_V8DImode: + case E_V8DFmode: + if (!TARGET_AVX512F) + return false; + /* All implementable with a single vperm[it]2 insn. */ + if (d.testing_p) + return true; + break; + case E_V32HImode: + if (!TARGET_AVX512BW) + return false; + if (d.testing_p) + /* All implementable with a single vperm[it]2 insn. */ + return true; + break; + case E_V64QImode: + if (!TARGET_AVX512BW) + return false; + if (d.testing_p) + /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */ + return true; + break; + case E_V8SImode: + case E_V8SFmode: + case E_V4DFmode: + case E_V4DImode: + if (!TARGET_AVX) + return false; + if (d.testing_p && TARGET_AVX512VL) + /* All implementable with a single vperm[it]2 insn. */ + return true; + break; + case E_V16HImode: + if (!TARGET_SSE2) + return false; + if (d.testing_p && TARGET_AVX2) + /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ + return true; + break; + case E_V32QImode: + if (!TARGET_SSE2) + return false; + if (d.testing_p && TARGET_AVX2) + /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ + return true; + break; + case E_V8HImode: + case E_V16QImode: + if (!TARGET_SSE2) + return false; + /* Fall through. */ + case E_V4SImode: + case E_V4SFmode: + if (!TARGET_SSE) + return false; + /* All implementable with a single vpperm insn. */ + if (d.testing_p && TARGET_XOP) + return true; + /* All implementable with 2 pshufb + 1 ior. 
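+	 (A selector byte with bit 7 set makes pshufb write zero, so each
+	 operand can be shuffled separately and the two results combined
+	 with por; see expand_vec_perm_pshufb2.)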
*/ + if (d.testing_p && TARGET_SSSE3) + return true; + break; + case E_V2DImode: + case E_V2DFmode: + if (!TARGET_SSE) + return false; + /* All implementable with shufpd or unpck[lh]pd. */ + if (d.testing_p) + return true; + break; + default: + return false; + } + + for (i = which = 0; i < nelt; ++i) + { + unsigned char e = sel[i]; + gcc_assert (e < 2 * nelt); + d.perm[i] = e; + perm[i] = e; + which |= (e < nelt ? 1 : 2); + } + + if (d.testing_p) + { + /* For all elements from second vector, fold the elements to first. */ + if (which == 2) + for (i = 0; i < nelt; ++i) + d.perm[i] -= nelt; + + /* Check whether the mask can be applied to the vector type. */ + d.one_operand_p = (which != 3); + + /* Implementable with shufps or pshufd. */ + if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode)) + return true; + + /* Otherwise we have to go through the motions and see if we can + figure out how to generate the requested permutation. */ + d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1); + d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2); + if (!d.one_operand_p) + d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3); + + start_sequence (); + bool ret = ix86_expand_vec_perm_const_1 (&d); + end_sequence (); + + return ret; + } + + two_args = canonicalize_perm (&d); + + if (ix86_expand_vec_perm_const_1 (&d)) + return true; + + /* If the selector says both arguments are needed, but the operands are the + same, the above tried to expand with one_operand_p and flattened selector. + If that didn't work, retry without one_operand_p; we succeeded with that + during testing. */ + if (two_args && d.one_operand_p) + { + d.one_operand_p = false; + memcpy (d.perm, perm, sizeof (perm)); + return ix86_expand_vec_perm_const_1 (&d); + } + + return false; +} + +void +ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd) +{ + struct expand_vec_perm_d d; + unsigned i, nelt; + + d.target = targ; + d.op0 = op0; + d.op1 = op1; + d.vmode = GET_MODE (targ); + d.nelt = nelt = GET_MODE_NUNITS (d.vmode); + d.one_operand_p = false; + d.testing_p = false; + + for (i = 0; i < nelt; ++i) + d.perm[i] = i * 2 + odd; + + /* We'll either be able to implement the permutation directly... */ + if (expand_vec_perm_1 (&d)) + return; + + /* ... or we use the special-case patterns. */ + expand_vec_perm_even_odd_1 (&d, odd); +} + +static void +ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p) +{ + struct expand_vec_perm_d d; + unsigned i, nelt, base; + bool ok; + + d.target = targ; + d.op0 = op0; + d.op1 = op1; + d.vmode = GET_MODE (targ); + d.nelt = nelt = GET_MODE_NUNITS (d.vmode); + d.one_operand_p = false; + d.testing_p = false; + + base = high_p ? nelt / 2 : 0; + for (i = 0; i < nelt / 2; ++i) + { + d.perm[i * 2] = i + base; + d.perm[i * 2 + 1] = i + base + nelt; + } + + /* Note that for AVX this isn't one instruction. */ + ok = ix86_expand_vec_perm_const_1 (&d); + gcc_assert (ok); +} + + +/* Expand a vector operation CODE for a V*QImode in terms of the + same operation on V*HImode. 
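+   SSE provides no byte-element multiply or shifts, so the bytes are
+   unpacked into words, the HImode operation is carried out on both
+   halves, and the even bytes of the two word-sized results are then
+   shuffled back together into the QImode destination.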
*/ + +void +ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) +{ + machine_mode qimode = GET_MODE (dest); + machine_mode himode; + rtx (*gen_il) (rtx, rtx, rtx); + rtx (*gen_ih) (rtx, rtx, rtx); + rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h; + struct expand_vec_perm_d d; + bool ok, full_interleave; + bool uns_p = false; + int i; + + switch (qimode) + { + case E_V16QImode: + himode = V8HImode; + gen_il = gen_vec_interleave_lowv16qi; + gen_ih = gen_vec_interleave_highv16qi; + break; + case E_V32QImode: + himode = V16HImode; + gen_il = gen_avx2_interleave_lowv32qi; + gen_ih = gen_avx2_interleave_highv32qi; + break; + case E_V64QImode: + himode = V32HImode; + gen_il = gen_avx512bw_interleave_lowv64qi; + gen_ih = gen_avx512bw_interleave_highv64qi; + break; + default: + gcc_unreachable (); + } + + op2_l = op2_h = op2; + switch (code) + { + case MULT: + /* Unpack data such that we've got a source byte in each low byte of + each word. We don't care what goes into the high byte of each word. + Rather than trying to get zero in there, most convenient is to let + it be a copy of the low byte. */ + op2_l = gen_reg_rtx (qimode); + op2_h = gen_reg_rtx (qimode); + emit_insn (gen_il (op2_l, op2, op2)); + emit_insn (gen_ih (op2_h, op2, op2)); + + op1_l = gen_reg_rtx (qimode); + op1_h = gen_reg_rtx (qimode); + emit_insn (gen_il (op1_l, op1, op1)); + emit_insn (gen_ih (op1_h, op1, op1)); + full_interleave = qimode == V16QImode; + break; + + case ASHIFT: + case LSHIFTRT: + uns_p = true; + /* FALLTHRU */ + case ASHIFTRT: + op1_l = gen_reg_rtx (himode); + op1_h = gen_reg_rtx (himode); + ix86_expand_sse_unpack (op1_l, op1, uns_p, false); + ix86_expand_sse_unpack (op1_h, op1, uns_p, true); + full_interleave = true; + break; + default: + gcc_unreachable (); + } + + /* Perform the operation. */ + res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX, + 1, OPTAB_DIRECT); + res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX, + 1, OPTAB_DIRECT); + gcc_assert (res_l && res_h); + + /* Merge the data back into the right place. */ + d.target = dest; + d.op0 = gen_lowpart (qimode, res_l); + d.op1 = gen_lowpart (qimode, res_h); + d.vmode = qimode; + d.nelt = GET_MODE_NUNITS (qimode); + d.one_operand_p = false; + d.testing_p = false; + + if (full_interleave) + { + /* For SSE2, we used an full interleave, so the desired + results are in the even elements. */ + for (i = 0; i < d.nelt; ++i) + d.perm[i] = i * 2; + } + else + { + /* For AVX, the interleave used above was not cross-lane. So the + extraction is evens but with the second and third quarter swapped. + Happily, that is even one insn shorter than even extraction. + For AVX512BW we have 4 lanes. We extract evens from within a lane, + always first from the first and then from the second source operand, + the index bits above the low 4 bits remains the same. + Thus, for d.nelt == 32 we want permutation + 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62 + and for d.nelt == 64 we want permutation + 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94, + 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */ + for (i = 0; i < d.nelt; ++i) + d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15); + } + + ok = ix86_expand_vec_perm_const_1 (&d); + gcc_assert (ok); + + set_unique_reg_note (get_last_insn (), REG_EQUAL, + gen_rtx_fmt_ee (code, qimode, op1, op2)); +} + +/* Helper function of ix86_expand_mul_widen_evenodd. 
Return true + if op is CONST_VECTOR with all odd elements equal to their + preceding element. */ + +static bool +const_vector_equal_evenodd_p (rtx op) +{ + machine_mode mode = GET_MODE (op); + int i, nunits = GET_MODE_NUNITS (mode); + if (GET_CODE (op) != CONST_VECTOR + || nunits != CONST_VECTOR_NUNITS (op)) + return false; + for (i = 0; i < nunits; i += 2) + if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1)) + return false; + return true; +} + +void +ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2, + bool uns_p, bool odd_p) +{ + machine_mode mode = GET_MODE (op1); + machine_mode wmode = GET_MODE (dest); + rtx x; + rtx orig_op1 = op1, orig_op2 = op2; + + if (!nonimmediate_operand (op1, mode)) + op1 = force_reg (mode, op1); + if (!nonimmediate_operand (op2, mode)) + op2 = force_reg (mode, op2); + + /* We only play even/odd games with vectors of SImode. */ + gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode); + + /* If we're looking for the odd results, shift those members down to + the even slots. For some cpus this is faster than a PSHUFD. */ + if (odd_p) + { + /* For XOP use vpmacsdqh, but only for smult, as it is only + signed. */ + if (TARGET_XOP && mode == V4SImode && !uns_p) + { + x = force_reg (wmode, CONST0_RTX (wmode)); + emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x)); + return; + } + + x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode)); + if (!const_vector_equal_evenodd_p (orig_op1)) + op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1), + x, NULL, 1, OPTAB_DIRECT); + if (!const_vector_equal_evenodd_p (orig_op2)) + op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2), + x, NULL, 1, OPTAB_DIRECT); + op1 = gen_lowpart (mode, op1); + op2 = gen_lowpart (mode, op2); + } + + if (mode == V16SImode) + { + if (uns_p) + x = gen_vec_widen_umult_even_v16si (dest, op1, op2); + else + x = gen_vec_widen_smult_even_v16si (dest, op1, op2); + } + else if (mode == V8SImode) + { + if (uns_p) + x = gen_vec_widen_umult_even_v8si (dest, op1, op2); + else + x = gen_vec_widen_smult_even_v8si (dest, op1, op2); + } + else if (uns_p) + x = gen_vec_widen_umult_even_v4si (dest, op1, op2); + else if (TARGET_SSE4_1) + x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2); + else + { + rtx s1, s2, t0, t1, t2; + + /* The easiest way to implement this without PMULDQ is to go through + the motions as if we are performing a full 64-bit multiply. With + the exception that we need to do less shuffling of the elements. */ + + /* Compute the sign-extension, aka highparts, of the two operands. */ + s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode), + op1, pc_rtx, pc_rtx); + s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode), + op2, pc_rtx, pc_rtx); + + /* Multiply LO(A) * HI(B), and vice-versa. */ + t1 = gen_reg_rtx (wmode); + t2 = gen_reg_rtx (wmode); + emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2)); + emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1)); + + /* Multiply LO(A) * LO(B). */ + t0 = gen_reg_rtx (wmode); + emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2)); + + /* Combine and shift the highparts into place. */ + t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT); + t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1, + 1, OPTAB_DIRECT); + + /* Combine high and low parts. 
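+	 Since s1/s2 are all-ones masks for negative elements and
+	 ((2^32 - 1) * y) << 32 == -(y << 32) (mod 2^64), adding
+	 (t1 + t2) << 32 to the unsigned low product t0 yields exactly
+	 the signed 64-bit products.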
*/ + force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT); + return; + } + emit_insn (x); +} + +void +ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2, + bool uns_p, bool high_p) +{ + machine_mode wmode = GET_MODE (dest); + machine_mode mode = GET_MODE (op1); + rtx t1, t2, t3, t4, mask; + + switch (mode) + { + case E_V4SImode: + t1 = gen_reg_rtx (mode); + t2 = gen_reg_rtx (mode); + if (TARGET_XOP && !uns_p) + { + /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case, + shuffle the elements once so that all elements are in the right + place for immediate use: { A C B D }. */ + emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx, + const1_rtx, GEN_INT (3))); + emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx, + const1_rtx, GEN_INT (3))); + } + else + { + /* Put the elements into place for the multiply. */ + ix86_expand_vec_interleave (t1, op1, op1, high_p); + ix86_expand_vec_interleave (t2, op2, op2, high_p); + high_p = false; + } + ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p); + break; + + case E_V8SImode: + /* Shuffle the elements between the lanes. After this we + have { A B E F | C D G H } for each operand. */ + t1 = gen_reg_rtx (V4DImode); + t2 = gen_reg_rtx (V4DImode); + emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1), + const0_rtx, const2_rtx, + const1_rtx, GEN_INT (3))); + emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2), + const0_rtx, const2_rtx, + const1_rtx, GEN_INT (3))); + + /* Shuffle the elements within the lanes. After this we + have { A A B B | C C D D } or { E E F F | G G H H }. */ + t3 = gen_reg_rtx (V8SImode); + t4 = gen_reg_rtx (V8SImode); + mask = GEN_INT (high_p + ? 2 + (2 << 2) + (3 << 4) + (3 << 6) + : 0 + (0 << 2) + (1 << 4) + (1 << 6)); + emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask)); + emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask)); + + ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false); + break; + + case E_V8HImode: + case E_V16HImode: + t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX, + uns_p, OPTAB_DIRECT); + t2 = expand_binop (mode, + uns_p ? umul_highpart_optab : smul_highpart_optab, + op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT); + gcc_assert (t1 && t2); + + t3 = gen_reg_rtx (mode); + ix86_expand_vec_interleave (t3, t1, t2, high_p); + emit_move_insn (dest, gen_lowpart (wmode, t3)); + break; + + case E_V16QImode: + case E_V32QImode: + case E_V32HImode: + case E_V16SImode: + case E_V64QImode: + t1 = gen_reg_rtx (wmode); + t2 = gen_reg_rtx (wmode); + ix86_expand_sse_unpack (t1, op1, uns_p, high_p); + ix86_expand_sse_unpack (t2, op2, uns_p, high_p); + + emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2))); + break; + + default: + gcc_unreachable (); + } +} + +void +ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2) +{ + rtx res_1, res_2, res_3, res_4; + + res_1 = gen_reg_rtx (V4SImode); + res_2 = gen_reg_rtx (V4SImode); + res_3 = gen_reg_rtx (V2DImode); + res_4 = gen_reg_rtx (V2DImode); + ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false); + ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true); + + /* Move the results in element 2 down to element 1; we don't care + what goes in elements 2 and 3. Then we can merge the parts + back together with an interleave. + + Note that two other sequences were tried: + (1) Use interleaves at the start instead of psrldq, which allows + us to use a single shufps to merge things back at the end. 
+ (2) Use shufps here to combine the two vectors, then pshufd to + put the elements in the correct order. + In both cases the cost of the reformatting stall was too high + and the overall sequence slower. */ + + emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3), + const0_rtx, const2_rtx, + const0_rtx, const0_rtx)); + emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4), + const0_rtx, const2_rtx, + const0_rtx, const0_rtx)); + res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2)); + + set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2)); +} + +void +ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2) +{ + machine_mode mode = GET_MODE (op0); + rtx t1, t2, t3, t4, t5, t6; + + if (TARGET_AVX512DQ && mode == V8DImode) + emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2)); + else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode) + emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2)); + else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode) + emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2)); + else if (TARGET_XOP && mode == V2DImode) + { + /* op1: A,B,C,D, op2: E,F,G,H */ + op1 = gen_lowpart (V4SImode, op1); + op2 = gen_lowpart (V4SImode, op2); + + t1 = gen_reg_rtx (V4SImode); + t2 = gen_reg_rtx (V4SImode); + t3 = gen_reg_rtx (V2DImode); + t4 = gen_reg_rtx (V2DImode); + + /* t1: B,A,D,C */ + emit_insn (gen_sse2_pshufd_1 (t1, op1, + GEN_INT (1), + GEN_INT (0), + GEN_INT (3), + GEN_INT (2))); + + /* t2: (B*E),(A*F),(D*G),(C*H) */ + emit_insn (gen_mulv4si3 (t2, t1, op2)); + + /* t3: (B*E)+(A*F), (D*G)+(C*H) */ + emit_insn (gen_xop_phadddq (t3, t2)); + + /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */ + emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32))); + + /* Multiply lower parts and add all */ + t5 = gen_reg_rtx (V2DImode); + emit_insn (gen_vec_widen_umult_even_v4si (t5, + gen_lowpart (V4SImode, op1), + gen_lowpart (V4SImode, op2))); + op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT); + + } + else + { + machine_mode nmode; + rtx (*umul) (rtx, rtx, rtx); + + if (mode == V2DImode) + { + umul = gen_vec_widen_umult_even_v4si; + nmode = V4SImode; + } + else if (mode == V4DImode) + { + umul = gen_vec_widen_umult_even_v8si; + nmode = V8SImode; + } + else if (mode == V8DImode) + { + umul = gen_vec_widen_umult_even_v16si; + nmode = V16SImode; + } + else + gcc_unreachable (); + + + /* Multiply low parts. */ + t1 = gen_reg_rtx (mode); + emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2))); + + /* Shift input vectors right 32 bits so we can multiply high parts. */ + t6 = GEN_INT (32); + t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT); + t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT); + + /* Multiply high parts by low parts. */ + t4 = gen_reg_rtx (mode); + t5 = gen_reg_rtx (mode); + emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2))); + emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1))); + + /* Combine and shift the highparts back. */ + t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT); + t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT); + + /* Combine high and low parts. */ + force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT); + } + + set_unique_reg_note (get_last_insn (), REG_EQUAL, + gen_rtx_MULT (mode, op1, op2)); +} + +/* Return 1 if control tansfer instruction INSN + should be encoded with notrack prefix. 
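+   (Under -fcf-protection=branch an indirect call or jump is expected
+   to land on an endbr instruction; the notrack prefix tells the
+   processor not to enforce that check for this particular transfer.)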
*/ + +bool +ix86_notrack_prefixed_insn_p (rtx insn) +{ + if (!insn || !((flag_cf_protection & CF_BRANCH))) + return false; + + if (CALL_P (insn)) + { + rtx call = get_call_rtx_from (insn); + gcc_assert (call != NULL_RTX); + rtx addr = XEXP (call, 0); + + /* Do not emit 'notrack' if it's not an indirect call. */ + if (MEM_P (addr) + && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF) + return false; + else + return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0); + } + + if (JUMP_P (insn) && !flag_cet_switch) + { + rtx target = JUMP_LABEL (insn); + if (target == NULL_RTX || ANY_RETURN_P (target)) + return false; + + /* Check the jump is a switch table. */ + rtx_insn *label = as_a (target); + rtx_insn *table = next_insn (label); + if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table)) + return false; + else + return true; + } + return false; +} + +/* Calculate integer abs() using only SSE2 instructions. */ + +void +ix86_expand_sse2_abs (rtx target, rtx input) +{ + machine_mode mode = GET_MODE (target); + rtx tmp0, tmp1, x; + + switch (mode) + { + case E_V2DImode: + case E_V4DImode: + /* For 64-bit signed integer X, with SSE4.2 use + pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X. + Otherwise handle it similarly to V4SImode, except use 64 as W instead of + 32 and use logical instead of arithmetic right shift (which is + unimplemented) and subtract. */ + if (TARGET_SSE4_2) + { + tmp0 = gen_reg_rtx (mode); + tmp1 = gen_reg_rtx (mode); + emit_move_insn (tmp1, CONST0_RTX (mode)); + if (mode == E_V2DImode) + emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input)); + else + emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input)); + } + else + { + tmp0 = expand_simple_binop (mode, LSHIFTRT, input, + GEN_INT (GET_MODE_UNIT_BITSIZE (mode) + - 1), NULL, 0, OPTAB_DIRECT); + tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false); + } + + tmp1 = expand_simple_binop (mode, XOR, tmp0, input, + NULL, 0, OPTAB_DIRECT); + x = expand_simple_binop (mode, MINUS, tmp1, tmp0, + target, 0, OPTAB_DIRECT); + break; + + case E_V4SImode: + /* For 32-bit signed integer X, the best way to calculate the absolute + value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */ + tmp0 = expand_simple_binop (mode, ASHIFTRT, input, + GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1), + NULL, 0, OPTAB_DIRECT); + tmp1 = expand_simple_binop (mode, XOR, tmp0, input, + NULL, 0, OPTAB_DIRECT); + x = expand_simple_binop (mode, MINUS, tmp1, tmp0, + target, 0, OPTAB_DIRECT); + break; + + case E_V8HImode: + /* For 16-bit signed integer X, the best way to calculate the absolute + value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */ + tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0); + + x = expand_simple_binop (mode, SMAX, tmp0, input, + target, 0, OPTAB_DIRECT); + break; + + case E_V16QImode: + /* For 8-bit signed integer X, the best way to calculate the absolute + value of X is min ((unsigned char) X, (unsigned char) (-X)), + as SSE2 provides the PMINUB insn. */ + tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0); + + x = expand_simple_binop (V16QImode, UMIN, tmp0, input, + target, 0, OPTAB_DIRECT); + break; + + default: + gcc_unreachable (); + } + + if (x != target) + emit_move_insn (target, x); +} + +/* Expand an extract from a vector register through pextr insn. + Return true if successful. 
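+   operands[0] is the destination, operands[1] the source vector,
+   operands[2] the bit size of the extracted field and operands[3] its
+   bit position.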
*/ + +bool +ix86_expand_pextr (rtx *operands) +{ + rtx dst = operands[0]; + rtx src = operands[1]; + + unsigned int size = INTVAL (operands[2]); + unsigned int pos = INTVAL (operands[3]); + + if (SUBREG_P (dst)) + { + /* Reject non-lowpart subregs. */ + if (SUBREG_BYTE (dst) > 0) + return false; + dst = SUBREG_REG (dst); + } + + if (SUBREG_P (src)) + { + pos += SUBREG_BYTE (src) * BITS_PER_UNIT; + src = SUBREG_REG (src); + } + + switch (GET_MODE (src)) + { + case E_V16QImode: + case E_V8HImode: + case E_V4SImode: + case E_V2DImode: + case E_V1TImode: + case E_TImode: + { + machine_mode srcmode, dstmode; + rtx d, pat; + + if (!int_mode_for_size (size, 0).exists (&dstmode)) + return false; + + switch (dstmode) + { + case E_QImode: + if (!TARGET_SSE4_1) + return false; + srcmode = V16QImode; + break; + + case E_HImode: + if (!TARGET_SSE2) + return false; + srcmode = V8HImode; + break; + + case E_SImode: + if (!TARGET_SSE4_1) + return false; + srcmode = V4SImode; + break; + + case E_DImode: + gcc_assert (TARGET_64BIT); + if (!TARGET_SSE4_1) + return false; + srcmode = V2DImode; + break; + + default: + return false; + } + + /* Reject extractions from misaligned positions. */ + if (pos & (size-1)) + return false; + + if (GET_MODE (dst) == dstmode) + d = dst; + else + d = gen_reg_rtx (dstmode); + + /* Construct insn pattern. */ + pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size))); + pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat); + + /* Let the rtl optimizers know about the zero extension performed. */ + if (dstmode == QImode || dstmode == HImode) + { + pat = gen_rtx_ZERO_EXTEND (SImode, pat); + d = gen_lowpart (SImode, d); + } + + emit_insn (gen_rtx_SET (d, pat)); + + if (d != dst) + emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d)); + return true; + } + + default: + return false; + } +} + +/* Expand an insert into a vector register through pinsr insn. + Return true if successful. */ + +bool +ix86_expand_pinsr (rtx *operands) +{ + rtx dst = operands[0]; + rtx src = operands[3]; + + unsigned int size = INTVAL (operands[1]); + unsigned int pos = INTVAL (operands[2]); + + if (SUBREG_P (dst)) + { + pos += SUBREG_BYTE (dst) * BITS_PER_UNIT; + dst = SUBREG_REG (dst); + } + + switch (GET_MODE (dst)) + { + case E_V16QImode: + case E_V8HImode: + case E_V4SImode: + case E_V2DImode: + case E_V1TImode: + case E_TImode: + { + machine_mode srcmode, dstmode; + rtx (*pinsr)(rtx, rtx, rtx, rtx); + rtx d; + + if (!int_mode_for_size (size, 0).exists (&srcmode)) + return false; + + switch (srcmode) + { + case E_QImode: + if (!TARGET_SSE4_1) + return false; + dstmode = V16QImode; + pinsr = gen_sse4_1_pinsrb; + break; + + case E_HImode: + if (!TARGET_SSE2) + return false; + dstmode = V8HImode; + pinsr = gen_sse2_pinsrw; + break; + + case E_SImode: + if (!TARGET_SSE4_1) + return false; + dstmode = V4SImode; + pinsr = gen_sse4_1_pinsrd; + break; + + case E_DImode: + gcc_assert (TARGET_64BIT); + if (!TARGET_SSE4_1) + return false; + dstmode = V2DImode; + pinsr = gen_sse4_1_pinsrq; + break; + + default: + return false; + } + + /* Reject insertions to misaligned positions. 
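+	 (pinsr can only replace a whole, naturally aligned element, so
+	 POS must be a multiple of SIZE.)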
*/ + if (pos & (size-1)) + return false; + + if (SUBREG_P (src)) + { + unsigned int srcpos = SUBREG_BYTE (src); + + if (srcpos > 0) + { + rtx extr_ops[4]; + + extr_ops[0] = gen_reg_rtx (srcmode); + extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src)); + extr_ops[2] = GEN_INT (size); + extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT); + + if (!ix86_expand_pextr (extr_ops)) + return false; + + src = extr_ops[0]; + } + else + src = gen_lowpart (srcmode, SUBREG_REG (src)); + } + + if (GET_MODE (dst) == dstmode) + d = dst; + else + d = gen_reg_rtx (dstmode); + + emit_insn (pinsr (d, gen_lowpart (dstmode, dst), + gen_lowpart (srcmode, src), + GEN_INT (1 << (pos / size)))); + if (d != dst) + emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d)); + return true; + } + + default: + return false; + } +} + +/* All CPUs prefer to avoid cross-lane operations so perform reductions + upper against lower halves up to SSE reg size. */ + +machine_mode +ix86_split_reduction (machine_mode mode) +{ + /* Reduce lowpart against highpart until we reach SSE reg width to + avoid cross-lane operations. */ + switch (mode) + { + case E_V8DImode: + case E_V4DImode: + return V2DImode; + case E_V16SImode: + case E_V8SImode: + return V4SImode; + case E_V32HImode: + case E_V16HImode: + return V8HImode; + case E_V64QImode: + case E_V32QImode: + return V16QImode; + case E_V16SFmode: + case E_V8SFmode: + return V4SFmode; + case E_V8DFmode: + case E_V4DFmode: + return V2DFmode; + default: + return mode; + } +} + +/* Generate call to __divmoddi4. */ + +void +ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode, + rtx op0, rtx op1, + rtx *quot_p, rtx *rem_p) +{ + rtx rem = assign_386_stack_local (mode, SLOT_TEMP); + + rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL, + mode, op0, mode, op1, mode, + XEXP (rem, 0), Pmode); + *quot_p = quot; + *rem_p = rem; +} + +#include "gt-i386-expand.h" diff --git a/gcc/config/i386/i386-expand.h b/gcc/config/i386/i386-expand.h new file mode 100644 index 00000000000..9271bb85ac5 --- /dev/null +++ b/gcc/config/i386/i386-expand.h @@ -0,0 +1,58 @@ +/* Copyright (C) 1988-2019 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#ifndef GCC_I386_EXPAND_H +#define GCC_I386_EXPAND_H + +/* AVX512F does support 64-byte integer vector operations, + thus the longest vector we are faced with is V64QImode. 
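+   MAX_VECT_LEN is therefore the number of bytes in a ZMM register;
+   perm[] below indexes the virtual concatenation of op0 and op1, so
+   its entries stay below 2 * MAX_VECT_LEN and fit in an unsigned char.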
*/ +#define MAX_VECT_LEN 64 + +struct expand_vec_perm_d +{ + rtx target, op0, op1; + unsigned char perm[MAX_VECT_LEN]; + machine_mode vmode; + unsigned char nelt; + bool one_operand_p; + bool testing_p; +}; + +rtx legitimize_tls_address (rtx x, enum tls_model model, bool for_mov); +alias_set_type ix86_GOT_alias_set (void); +rtx legitimize_pic_address (rtx orig, rtx reg); +rtx legitimize_pe_coff_symbol (rtx addr, bool inreg); + +bool insn_defines_reg (unsigned int regno1, unsigned int regno2, + rtx_insn *insn); +void ix86_emit_binop (enum rtx_code code, machine_mode mode, rtx dst, rtx src); +enum calling_abi ix86_function_abi (const_tree fndecl); +bool ix86_function_ms_hook_prologue (const_tree fn); +void warn_once_call_ms2sysv_xlogues (const char *feature); +rtx gen_push (rtx arg); +rtx gen_pop (rtx arg); +rtx ix86_expand_builtin (tree exp, rtx target, rtx subtarget, + machine_mode mode, int ignore); +bool ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, + rtx op1, const vec_perm_indices &sel); +bool ix86_notrack_prefixed_insn_p (rtx insn); +machine_mode ix86_split_reduction (machine_mode mode); +void ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode, rtx op0, + rtx op1, rtx *quot_p, rtx *rem_p); + +#endif /* GCC_I386_EXPAND_H */ diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c new file mode 100644 index 00000000000..67f45d66c48 --- /dev/null +++ b/gcc/config/i386/i386-features.c @@ -0,0 +1,2742 @@ +/* Copyright (C) 1988-2019 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. 
*/ + +#define IN_TARGET_CODE 1 + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "rtl.h" +#include "tree.h" +#include "memmodel.h" +#include "gimple.h" +#include "cfghooks.h" +#include "cfgloop.h" +#include "df.h" +#include "tm_p.h" +#include "stringpool.h" +#include "expmed.h" +#include "optabs.h" +#include "regs.h" +#include "emit-rtl.h" +#include "recog.h" +#include "cgraph.h" +#include "diagnostic.h" +#include "cfgbuild.h" +#include "alias.h" +#include "fold-const.h" +#include "attribs.h" +#include "calls.h" +#include "stor-layout.h" +#include "varasm.h" +#include "output.h" +#include "insn-attr.h" +#include "flags.h" +#include "except.h" +#include "explow.h" +#include "expr.h" +#include "cfgrtl.h" +#include "common/common-target.h" +#include "langhooks.h" +#include "reload.h" +#include "gimplify.h" +#include "dwarf2.h" +#include "tm-constrs.h" +#include "params.h" +#include "cselib.h" +#include "sched-int.h" +#include "opts.h" +#include "tree-pass.h" +#include "context.h" +#include "pass_manager.h" +#include "target-globals.h" +#include "gimple-iterator.h" +#include "tree-vectorizer.h" +#include "shrink-wrap.h" +#include "builtins.h" +#include "rtl-iter.h" +#include "tree-iterator.h" +#include "dbgcnt.h" +#include "case-cfn-macros.h" +#include "dojump.h" +#include "fold-const-call.h" +#include "tree-vrp.h" +#include "tree-ssanames.h" +#include "selftest.h" +#include "selftest-rtl.h" +#include "print-rtl.h" +#include "intl.h" +#include "ifcvt.h" +#include "symbol-summary.h" +#include "ipa-prop.h" +#include "ipa-fnsummary.h" +#include "wide-int-bitmask.h" +#include "tree-vector-builder.h" +#include "debug.h" +#include "dwarf2out.h" +#include "i386-builtins.h" +#include "i386-features.h" + +const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = { + "savms64", + "resms64", + "resms64x", + "savms64f", + "resms64f", + "resms64fx" +}; + +const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = { +/* The below offset values are where each register is stored for the layout + relative to incoming stack pointer. The value of each m_regs[].offset will + be relative to the incoming base pointer (rax or rsi) used by the stub. + + s_instances: 0 1 2 3 + Offset: realigned or aligned + 8 + Register aligned aligned + 8 aligned w/HFP w/HFP */ + XMM15_REG, /* 0x10 0x18 0x10 0x18 */ + XMM14_REG, /* 0x20 0x28 0x20 0x28 */ + XMM13_REG, /* 0x30 0x38 0x30 0x38 */ + XMM12_REG, /* 0x40 0x48 0x40 0x48 */ + XMM11_REG, /* 0x50 0x58 0x50 0x58 */ + XMM10_REG, /* 0x60 0x68 0x60 0x68 */ + XMM9_REG, /* 0x70 0x78 0x70 0x78 */ + XMM8_REG, /* 0x80 0x88 0x80 0x88 */ + XMM7_REG, /* 0x90 0x98 0x90 0x98 */ + XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */ + SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */ + DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */ + BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */ + BP_REG, /* 0xc0 0xc8 N/A N/A */ + R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */ + R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */ + R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */ + R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */ +}; + +/* Instantiate static const values. */ +const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET; +const unsigned xlogue_layout::MIN_REGS; +const unsigned xlogue_layout::MAX_REGS; +const unsigned xlogue_layout::MAX_EXTRA_REGS; +const unsigned xlogue_layout::VARIANT_COUNT; +const unsigned xlogue_layout::STUB_NAME_MAX_LEN; + +/* Initialize xlogue_layout::s_stub_names to zero. */ +char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT] + [STUB_NAME_MAX_LEN]; + +/* Instantiates all xlogue_layout instances. 
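+   There is one instance for each combination of incoming stack offset
+   (aligned or aligned + 8) and hard-frame-pointer use.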
*/ +const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = { + xlogue_layout (0, false), + xlogue_layout (8, false), + xlogue_layout (0, true), + xlogue_layout (8, true) +}; + +/* Return an appropriate const instance of xlogue_layout based upon values + in cfun->machine and crtl. */ +const struct xlogue_layout & +xlogue_layout::get_instance () +{ + enum xlogue_stub_sets stub_set; + bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in; + + if (stack_realign_fp) + stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN; + else if (frame_pointer_needed) + stub_set = aligned_plus_8 + ? XLOGUE_SET_HFP_ALIGNED_PLUS_8 + : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN; + else + stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED; + + return s_instances[stub_set]; +} + +/* Determine how many clobbered registers can be saved by the stub. + Returns the count of registers the stub will save and restore. */ +unsigned +xlogue_layout::count_stub_managed_regs () +{ + bool hfp = frame_pointer_needed || stack_realign_fp; + unsigned i, count; + unsigned regno; + + for (count = i = MIN_REGS; i < MAX_REGS; ++i) + { + regno = REG_ORDER[i]; + if (regno == BP_REG && hfp) + continue; + if (!ix86_save_reg (regno, false, false)) + break; + ++count; + } + return count; +} + +/* Determine if register REGNO is a stub managed register given the + total COUNT of stub managed registers. */ +bool +xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count) +{ + bool hfp = frame_pointer_needed || stack_realign_fp; + unsigned i; + + for (i = 0; i < count; ++i) + { + gcc_assert (i < MAX_REGS); + if (REG_ORDER[i] == BP_REG && hfp) + ++count; + else if (REG_ORDER[i] == regno) + return true; + } + return false; +} + +/* Constructor for xlogue_layout. */ +xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp) + : m_hfp (hfp) , m_nregs (hfp ? 17 : 18), + m_stack_align_off_in (stack_align_off_in) +{ + HOST_WIDE_INT offset = stack_align_off_in; + unsigned i, j; + + for (i = j = 0; i < MAX_REGS; ++i) + { + unsigned regno = REG_ORDER[i]; + + if (regno == BP_REG && hfp) + continue; + if (SSE_REGNO_P (regno)) + { + offset += 16; + /* Verify that SSE regs are always aligned. */ + gcc_assert (!((stack_align_off_in + offset) & 15)); + } + else + offset += 8; + + m_regs[j].regno = regno; + m_regs[j++].offset = offset - STUB_INDEX_OFFSET; + } + gcc_assert (j == m_nregs); +} + +const char * +xlogue_layout::get_stub_name (enum xlogue_stub stub, + unsigned n_extra_regs) +{ + const int have_avx = TARGET_AVX; + char *name = s_stub_names[!!have_avx][stub][n_extra_regs]; + + /* Lazy init */ + if (!*name) + { + int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u", + (have_avx ? "avx" : "sse"), + STUB_BASE_NAMES[stub], + MIN_REGS + n_extra_regs); + gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN); + } + + return name; +} + +/* Return rtx of a symbol ref for the entry point (based upon + cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */ +rtx +xlogue_layout::get_stub_rtx (enum xlogue_stub stub) +{ + const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs; + gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS); + gcc_assert (stub < XLOGUE_STUB_COUNT); + gcc_assert (crtl->stack_realign_finalized); + + return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs)); +} + +unsigned scalar_chain::max_id = 0; + +/* Initialize new chain. 
*/ + +scalar_chain::scalar_chain () +{ + chain_id = ++max_id; + + if (dump_file) + fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id); + + bitmap_obstack_initialize (NULL); + insns = BITMAP_ALLOC (NULL); + defs = BITMAP_ALLOC (NULL); + defs_conv = BITMAP_ALLOC (NULL); + queue = NULL; +} + +/* Free chain's data. */ + +scalar_chain::~scalar_chain () +{ + BITMAP_FREE (insns); + BITMAP_FREE (defs); + BITMAP_FREE (defs_conv); + bitmap_obstack_release (NULL); +} + +/* Add instruction into chains' queue. */ + +void +scalar_chain::add_to_queue (unsigned insn_uid) +{ + if (bitmap_bit_p (insns, insn_uid) + || bitmap_bit_p (queue, insn_uid)) + return; + + if (dump_file) + fprintf (dump_file, " Adding insn %d into chain's #%d queue\n", + insn_uid, chain_id); + bitmap_set_bit (queue, insn_uid); +} + +/* For DImode conversion, mark register defined by DEF as requiring + conversion. */ + +void +dimode_scalar_chain::mark_dual_mode_def (df_ref def) +{ + gcc_assert (DF_REF_REG_DEF_P (def)); + + if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def))) + return; + + if (dump_file) + fprintf (dump_file, + " Mark r%d def in insn %d as requiring both modes in chain #%d\n", + DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id); + + bitmap_set_bit (defs_conv, DF_REF_REGNO (def)); +} + +/* For TImode conversion, it is unused. */ + +void +timode_scalar_chain::mark_dual_mode_def (df_ref) +{ + gcc_unreachable (); +} + +/* Check REF's chain to add new insns into a queue + and find registers requiring conversion. */ + +void +scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref) +{ + df_link *chain; + + gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)) + || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref))); + add_to_queue (DF_REF_INSN_UID (ref)); + + for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next) + { + unsigned uid = DF_REF_INSN_UID (chain->ref); + + if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref))) + continue; + + if (!DF_REF_REG_MEM_P (chain->ref)) + { + if (bitmap_bit_p (insns, uid)) + continue; + + if (bitmap_bit_p (candidates, uid)) + { + add_to_queue (uid); + continue; + } + } + + if (DF_REF_REG_DEF_P (chain->ref)) + { + if (dump_file) + fprintf (dump_file, " r%d def in insn %d isn't convertible\n", + DF_REF_REGNO (chain->ref), uid); + mark_dual_mode_def (chain->ref); + } + else + { + if (dump_file) + fprintf (dump_file, " r%d use in insn %d isn't convertible\n", + DF_REF_REGNO (chain->ref), uid); + mark_dual_mode_def (ref); + } + } +} + +/* Add instruction into a chain. 
*/ + +void +scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid) +{ + if (bitmap_bit_p (insns, insn_uid)) + return; + + if (dump_file) + fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id); + + bitmap_set_bit (insns, insn_uid); + + rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn; + rtx def_set = single_set (insn); + if (def_set && REG_P (SET_DEST (def_set)) + && !HARD_REGISTER_P (SET_DEST (def_set))) + bitmap_set_bit (defs, REGNO (SET_DEST (def_set))); + + df_ref ref; + df_ref def; + for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref)) + if (!HARD_REGISTER_P (DF_REF_REG (ref))) + for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref)); + def; + def = DF_REF_NEXT_REG (def)) + analyze_register_chain (candidates, def); + for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref)) + if (!DF_REF_REG_MEM_P (ref)) + analyze_register_chain (candidates, ref); +} + +/* Build new chain starting from insn INSN_UID recursively + adding all dependent uses and definitions. */ + +void +scalar_chain::build (bitmap candidates, unsigned insn_uid) +{ + queue = BITMAP_ALLOC (NULL); + bitmap_set_bit (queue, insn_uid); + + if (dump_file) + fprintf (dump_file, "Building chain #%d...\n", chain_id); + + while (!bitmap_empty_p (queue)) + { + insn_uid = bitmap_first_set_bit (queue); + bitmap_clear_bit (queue, insn_uid); + bitmap_clear_bit (candidates, insn_uid); + add_insn (candidates, insn_uid); + } + + if (dump_file) + { + fprintf (dump_file, "Collected chain #%d...\n", chain_id); + fprintf (dump_file, " insns: "); + dump_bitmap (dump_file, insns); + if (!bitmap_empty_p (defs_conv)) + { + bitmap_iterator bi; + unsigned id; + const char *comma = ""; + fprintf (dump_file, " defs to convert: "); + EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi) + { + fprintf (dump_file, "%sr%d", comma, id); + comma = ", "; + } + fprintf (dump_file, "\n"); + } + } + + BITMAP_FREE (queue); +} + +/* Return a cost of building a vector costant + instead of using a scalar one. */ + +int +dimode_scalar_chain::vector_const_cost (rtx exp) +{ + gcc_assert (CONST_INT_P (exp)); + + if (standard_sse_constant_p (exp, V2DImode)) + return COSTS_N_INSNS (1); + return ix86_cost->sse_load[1]; +} + +/* Compute a gain for chain conversion. */ + +int +dimode_scalar_chain::compute_convert_gain () +{ + bitmap_iterator bi; + unsigned insn_uid; + int gain = 0; + int cost = 0; + + if (dump_file) + fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id); + + EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi) + { + rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn; + rtx def_set = single_set (insn); + rtx src = SET_SRC (def_set); + rtx dst = SET_DEST (def_set); + + if (REG_P (src) && REG_P (dst)) + gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move; + else if (REG_P (src) && MEM_P (dst)) + gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1]; + else if (MEM_P (src) && REG_P (dst)) + gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1]; + else if (GET_CODE (src) == ASHIFT + || GET_CODE (src) == ASHIFTRT + || GET_CODE (src) == LSHIFTRT) + { + if (CONST_INT_P (XEXP (src, 0))) + gain -= vector_const_cost (XEXP (src, 0)); + gain += ix86_cost->shift_const; + if (INTVAL (XEXP (src, 1)) >= 32) + gain -= COSTS_N_INSNS (1); + } + else if (GET_CODE (src) == PLUS + || GET_CODE (src) == MINUS + || GET_CODE (src) == IOR + || GET_CODE (src) == XOR + || GET_CODE (src) == AND) + { + gain += ix86_cost->add; + /* Additional gain for andnot for targets without BMI. 
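+	     Without BMI the scalar DImode and-not needs two extra NOT
+	     instructions (one per 32-bit half), while the vector form is
+	     a single pandn, so credit two more scalar adds.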
*/ + if (GET_CODE (XEXP (src, 0)) == NOT + && !TARGET_BMI) + gain += 2 * ix86_cost->add; + + if (CONST_INT_P (XEXP (src, 0))) + gain -= vector_const_cost (XEXP (src, 0)); + if (CONST_INT_P (XEXP (src, 1))) + gain -= vector_const_cost (XEXP (src, 1)); + } + else if (GET_CODE (src) == NEG + || GET_CODE (src) == NOT) + gain += ix86_cost->add - COSTS_N_INSNS (1); + else if (GET_CODE (src) == COMPARE) + { + /* Assume comparison cost is the same. */ + } + else if (CONST_INT_P (src)) + { + if (REG_P (dst)) + gain += COSTS_N_INSNS (2); + else if (MEM_P (dst)) + gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1]; + gain -= vector_const_cost (src); + } + else + gcc_unreachable (); + } + + if (dump_file) + fprintf (dump_file, " Instruction conversion gain: %d\n", gain); + + EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi) + cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer; + + if (dump_file) + fprintf (dump_file, " Registers conversion cost: %d\n", cost); + + gain -= cost; + + if (dump_file) + fprintf (dump_file, " Total gain: %d\n", gain); + + return gain; +} + +/* Replace REG in X with a V2DI subreg of NEW_REG. */ + +rtx +dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg) +{ + if (x == reg) + return gen_rtx_SUBREG (V2DImode, new_reg, 0); + + const char *fmt = GET_RTX_FORMAT (GET_CODE (x)); + int i, j; + for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--) + { + if (fmt[i] == 'e') + XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg); + else if (fmt[i] == 'E') + for (j = XVECLEN (x, i) - 1; j >= 0; j--) + XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j), + reg, new_reg); + } + + return x; +} + +/* Replace REG in INSN with a V2DI subreg of NEW_REG. */ + +void +dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn, + rtx reg, rtx new_reg) +{ + replace_with_subreg (single_set (insn), reg, new_reg); +} + +/* Insert generated conversion instruction sequence INSNS + after instruction AFTER. New BB may be required in case + instruction has EH region attached. */ + +void +scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after) +{ + if (!control_flow_insn_p (after)) + { + emit_insn_after (insns, after); + return; + } + + basic_block bb = BLOCK_FOR_INSN (after); + edge e = find_fallthru_edge (bb->succs); + gcc_assert (e); + + basic_block new_bb = split_edge (e); + emit_insn_after (insns, BB_HEAD (new_bb)); +} + +/* Make vector copies for all register REGNO definitions + and replace its uses in a chain. 
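+   Depending on the target, the DImode value is moved into the vector
+   register either through a stack slot (when direct inter-unit moves
+   are not preferred), with movd + pinsrd on SSE4.1, or with two movd
+   loads combined by punpckldq.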
*/ + +void +dimode_scalar_chain::make_vector_copies (unsigned regno) +{ + rtx reg = regno_reg_rtx[regno]; + rtx vreg = gen_reg_rtx (DImode); + df_ref ref; + + for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) + if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) + { + start_sequence (); + if (!TARGET_INTER_UNIT_MOVES_TO_VEC) + { + rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP); + emit_move_insn (adjust_address (tmp, SImode, 0), + gen_rtx_SUBREG (SImode, reg, 0)); + emit_move_insn (adjust_address (tmp, SImode, 4), + gen_rtx_SUBREG (SImode, reg, 4)); + emit_move_insn (vreg, tmp); + } + else if (TARGET_SSE4_1) + { + emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0), + CONST0_RTX (V4SImode), + gen_rtx_SUBREG (SImode, reg, 0))); + emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0), + gen_rtx_SUBREG (V4SImode, vreg, 0), + gen_rtx_SUBREG (SImode, reg, 4), + GEN_INT (2))); + } + else + { + rtx tmp = gen_reg_rtx (DImode); + emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0), + CONST0_RTX (V4SImode), + gen_rtx_SUBREG (SImode, reg, 0))); + emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0), + CONST0_RTX (V4SImode), + gen_rtx_SUBREG (SImode, reg, 4))); + emit_insn (gen_vec_interleave_lowv4si + (gen_rtx_SUBREG (V4SImode, vreg, 0), + gen_rtx_SUBREG (V4SImode, vreg, 0), + gen_rtx_SUBREG (V4SImode, tmp, 0))); + } + rtx_insn *seq = get_insns (); + end_sequence (); + rtx_insn *insn = DF_REF_INSN (ref); + emit_conversion_insns (seq, insn); + + if (dump_file) + fprintf (dump_file, + " Copied r%d to a vector register r%d for insn %d\n", + regno, REGNO (vreg), INSN_UID (insn)); + } + + for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) + if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) + { + rtx_insn *insn = DF_REF_INSN (ref); + replace_with_subreg_in_insn (insn, reg, vreg); + + if (dump_file) + fprintf (dump_file, " Replaced r%d with r%d in insn %d\n", + regno, REGNO (vreg), INSN_UID (insn)); + } +} + +/* Convert all definitions of register REGNO + and fix its uses. Scalar copies may be created + in case register is used in not convertible insn. 
*/ + +void +dimode_scalar_chain::convert_reg (unsigned regno) +{ + bool scalar_copy = bitmap_bit_p (defs_conv, regno); + rtx reg = regno_reg_rtx[regno]; + rtx scopy = NULL_RTX; + df_ref ref; + bitmap conv; + + conv = BITMAP_ALLOC (NULL); + bitmap_copy (conv, insns); + + if (scalar_copy) + scopy = gen_reg_rtx (DImode); + + for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) + { + rtx_insn *insn = DF_REF_INSN (ref); + rtx def_set = single_set (insn); + rtx src = SET_SRC (def_set); + rtx reg = DF_REF_REG (ref); + + if (!MEM_P (src)) + { + replace_with_subreg_in_insn (insn, reg, reg); + bitmap_clear_bit (conv, INSN_UID (insn)); + } + + if (scalar_copy) + { + start_sequence (); + if (!TARGET_INTER_UNIT_MOVES_FROM_VEC) + { + rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP); + emit_move_insn (tmp, reg); + emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0), + adjust_address (tmp, SImode, 0)); + emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4), + adjust_address (tmp, SImode, 4)); + } + else if (TARGET_SSE4_1) + { + rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); + emit_insn + (gen_rtx_SET + (gen_rtx_SUBREG (SImode, scopy, 0), + gen_rtx_VEC_SELECT (SImode, + gen_rtx_SUBREG (V4SImode, reg, 0), tmp))); + + tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx)); + emit_insn + (gen_rtx_SET + (gen_rtx_SUBREG (SImode, scopy, 4), + gen_rtx_VEC_SELECT (SImode, + gen_rtx_SUBREG (V4SImode, reg, 0), tmp))); + } + else + { + rtx vcopy = gen_reg_rtx (V2DImode); + emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0)); + emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0), + gen_rtx_SUBREG (SImode, vcopy, 0)); + emit_move_insn (vcopy, + gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32))); + emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4), + gen_rtx_SUBREG (SImode, vcopy, 0)); + } + rtx_insn *seq = get_insns (); + end_sequence (); + emit_conversion_insns (seq, insn); + + if (dump_file) + fprintf (dump_file, + " Copied r%d to a scalar register r%d for insn %d\n", + regno, REGNO (scopy), INSN_UID (insn)); + } + } + + for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) + if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) + { + if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref))) + { + rtx_insn *insn = DF_REF_INSN (ref); + + rtx def_set = single_set (insn); + gcc_assert (def_set); + + rtx src = SET_SRC (def_set); + rtx dst = SET_DEST (def_set); + + if (!MEM_P (dst) || !REG_P (src)) + replace_with_subreg_in_insn (insn, reg, reg); + + bitmap_clear_bit (conv, INSN_UID (insn)); + } + } + /* Skip debug insns and uninitialized uses. */ + else if (DF_REF_CHAIN (ref) + && NONDEBUG_INSN_P (DF_REF_INSN (ref))) + { + gcc_assert (scopy); + replace_rtx (DF_REF_INSN (ref), reg, scopy); + df_insn_rescan (DF_REF_INSN (ref)); + } + + BITMAP_FREE (conv); +} + +/* Convert operand OP in INSN. We should handle + memory operands and uninitialized registers. + All other register uses are converted during + registers conversion. 
*/ + +void +dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn) +{ + *op = copy_rtx_if_shared (*op); + + if (GET_CODE (*op) == NOT) + { + convert_op (&XEXP (*op, 0), insn); + PUT_MODE (*op, V2DImode); + } + else if (MEM_P (*op)) + { + rtx tmp = gen_reg_rtx (DImode); + + emit_insn_before (gen_move_insn (tmp, *op), insn); + *op = gen_rtx_SUBREG (V2DImode, tmp, 0); + + if (dump_file) + fprintf (dump_file, " Preloading operand for insn %d into r%d\n", + INSN_UID (insn), REGNO (tmp)); + } + else if (REG_P (*op)) + { + /* We may have not converted register usage in case + this register has no definition. Otherwise it + should be converted in convert_reg. */ + df_ref ref; + FOR_EACH_INSN_USE (ref, insn) + if (DF_REF_REGNO (ref) == REGNO (*op)) + { + gcc_assert (!DF_REF_CHAIN (ref)); + break; + } + *op = gen_rtx_SUBREG (V2DImode, *op, 0); + } + else if (CONST_INT_P (*op)) + { + rtx vec_cst; + rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0); + + /* Prefer all ones vector in case of -1. */ + if (constm1_operand (*op, GET_MODE (*op))) + vec_cst = CONSTM1_RTX (V2DImode); + else + vec_cst = gen_rtx_CONST_VECTOR (V2DImode, + gen_rtvec (2, *op, const0_rtx)); + + if (!standard_sse_constant_p (vec_cst, V2DImode)) + { + start_sequence (); + vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst)); + rtx_insn *seq = get_insns (); + end_sequence (); + emit_insn_before (seq, insn); + } + + emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn); + *op = tmp; + } + else + { + gcc_assert (SUBREG_P (*op)); + gcc_assert (GET_MODE (*op) == V2DImode); + } +} + +/* Convert INSN to vector mode. */ + +void +dimode_scalar_chain::convert_insn (rtx_insn *insn) +{ + rtx def_set = single_set (insn); + rtx src = SET_SRC (def_set); + rtx dst = SET_DEST (def_set); + rtx subreg; + + if (MEM_P (dst) && !REG_P (src)) + { + /* There are no scalar integer instructions and therefore + temporary register usage is required. 
*/ + rtx tmp = gen_reg_rtx (DImode); + emit_conversion_insns (gen_move_insn (dst, tmp), insn); + dst = gen_rtx_SUBREG (V2DImode, tmp, 0); + } + + switch (GET_CODE (src)) + { + case ASHIFT: + case ASHIFTRT: + case LSHIFTRT: + convert_op (&XEXP (src, 0), insn); + PUT_MODE (src, V2DImode); + break; + + case PLUS: + case MINUS: + case IOR: + case XOR: + case AND: + convert_op (&XEXP (src, 0), insn); + convert_op (&XEXP (src, 1), insn); + PUT_MODE (src, V2DImode); + break; + + case NEG: + src = XEXP (src, 0); + convert_op (&src, insn); + subreg = gen_reg_rtx (V2DImode); + emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn); + src = gen_rtx_MINUS (V2DImode, subreg, src); + break; + + case NOT: + src = XEXP (src, 0); + convert_op (&src, insn); + subreg = gen_reg_rtx (V2DImode); + emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn); + src = gen_rtx_XOR (V2DImode, src, subreg); + break; + + case MEM: + if (!REG_P (dst)) + convert_op (&src, insn); + break; + + case REG: + if (!MEM_P (dst)) + convert_op (&src, insn); + break; + + case SUBREG: + gcc_assert (GET_MODE (src) == V2DImode); + break; + + case COMPARE: + src = SUBREG_REG (XEXP (XEXP (src, 0), 0)); + + gcc_assert ((REG_P (src) && GET_MODE (src) == DImode) + || (SUBREG_P (src) && GET_MODE (src) == V2DImode)); + + if (REG_P (src)) + subreg = gen_rtx_SUBREG (V2DImode, src, 0); + else + subreg = copy_rtx_if_shared (src); + emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg), + copy_rtx_if_shared (subreg), + copy_rtx_if_shared (subreg)), + insn); + dst = gen_rtx_REG (CCmode, FLAGS_REG); + src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src), + copy_rtx_if_shared (src)), + UNSPEC_PTEST); + break; + + case CONST_INT: + convert_op (&src, insn); + break; + + default: + gcc_unreachable (); + } + + SET_SRC (def_set) = src; + SET_DEST (def_set) = dst; + + /* Drop possible dead definitions. */ + PATTERN (insn) = def_set; + + INSN_CODE (insn) = -1; + recog_memoized (insn); + df_insn_rescan (insn); +} + +/* Fix uses of converted REG in debug insns. */ + +void +timode_scalar_chain::fix_debug_reg_uses (rtx reg) +{ + if (!flag_var_tracking) + return; + + df_ref ref, next; + for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next) + { + rtx_insn *insn = DF_REF_INSN (ref); + /* Make sure the next ref is for a different instruction, + so that we're not affected by the rescan. */ + next = DF_REF_NEXT_REG (ref); + while (next && DF_REF_INSN (next) == insn) + next = DF_REF_NEXT_REG (next); + + if (DEBUG_INSN_P (insn)) + { + /* It may be a debug insn with a TImode variable in + register. */ + bool changed = false; + for (; ref != next; ref = DF_REF_NEXT_REG (ref)) + { + rtx *loc = DF_REF_LOC (ref); + if (REG_P (*loc) && GET_MODE (*loc) == V1TImode) + { + *loc = gen_rtx_SUBREG (TImode, *loc, 0); + changed = true; + } + } + if (changed) + df_insn_rescan (insn); + } + } +} + +/* Convert INSN from TImode to V1T1mode. 
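+   A TImode load or store such as (set (reg:TI x) (mem:TI addr)) simply
+   has both operands re-set to V1TImode so that it matches the SSE
+   full-vector move patterns; 128-bit constants other than the standard
+   all-zeros and all-ones patterns are loaded from the constant pool
+   through a temporary vector register.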
*/ + +void +timode_scalar_chain::convert_insn (rtx_insn *insn) +{ + rtx def_set = single_set (insn); + rtx src = SET_SRC (def_set); + rtx dst = SET_DEST (def_set); + + switch (GET_CODE (dst)) + { + case REG: + { + rtx tmp = find_reg_equal_equiv_note (insn); + if (tmp) + PUT_MODE (XEXP (tmp, 0), V1TImode); + PUT_MODE (dst, V1TImode); + fix_debug_reg_uses (dst); + } + break; + case MEM: + PUT_MODE (dst, V1TImode); + break; + + default: + gcc_unreachable (); + } + + switch (GET_CODE (src)) + { + case REG: + PUT_MODE (src, V1TImode); + /* Call fix_debug_reg_uses only if SRC is never defined. */ + if (!DF_REG_DEF_CHAIN (REGNO (src))) + fix_debug_reg_uses (src); + break; + + case MEM: + PUT_MODE (src, V1TImode); + break; + + case CONST_WIDE_INT: + if (NONDEBUG_INSN_P (insn)) + { + /* Since there are no instructions to store 128-bit constant, + temporary register usage is required. */ + rtx tmp = gen_reg_rtx (V1TImode); + start_sequence (); + src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src)); + src = validize_mem (force_const_mem (V1TImode, src)); + rtx_insn *seq = get_insns (); + end_sequence (); + if (seq) + emit_insn_before (seq, insn); + emit_conversion_insns (gen_rtx_SET (dst, tmp), insn); + dst = tmp; + } + break; + + case CONST_INT: + switch (standard_sse_constant_p (src, TImode)) + { + case 1: + src = CONST0_RTX (GET_MODE (dst)); + break; + case 2: + src = CONSTM1_RTX (GET_MODE (dst)); + break; + default: + gcc_unreachable (); + } + if (NONDEBUG_INSN_P (insn)) + { + rtx tmp = gen_reg_rtx (V1TImode); + /* Since there are no instructions to store standard SSE + constant, temporary register usage is required. */ + emit_conversion_insns (gen_rtx_SET (dst, tmp), insn); + dst = tmp; + } + break; + + default: + gcc_unreachable (); + } + + SET_SRC (def_set) = src; + SET_DEST (def_set) = dst; + + /* Drop possible dead definitions. */ + PATTERN (insn) = def_set; + + INSN_CODE (insn) = -1; + recog_memoized (insn); + df_insn_rescan (insn); +} + +void +dimode_scalar_chain::convert_registers () +{ + bitmap_iterator bi; + unsigned id; + + EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi) + convert_reg (id); + + EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi) + make_vector_copies (id); +} + +/* Convert whole chain creating required register + conversions and copies. */ + +int +scalar_chain::convert () +{ + bitmap_iterator bi; + unsigned id; + int converted_insns = 0; + + if (!dbg_cnt (stv_conversion)) + return 0; + + if (dump_file) + fprintf (dump_file, "Converting chain #%d...\n", chain_id); + + convert_registers (); + + EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi) + { + convert_insn (DF_INSN_UID_GET (id)->insn); + converted_insns++; + } + + return converted_insns; +} + +/* Return 1 if INSN uses or defines a hard register. + Hard register uses in a memory address are ignored. + Clobbers and flags definitions are ignored. */ + +static bool +has_non_address_hard_reg (rtx_insn *insn) +{ + df_ref ref; + FOR_EACH_INSN_DEF (ref, insn) + if (HARD_REGISTER_P (DF_REF_REAL_REG (ref)) + && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER) + && DF_REF_REGNO (ref) != FLAGS_REG) + return true; + + FOR_EACH_INSN_USE (ref, insn) + if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref))) + return true; + + return false; +} + +/* Check if comparison INSN may be transformed + into vector comparison. 
Currently we transform + zero checks only which look like: + + (set (reg:CCZ 17 flags) + (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4) + (subreg:SI (reg:DI x) 0)) + (const_int 0 [0]))) */ + +static bool +convertible_comparison_p (rtx_insn *insn) +{ + if (!TARGET_SSE4_1) + return false; + + rtx def_set = single_set (insn); + + gcc_assert (def_set); + + rtx src = SET_SRC (def_set); + rtx dst = SET_DEST (def_set); + + gcc_assert (GET_CODE (src) == COMPARE); + + if (GET_CODE (dst) != REG + || REGNO (dst) != FLAGS_REG + || GET_MODE (dst) != CCZmode) + return false; + + rtx op1 = XEXP (src, 0); + rtx op2 = XEXP (src, 1); + + if (op2 != CONST0_RTX (GET_MODE (op2))) + return false; + + if (GET_CODE (op1) != IOR) + return false; + + op2 = XEXP (op1, 1); + op1 = XEXP (op1, 0); + + if (!SUBREG_P (op1) + || !SUBREG_P (op2) + || GET_MODE (op1) != SImode + || GET_MODE (op2) != SImode + || ((SUBREG_BYTE (op1) != 0 + || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode)) + && (SUBREG_BYTE (op2) != 0 + || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode)))) + return false; + + op1 = SUBREG_REG (op1); + op2 = SUBREG_REG (op2); + + if (op1 != op2 + || !REG_P (op1) + || GET_MODE (op1) != DImode) + return false; + + return true; +} + +/* The DImode version of scalar_to_vector_candidate_p. */ + +static bool +dimode_scalar_to_vector_candidate_p (rtx_insn *insn) +{ + rtx def_set = single_set (insn); + + if (!def_set) + return false; + + if (has_non_address_hard_reg (insn)) + return false; + + rtx src = SET_SRC (def_set); + rtx dst = SET_DEST (def_set); + + if (GET_CODE (src) == COMPARE) + return convertible_comparison_p (insn); + + /* We are interested in DImode promotion only. */ + if ((GET_MODE (src) != DImode + && !CONST_INT_P (src)) + || GET_MODE (dst) != DImode) + return false; + + if (!REG_P (dst) && !MEM_P (dst)) + return false; + + switch (GET_CODE (src)) + { + case ASHIFTRT: + if (!TARGET_AVX512VL) + return false; + /* FALLTHRU */ + + case ASHIFT: + case LSHIFTRT: + if (!CONST_INT_P (XEXP (src, 1)) + || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)) + return false; + break; + + case PLUS: + case MINUS: + case IOR: + case XOR: + case AND: + if (!REG_P (XEXP (src, 1)) + && !MEM_P (XEXP (src, 1)) + && !CONST_INT_P (XEXP (src, 1))) + return false; + + if (GET_MODE (XEXP (src, 1)) != DImode + && !CONST_INT_P (XEXP (src, 1))) + return false; + break; + + case NEG: + case NOT: + break; + + case REG: + return true; + + case MEM: + case CONST_INT: + return REG_P (dst); + + default: + return false; + } + + if (!REG_P (XEXP (src, 0)) + && !MEM_P (XEXP (src, 0)) + && !CONST_INT_P (XEXP (src, 0)) + /* Check for andnot case. */ + && (GET_CODE (src) != AND + || GET_CODE (XEXP (src, 0)) != NOT + || !REG_P (XEXP (XEXP (src, 0), 0)))) + return false; + + if (GET_MODE (XEXP (src, 0)) != DImode + && !CONST_INT_P (XEXP (src, 0))) + return false; + + return true; +} + +/* The TImode version of scalar_to_vector_candidate_p. */ + +static bool +timode_scalar_to_vector_candidate_p (rtx_insn *insn) +{ + rtx def_set = single_set (insn); + + if (!def_set) + return false; + + if (has_non_address_hard_reg (insn)) + return false; + + rtx src = SET_SRC (def_set); + rtx dst = SET_DEST (def_set); + + /* Only TImode load and store are allowed. */ + if (GET_MODE (dst) != TImode) + return false; + + if (MEM_P (dst)) + { + /* Check for store. Memory must be aligned or unaligned store + is optimal. Only support store from register, standard SSE + constant or CONST_WIDE_INT generated from piecewise store. + + ??? 
Verify performance impact before enabling CONST_INT for + __int128 store. */ + if (misaligned_operand (dst, TImode) + && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL) + return false; + + switch (GET_CODE (src)) + { + default: + return false; + + case REG: + case CONST_WIDE_INT: + return true; + + case CONST_INT: + return standard_sse_constant_p (src, TImode); + } + } + else if (MEM_P (src)) + { + /* Check for load. Memory must be aligned or unaligned load is + optimal. */ + return (REG_P (dst) + && (!misaligned_operand (src, TImode) + || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)); + } + + return false; +} + +/* Return 1 if INSN may be converted into vector + instruction. */ + +static bool +scalar_to_vector_candidate_p (rtx_insn *insn) +{ + if (TARGET_64BIT) + return timode_scalar_to_vector_candidate_p (insn); + else + return dimode_scalar_to_vector_candidate_p (insn); +} + +/* The DImode version of remove_non_convertible_regs. */ + +static void +dimode_remove_non_convertible_regs (bitmap candidates) +{ + bitmap_iterator bi; + unsigned id; + bitmap regs = BITMAP_ALLOC (NULL); + + EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi) + { + rtx def_set = single_set (DF_INSN_UID_GET (id)->insn); + rtx reg = SET_DEST (def_set); + + if (!REG_P (reg) + || bitmap_bit_p (regs, REGNO (reg)) + || HARD_REGISTER_P (reg)) + continue; + + for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg)); + def; + def = DF_REF_NEXT_REG (def)) + { + if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) + { + if (dump_file) + fprintf (dump_file, + "r%d has non convertible definition in insn %d\n", + REGNO (reg), DF_REF_INSN_UID (def)); + + bitmap_set_bit (regs, REGNO (reg)); + break; + } + } + } + + EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi) + { + for (df_ref def = DF_REG_DEF_CHAIN (id); + def; + def = DF_REF_NEXT_REG (def)) + if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) + { + if (dump_file) + fprintf (dump_file, "Removing insn %d from candidates list\n", + DF_REF_INSN_UID (def)); + + bitmap_clear_bit (candidates, DF_REF_INSN_UID (def)); + } + } + + BITMAP_FREE (regs); +} + +/* For a register REGNO, scan instructions for its defs and uses. + Put REGNO in REGS if a def or use isn't in CANDIDATES. */ + +static void +timode_check_non_convertible_regs (bitmap candidates, bitmap regs, + unsigned int regno) +{ + for (df_ref def = DF_REG_DEF_CHAIN (regno); + def; + def = DF_REF_NEXT_REG (def)) + { + if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) + { + if (dump_file) + fprintf (dump_file, + "r%d has non convertible def in insn %d\n", + regno, DF_REF_INSN_UID (def)); + + bitmap_set_bit (regs, regno); + break; + } + } + + for (df_ref ref = DF_REG_USE_CHAIN (regno); + ref; + ref = DF_REF_NEXT_REG (ref)) + { + /* Debug instructions are skipped. */ + if (NONDEBUG_INSN_P (DF_REF_INSN (ref)) + && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref))) + { + if (dump_file) + fprintf (dump_file, + "r%d has non convertible use in insn %d\n", + regno, DF_REF_INSN_UID (ref)); + + bitmap_set_bit (regs, regno); + break; + } + } +} + +/* The TImode version of remove_non_convertible_regs. 
*/ + +static void +timode_remove_non_convertible_regs (bitmap candidates) +{ + bitmap_iterator bi; + unsigned id; + bitmap regs = BITMAP_ALLOC (NULL); + + EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi) + { + rtx def_set = single_set (DF_INSN_UID_GET (id)->insn); + rtx dest = SET_DEST (def_set); + rtx src = SET_SRC (def_set); + + if ((!REG_P (dest) + || bitmap_bit_p (regs, REGNO (dest)) + || HARD_REGISTER_P (dest)) + && (!REG_P (src) + || bitmap_bit_p (regs, REGNO (src)) + || HARD_REGISTER_P (src))) + continue; + + if (REG_P (dest)) + timode_check_non_convertible_regs (candidates, regs, + REGNO (dest)); + + if (REG_P (src)) + timode_check_non_convertible_regs (candidates, regs, + REGNO (src)); + } + + EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi) + { + for (df_ref def = DF_REG_DEF_CHAIN (id); + def; + def = DF_REF_NEXT_REG (def)) + if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) + { + if (dump_file) + fprintf (dump_file, "Removing insn %d from candidates list\n", + DF_REF_INSN_UID (def)); + + bitmap_clear_bit (candidates, DF_REF_INSN_UID (def)); + } + + for (df_ref ref = DF_REG_USE_CHAIN (id); + ref; + ref = DF_REF_NEXT_REG (ref)) + if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref))) + { + if (dump_file) + fprintf (dump_file, "Removing insn %d from candidates list\n", + DF_REF_INSN_UID (ref)); + + bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref)); + } + } + + BITMAP_FREE (regs); +} + +/* For a given bitmap of insn UIDs scans all instruction and + remove insn from CANDIDATES in case it has both convertible + and not convertible definitions. + + All insns in a bitmap are conversion candidates according to + scalar_to_vector_candidate_p. Currently it implies all insns + are single_set. */ + +static void +remove_non_convertible_regs (bitmap candidates) +{ + if (TARGET_64BIT) + timode_remove_non_convertible_regs (candidates); + else + dimode_remove_non_convertible_regs (candidates); +} + +/* Main STV pass function. Find and convert scalar + instructions into vector mode when profitable. */ + +static unsigned int +convert_scalars_to_vector () +{ + basic_block bb; + bitmap candidates; + int converted_insns = 0; + + bitmap_obstack_initialize (NULL); + candidates = BITMAP_ALLOC (NULL); + + calculate_dominance_info (CDI_DOMINATORS); + df_set_flags (DF_DEFER_INSN_RESCAN); + df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN); + df_md_add_problem (); + df_analyze (); + + /* Find all instructions we want to convert into vector mode. */ + if (dump_file) + fprintf (dump_file, "Searching for mode conversion candidates...\n"); + + FOR_EACH_BB_FN (bb, cfun) + { + rtx_insn *insn; + FOR_BB_INSNS (bb, insn) + if (scalar_to_vector_candidate_p (insn)) + { + if (dump_file) + fprintf (dump_file, " insn %d is marked as a candidate\n", + INSN_UID (insn)); + + bitmap_set_bit (candidates, INSN_UID (insn)); + } + } + + remove_non_convertible_regs (candidates); + + if (bitmap_empty_p (candidates)) + if (dump_file) + fprintf (dump_file, "There are no candidates for optimization.\n"); + + while (!bitmap_empty_p (candidates)) + { + unsigned uid = bitmap_first_set_bit (candidates); + scalar_chain *chain; + + if (TARGET_64BIT) + chain = new timode_scalar_chain; + else + chain = new dimode_scalar_chain; + + /* Find instructions chain we want to convert to vector mode. + Check all uses and definitions to estimate all required + conversions. 
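+     For example, on a 32-bit target a 64-bit bitwise operation such as
+
+       unsigned long long f (unsigned long long a, unsigned long long b)
+       { return a | b; }
+
+     can form a chain together with the loads and stores of its operands
+     and, when the estimated gain is positive, is rewritten to use a
+     single por on XMM registers instead of a pair of 32-bit or
+     instructions.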
*/ + chain->build (candidates, uid); + + if (chain->compute_convert_gain () > 0) + converted_insns += chain->convert (); + else + if (dump_file) + fprintf (dump_file, "Chain #%d conversion is not profitable\n", + chain->chain_id); + + delete chain; + } + + if (dump_file) + fprintf (dump_file, "Total insns converted: %d\n", converted_insns); + + BITMAP_FREE (candidates); + bitmap_obstack_release (NULL); + df_process_deferred_rescans (); + + /* Conversion means we may have 128bit register spills/fills + which require aligned stack. */ + if (converted_insns) + { + if (crtl->stack_alignment_needed < 128) + crtl->stack_alignment_needed = 128; + if (crtl->stack_alignment_estimated < 128) + crtl->stack_alignment_estimated = 128; + /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */ + if (TARGET_64BIT) + for (tree parm = DECL_ARGUMENTS (current_function_decl); + parm; parm = DECL_CHAIN (parm)) + { + if (TYPE_MODE (TREE_TYPE (parm)) != TImode) + continue; + if (DECL_RTL_SET_P (parm) + && GET_MODE (DECL_RTL (parm)) == V1TImode) + { + rtx r = DECL_RTL (parm); + if (REG_P (r)) + SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0)); + } + if (DECL_INCOMING_RTL (parm) + && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode) + { + rtx r = DECL_INCOMING_RTL (parm); + if (REG_P (r)) + DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0); + } + } + } + + return 0; +} + +static unsigned int +rest_of_handle_insert_vzeroupper (void) +{ + int i; + + /* vzeroupper instructions are inserted immediately after reload to + account for possible spills from 256bit or 512bit registers. The pass + reuses mode switching infrastructure by re-running mode insertion + pass, so disable entities that have already been processed. */ + for (i = 0; i < MAX_386_ENTITIES; i++) + ix86_optimize_mode_switching[i] = 0; + + ix86_optimize_mode_switching[AVX_U128] = 1; + + /* Call optimize_mode_switching. 
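+     For the AVX_U128 entity enabled above, the mode-switching pass
+     determines where the upper halves of the 256-bit (and 512-bit)
+     registers may still be live and emits vzeroupper before calls and
+     returns at those points, avoiding the AVX/SSE transition penalty.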
*/ + g->get_passes ()->execute_pass_mode_switching (); + return 0; +} + +namespace { + +const pass_data pass_data_insert_vzeroupper = +{ + RTL_PASS, /* type */ + "vzeroupper", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_MACH_DEP, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_df_finish, /* todo_flags_finish */ +}; + +class pass_insert_vzeroupper : public rtl_opt_pass +{ +public: + pass_insert_vzeroupper(gcc::context *ctxt) + : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) + { + return TARGET_AVX + && TARGET_VZEROUPPER && flag_expensive_optimizations + && !optimize_size; + } + + virtual unsigned int execute (function *) + { + return rest_of_handle_insert_vzeroupper (); + } + +}; // class pass_insert_vzeroupper + +const pass_data pass_data_stv = +{ + RTL_PASS, /* type */ + "stv", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_MACH_DEP, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_df_finish, /* todo_flags_finish */ +}; + +class pass_stv : public rtl_opt_pass +{ +public: + pass_stv (gcc::context *ctxt) + : rtl_opt_pass (pass_data_stv, ctxt), + timode_p (false) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) + { + return (timode_p == !!TARGET_64BIT + && TARGET_STV && TARGET_SSE2 && optimize > 1); + } + + virtual unsigned int execute (function *) + { + return convert_scalars_to_vector (); + } + + opt_pass *clone () + { + return new pass_stv (m_ctxt); + } + + void set_pass_param (unsigned int n, bool param) + { + gcc_assert (n == 0); + timode_p = param; + } + +private: + bool timode_p; +}; // class pass_stv + +} // anon namespace + +rtl_opt_pass * +make_pass_insert_vzeroupper (gcc::context *ctxt) +{ + return new pass_insert_vzeroupper (ctxt); +} + +rtl_opt_pass * +make_pass_stv (gcc::context *ctxt) +{ + return new pass_stv (ctxt); +} + +/* Inserting ENDBRANCH instructions. */ + +static unsigned int +rest_of_insert_endbranch (void) +{ + timevar_push (TV_MACH_DEP); + + rtx cet_eb; + rtx_insn *insn; + basic_block bb; + + /* Currently emit EB if it's a tracking function, i.e. 'nocf_check' is + absent among function attributes. Later an optimization will be + introduced to make analysis if an address of a static function is + taken. A static function whose address is not taken will get a + nocf_check attribute. This will allow to reduce the number of EB. */ + + if (!lookup_attribute ("nocf_check", + TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))) + && (!flag_manual_endbr + || lookup_attribute ("cf_check", + DECL_ATTRIBUTES (cfun->decl))) + && !cgraph_node::get (cfun->decl)->only_called_directly_p ()) + { + /* Queue ENDBR insertion to x86_function_profiler. 
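+	 With -pg and -mfentry the __fentry__ call is emitted ahead of
+	 the prologue, so x86_function_profiler outputs the ENDBR itself;
+	 otherwise it is inserted at the head of the entry block below.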
*/ + if (crtl->profile && flag_fentry) + cfun->machine->endbr_queued_at_entrance = true; + else + { + cet_eb = gen_nop_endbr (); + + bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; + insn = BB_HEAD (bb); + emit_insn_before (cet_eb, insn); + } + } + + bb = 0; + FOR_EACH_BB_FN (bb, cfun) + { + for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); + insn = NEXT_INSN (insn)) + { + if (CALL_P (insn)) + { + bool need_endbr; + need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL; + if (!need_endbr && !SIBLING_CALL_P (insn)) + { + rtx call = get_call_rtx_from (insn); + rtx fnaddr = XEXP (call, 0); + tree fndecl = NULL_TREE; + + /* Also generate ENDBRANCH for non-tail call which + may return via indirect branch. */ + if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) + fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0)); + if (fndecl == NULL_TREE) + fndecl = MEM_EXPR (fnaddr); + if (fndecl + && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE + && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE) + fndecl = NULL_TREE; + if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl))) + { + tree fntype = TREE_TYPE (fndecl); + if (lookup_attribute ("indirect_return", + TYPE_ATTRIBUTES (fntype))) + need_endbr = true; + } + } + if (!need_endbr) + continue; + /* Generate ENDBRANCH after CALL, which can return more than + twice, setjmp-like functions. */ + + cet_eb = gen_nop_endbr (); + emit_insn_after_setloc (cet_eb, insn, INSN_LOCATION (insn)); + continue; + } + + if (JUMP_P (insn) && flag_cet_switch) + { + rtx target = JUMP_LABEL (insn); + if (target == NULL_RTX || ANY_RETURN_P (target)) + continue; + + /* Check the jump is a switch table. */ + rtx_insn *label = as_a (target); + rtx_insn *table = next_insn (label); + if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table)) + continue; + + /* For the indirect jump find out all places it jumps and insert + ENDBRANCH there. It should be done under a special flag to + control ENDBRANCH generation for switch stmts. */ + edge_iterator ei; + edge e; + basic_block dest_blk; + + FOR_EACH_EDGE (e, ei, bb->succs) + { + rtx_insn *insn; + + dest_blk = e->dest; + insn = BB_HEAD (dest_blk); + gcc_assert (LABEL_P (insn)); + cet_eb = gen_nop_endbr (); + emit_insn_after (cet_eb, insn); + } + continue; + } + + if ((LABEL_P (insn) && LABEL_PRESERVE_P (insn)) + || (NOTE_P (insn) + && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)) + /* TODO. Check /s bit also. */ + { + cet_eb = gen_nop_endbr (); + emit_insn_after (cet_eb, insn); + continue; + } + } + } + + timevar_pop (TV_MACH_DEP); + return 0; +} + +namespace { + +const pass_data pass_data_insert_endbranch = +{ + RTL_PASS, /* type. */ + "cet", /* name. */ + OPTGROUP_NONE, /* optinfo_flags. */ + TV_MACH_DEP, /* tv_id. */ + 0, /* properties_required. */ + 0, /* properties_provided. */ + 0, /* properties_destroyed. */ + 0, /* todo_flags_start. */ + 0, /* todo_flags_finish. 
*/ +}; + +class pass_insert_endbranch : public rtl_opt_pass +{ +public: + pass_insert_endbranch (gcc::context *ctxt) + : rtl_opt_pass (pass_data_insert_endbranch, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) + { + return ((flag_cf_protection & CF_BRANCH)); + } + + virtual unsigned int execute (function *) + { + return rest_of_insert_endbranch (); + } + +}; // class pass_insert_endbranch + +} // anon namespace + +rtl_opt_pass * +make_pass_insert_endbranch (gcc::context *ctxt) +{ + return new pass_insert_endbranch (ctxt); +} + +/* At entry of the nearest common dominator for basic blocks with + conversions, generate a single + vxorps %xmmN, %xmmN, %xmmN + for all + vcvtss2sd op, %xmmN, %xmmX + vcvtsd2ss op, %xmmN, %xmmX + vcvtsi2ss op, %xmmN, %xmmX + vcvtsi2sd op, %xmmN, %xmmX + + NB: We want to generate only a single vxorps to cover the whole + function. The LCM algorithm isn't appropriate here since it may + place a vxorps inside the loop. */ + +static unsigned int +remove_partial_avx_dependency (void) +{ + timevar_push (TV_MACH_DEP); + + bitmap_obstack_initialize (NULL); + bitmap convert_bbs = BITMAP_ALLOC (NULL); + + basic_block bb; + rtx_insn *insn, *set_insn; + rtx set; + rtx v4sf_const0 = NULL_RTX; + + auto_vec control_flow_insns; + + FOR_EACH_BB_FN (bb, cfun) + { + FOR_BB_INSNS (bb, insn) + { + if (!NONDEBUG_INSN_P (insn)) + continue; + + set = single_set (insn); + if (!set) + continue; + + if (get_attr_avx_partial_xmm_update (insn) + != AVX_PARTIAL_XMM_UPDATE_TRUE) + continue; + + if (!v4sf_const0) + { + calculate_dominance_info (CDI_DOMINATORS); + df_set_flags (DF_DEFER_INSN_RESCAN); + df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN); + df_md_add_problem (); + df_analyze (); + v4sf_const0 = gen_reg_rtx (V4SFmode); + } + + /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF, + SI -> SF, SI -> DF, DI -> SF, DI -> DF, to vec_dup and + vec_merge with subreg. */ + rtx src = SET_SRC (set); + rtx dest = SET_DEST (set); + machine_mode dest_mode = GET_MODE (dest); + + rtx zero; + machine_mode dest_vecmode; + if (dest_mode == E_SFmode) + { + dest_vecmode = V4SFmode; + zero = v4sf_const0; + } + else + { + dest_vecmode = V2DFmode; + zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0); + } + + /* Change source to vector mode. */ + src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src); + src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero, + GEN_INT (HOST_WIDE_INT_1U)); + /* Change destination to vector mode. */ + rtx vec = gen_reg_rtx (dest_vecmode); + /* Generate an XMM vector SET. */ + set = gen_rtx_SET (vec, src); + set_insn = emit_insn_before (set, insn); + df_insn_rescan (set_insn); + + if (cfun->can_throw_non_call_exceptions) + { + /* Handle REG_EH_REGION note. */ + rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX); + if (note) + { + control_flow_insns.safe_push (set_insn); + add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0)); + } + } + + src = gen_rtx_SUBREG (dest_mode, vec, 0); + set = gen_rtx_SET (dest, src); + + /* Drop possible dead definitions. */ + PATTERN (insn) = set; + + INSN_CODE (insn) = -1; + recog_memoized (insn); + df_insn_rescan (insn); + bitmap_set_bit (convert_bbs, bb->index); + } + } + + if (v4sf_const0) + { + /* (Re-)discover loops so that bb->loop_father can be used in the + analysis below. 
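+	 The vxorps must dominate every converted insn but should not end
+	 up inside a loop, so the code below starts from the nearest
+	 common dominator of all conversion blocks and then walks out of
+	 any enclosing real loop before emitting it.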
*/ + loop_optimizer_init (AVOID_CFG_MODIFICATIONS); + + /* Generate a vxorps at entry of the nearest dominator for basic + blocks with conversions, which is in the the fake loop that + contains the whole function, so that there is only a single + vxorps in the whole function. */ + bb = nearest_common_dominator_for_set (CDI_DOMINATORS, + convert_bbs); + while (bb->loop_father->latch + != EXIT_BLOCK_PTR_FOR_FN (cfun)) + bb = get_immediate_dominator (CDI_DOMINATORS, + bb->loop_father->header); + + set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode)); + + insn = BB_HEAD (bb); + while (insn && !NONDEBUG_INSN_P (insn)) + { + if (insn == BB_END (bb)) + { + insn = NULL; + break; + } + insn = NEXT_INSN (insn); + } + if (insn == BB_HEAD (bb)) + set_insn = emit_insn_before (set, insn); + else + set_insn = emit_insn_after (set, + insn ? PREV_INSN (insn) : BB_END (bb)); + df_insn_rescan (set_insn); + df_process_deferred_rescans (); + loop_optimizer_finalize (); + + if (!control_flow_insns.is_empty ()) + { + free_dominance_info (CDI_DOMINATORS); + + unsigned int i; + FOR_EACH_VEC_ELT (control_flow_insns, i, insn) + if (control_flow_insn_p (insn)) + { + /* Split the block after insn. There will be a fallthru + edge, which is OK so we keep it. We have to create + the exception edges ourselves. */ + bb = BLOCK_FOR_INSN (insn); + split_block (bb, insn); + rtl_make_eh_edge (NULL, bb, BB_END (bb)); + } + } + } + + bitmap_obstack_release (NULL); + BITMAP_FREE (convert_bbs); + + timevar_pop (TV_MACH_DEP); + return 0; +} + +namespace { + +const pass_data pass_data_remove_partial_avx_dependency = +{ + RTL_PASS, /* type */ + "rpad", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_MACH_DEP, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_df_finish, /* todo_flags_finish */ +}; + +class pass_remove_partial_avx_dependency : public rtl_opt_pass +{ +public: + pass_remove_partial_avx_dependency (gcc::context *ctxt) + : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) + { + return (TARGET_AVX + && TARGET_SSE_PARTIAL_REG_DEPENDENCY + && TARGET_SSE_MATH + && optimize + && optimize_function_for_speed_p (cfun)); + } + + virtual unsigned int execute (function *) + { + return remove_partial_avx_dependency (); + } +}; // class pass_rpad + +} // anon namespace + +rtl_opt_pass * +make_pass_remove_partial_avx_dependency (gcc::context *ctxt) +{ + return new pass_remove_partial_avx_dependency (ctxt); +} + +/* This compares the priority of target features in function DECL1 + and DECL2. It returns positive value if DECL1 is higher priority, + negative value if DECL2 is higher priority and 0 if they are the + same. */ + +int +ix86_compare_version_priority (tree decl1, tree decl2) +{ + unsigned int priority1 = get_builtin_code_for_version (decl1, NULL); + unsigned int priority2 = get_builtin_code_for_version (decl2, NULL); + + return (int)priority1 - (int)priority2; +} + +/* V1 and V2 point to function versions with different priorities + based on the target ISA. This function compares their priorities. 
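+   It is used as a qsort callback and sorts in descending order of
+   dispatch_priority, so the most specific ISA version is tested first
+   by the generated dispatcher.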
*/ + +static int +feature_compare (const void *v1, const void *v2) +{ + typedef struct _function_version_info + { + tree version_decl; + tree predicate_chain; + unsigned int dispatch_priority; + } function_version_info; + + const function_version_info c1 = *(const function_version_info *)v1; + const function_version_info c2 = *(const function_version_info *)v2; + return (c2.dispatch_priority - c1.dispatch_priority); +} + +/* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL + to return a pointer to VERSION_DECL if the outcome of the expression + formed by PREDICATE_CHAIN is true. This function will be called during + version dispatch to decide which function version to execute. It returns + the basic block at the end, to which more conditions can be added. */ + +static basic_block +add_condition_to_bb (tree function_decl, tree version_decl, + tree predicate_chain, basic_block new_bb) +{ + gimple *return_stmt; + tree convert_expr, result_var; + gimple *convert_stmt; + gimple *call_cond_stmt; + gimple *if_else_stmt; + + basic_block bb1, bb2, bb3; + edge e12, e23; + + tree cond_var, and_expr_var = NULL_TREE; + gimple_seq gseq; + + tree predicate_decl, predicate_arg; + + push_cfun (DECL_STRUCT_FUNCTION (function_decl)); + + gcc_assert (new_bb != NULL); + gseq = bb_seq (new_bb); + + + convert_expr = build1 (CONVERT_EXPR, ptr_type_node, + build_fold_addr_expr (version_decl)); + result_var = create_tmp_var (ptr_type_node); + convert_stmt = gimple_build_assign (result_var, convert_expr); + return_stmt = gimple_build_return (result_var); + + if (predicate_chain == NULL_TREE) + { + gimple_seq_add_stmt (&gseq, convert_stmt); + gimple_seq_add_stmt (&gseq, return_stmt); + set_bb_seq (new_bb, gseq); + gimple_set_bb (convert_stmt, new_bb); + gimple_set_bb (return_stmt, new_bb); + pop_cfun (); + return new_bb; + } + + while (predicate_chain != NULL) + { + cond_var = create_tmp_var (integer_type_node); + predicate_decl = TREE_PURPOSE (predicate_chain); + predicate_arg = TREE_VALUE (predicate_chain); + call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg); + gimple_call_set_lhs (call_cond_stmt, cond_var); + + gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl)); + gimple_set_bb (call_cond_stmt, new_bb); + gimple_seq_add_stmt (&gseq, call_cond_stmt); + + predicate_chain = TREE_CHAIN (predicate_chain); + + if (and_expr_var == NULL) + and_expr_var = cond_var; + else + { + gimple *assign_stmt; + /* Use MIN_EXPR to check if any integer is zero?. 
+ and_expr_var = min_expr */ + assign_stmt = gimple_build_assign (and_expr_var, + build2 (MIN_EXPR, integer_type_node, + cond_var, and_expr_var)); + + gimple_set_block (assign_stmt, DECL_INITIAL (function_decl)); + gimple_set_bb (assign_stmt, new_bb); + gimple_seq_add_stmt (&gseq, assign_stmt); + } + } + + if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var, + integer_zero_node, + NULL_TREE, NULL_TREE); + gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl)); + gimple_set_bb (if_else_stmt, new_bb); + gimple_seq_add_stmt (&gseq, if_else_stmt); + + gimple_seq_add_stmt (&gseq, convert_stmt); + gimple_seq_add_stmt (&gseq, return_stmt); + set_bb_seq (new_bb, gseq); + + bb1 = new_bb; + e12 = split_block (bb1, if_else_stmt); + bb2 = e12->dest; + e12->flags &= ~EDGE_FALLTHRU; + e12->flags |= EDGE_TRUE_VALUE; + + e23 = split_block (bb2, return_stmt); + + gimple_set_bb (convert_stmt, bb2); + gimple_set_bb (return_stmt, bb2); + + bb3 = e23->dest; + make_edge (bb1, bb3, EDGE_FALSE_VALUE); + + remove_edge (e23); + make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0); + + pop_cfun (); + + return bb3; +} + +/* This function generates the dispatch function for + multi-versioned functions. DISPATCH_DECL is the function which will + contain the dispatch logic. FNDECLS are the function choices for + dispatch, and is a tree chain. EMPTY_BB is the basic block pointer + in DISPATCH_DECL in which the dispatch code is generated. */ + +static int +dispatch_function_versions (tree dispatch_decl, + void *fndecls_p, + basic_block *empty_bb) +{ + tree default_decl; + gimple *ifunc_cpu_init_stmt; + gimple_seq gseq; + int ix; + tree ele; + vec *fndecls; + unsigned int num_versions = 0; + unsigned int actual_versions = 0; + unsigned int i; + + struct _function_version_info + { + tree version_decl; + tree predicate_chain; + unsigned int dispatch_priority; + }*function_version_info; + + gcc_assert (dispatch_decl != NULL + && fndecls_p != NULL + && empty_bb != NULL); + + /*fndecls_p is actually a vector. */ + fndecls = static_cast *> (fndecls_p); + + /* At least one more version other than the default. */ + num_versions = fndecls->length (); + gcc_assert (num_versions >= 2); + + function_version_info = (struct _function_version_info *) + XNEWVEC (struct _function_version_info, (num_versions - 1)); + + /* The first version in the vector is the default decl. */ + default_decl = (*fndecls)[0]; + + push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl)); + + gseq = bb_seq (*empty_bb); + /* Function version dispatch is via IFUNC. IFUNC resolvers fire before + constructors, so explicity call __builtin_cpu_init here. */ + ifunc_cpu_init_stmt + = gimple_build_call_vec (get_ix86_builtin (IX86_BUILTIN_CPU_INIT), vNULL); + gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt); + gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb); + set_bb_seq (*empty_bb, gseq); + + pop_cfun (); + + + for (ix = 1; fndecls->iterate (ix, &ele); ++ix) + { + tree version_decl = ele; + tree predicate_chain = NULL_TREE; + unsigned int priority; + /* Get attribute string, parse it and find the right predicate decl. + The predicate function could be a lengthy combination of many + features, like arch-type and various isa-variants. 
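+	 Schematically, the resolver assembled from these chains by
+	 add_condition_to_bb behaves like
+
+	   void *resolver (void)
+	   {
+	     __builtin_cpu_init ();
+	     if (<all predicates of the highest-priority version hold>)
+	       return <that version>;
+	     ...
+	     return <the default version>;
+	   }
+
+	 where each predicate is a __builtin_cpu_is or
+	 __builtin_cpu_supports call and multiple predicates are combined
+	 with MIN_EXPR below.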
*/ + priority = get_builtin_code_for_version (version_decl, + &predicate_chain); + + if (predicate_chain == NULL_TREE) + continue; + + function_version_info [actual_versions].version_decl = version_decl; + function_version_info [actual_versions].predicate_chain + = predicate_chain; + function_version_info [actual_versions].dispatch_priority = priority; + actual_versions++; + } + + /* Sort the versions according to descending order of dispatch priority. The + priority is based on the ISA. This is not a perfect solution. There + could still be ambiguity. If more than one function version is suitable + to execute, which one should be dispatched? In future, allow the user + to specify a dispatch priority next to the version. */ + qsort (function_version_info, actual_versions, + sizeof (struct _function_version_info), feature_compare); + + for (i = 0; i < actual_versions; ++i) + *empty_bb = add_condition_to_bb (dispatch_decl, + function_version_info[i].version_decl, + function_version_info[i].predicate_chain, + *empty_bb); + + /* dispatch default version at the end. */ + *empty_bb = add_condition_to_bb (dispatch_decl, default_decl, + NULL, *empty_bb); + + free (function_version_info); + return 0; +} + +/* This function changes the assembler name for functions that are + versions. If DECL is a function version and has a "target" + attribute, it appends the attribute string to its assembler name. */ + +static tree +ix86_mangle_function_version_assembler_name (tree decl, tree id) +{ + tree version_attr; + const char *orig_name, *version_string; + char *attr_str, *assembler_name; + + if (DECL_DECLARED_INLINE_P (decl) + && lookup_attribute ("gnu_inline", + DECL_ATTRIBUTES (decl))) + error_at (DECL_SOURCE_LOCATION (decl), + "function versions cannot be marked as gnu_inline," + " bodies have to be generated"); + + if (DECL_VIRTUAL_P (decl) + || DECL_VINDEX (decl)) + sorry ("virtual function multiversioning not supported"); + + version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl)); + + /* target attribute string cannot be NULL. */ + gcc_assert (version_attr != NULL_TREE); + + orig_name = IDENTIFIER_POINTER (id); + version_string + = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr))); + + if (strcmp (version_string, "default") == 0) + return id; + + attr_str = sorted_attr_string (TREE_VALUE (version_attr)); + assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2); + + sprintf (assembler_name, "%s.%s", orig_name, attr_str); + + /* Allow assembler name to be modified if already set. */ + if (DECL_ASSEMBLER_NAME_SET_P (decl)) + SET_DECL_RTL (decl, NULL); + + tree ret = get_identifier (assembler_name); + XDELETEVEC (attr_str); + XDELETEVEC (assembler_name); + return ret; +} + +tree +ix86_mangle_decl_assembler_name (tree decl, tree id) +{ + /* For function version, add the target suffix to the assembler name. */ + if (TREE_CODE (decl) == FUNCTION_DECL + && DECL_FUNCTION_VERSIONED (decl)) + id = ix86_mangle_function_version_assembler_name (decl, id); +#ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME + id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id); +#endif + + return id; +} + +/* Make a dispatcher declaration for the multi-versioned function DECL. + Calls to DECL function will be replaced with calls to the dispatcher + by the front-end. Returns the decl of the dispatcher function. 
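+   For example, for a function declared as
+
+     __attribute__ ((target ("default"))) int foo (void);
+     __attribute__ ((target ("avx2"))) int foo (void);
+
+   calls to foo are routed through an IFUNC symbol whose resolver
+   (generated by ix86_generate_version_dispatcher_body) picks one of
+   the versions at run time.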
*/ + +tree +ix86_get_function_versions_dispatcher (void *decl) +{ + tree fn = (tree) decl; + struct cgraph_node *node = NULL; + struct cgraph_node *default_node = NULL; + struct cgraph_function_version_info *node_v = NULL; + struct cgraph_function_version_info *first_v = NULL; + + tree dispatch_decl = NULL; + + struct cgraph_function_version_info *default_version_info = NULL; + + gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn)); + + node = cgraph_node::get (fn); + gcc_assert (node != NULL); + + node_v = node->function_version (); + gcc_assert (node_v != NULL); + + if (node_v->dispatcher_resolver != NULL) + return node_v->dispatcher_resolver; + + /* Find the default version and make it the first node. */ + first_v = node_v; + /* Go to the beginning of the chain. */ + while (first_v->prev != NULL) + first_v = first_v->prev; + default_version_info = first_v; + while (default_version_info != NULL) + { + if (is_function_default_version + (default_version_info->this_node->decl)) + break; + default_version_info = default_version_info->next; + } + + /* If there is no default node, just return NULL. */ + if (default_version_info == NULL) + return NULL; + + /* Make default info the first node. */ + if (first_v != default_version_info) + { + default_version_info->prev->next = default_version_info->next; + if (default_version_info->next) + default_version_info->next->prev = default_version_info->prev; + first_v->prev = default_version_info; + default_version_info->next = first_v; + default_version_info->prev = NULL; + } + + default_node = default_version_info->this_node; + +#if defined (ASM_OUTPUT_TYPE_DIRECTIVE) + if (targetm.has_ifunc_p ()) + { + struct cgraph_function_version_info *it_v = NULL; + struct cgraph_node *dispatcher_node = NULL; + struct cgraph_function_version_info *dispatcher_version_info = NULL; + + /* Right now, the dispatching is done via ifunc. */ + dispatch_decl = make_dispatcher_decl (default_node->decl); + + dispatcher_node = cgraph_node::get_create (dispatch_decl); + gcc_assert (dispatcher_node != NULL); + dispatcher_node->dispatcher_function = 1; + dispatcher_version_info + = dispatcher_node->insert_new_function_version (); + dispatcher_version_info->next = default_version_info; + dispatcher_node->definition = 1; + + /* Set the dispatcher for all the versions. */ + it_v = default_version_info; + while (it_v != NULL) + { + it_v->dispatcher_resolver = dispatch_decl; + it_v = it_v->next; + } + } + else +#endif + { + error_at (DECL_SOURCE_LOCATION (default_node->decl), + "multiversioning needs ifunc which is not supported " + "on this target"); + } + + return dispatch_decl; +} + +/* Make the resolver function decl to dispatch the versions of + a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is + ifunc alias that will point to the created resolver. Create an + empty basic block in the resolver and store the pointer in + EMPTY_BB. Return the decl of the resolver function. */ + +static tree +make_resolver_func (const tree default_decl, + const tree ifunc_alias_decl, + basic_block *empty_bb) +{ + char *resolver_name; + tree decl, type, decl_name, t; + + /* IFUNC's have to be globally visible. So, if the default_decl is + not, then the name of the IFUNC should be made unique. 
*/ + if (TREE_PUBLIC (default_decl) == 0) + { + char *ifunc_name = make_unique_name (default_decl, "ifunc", true); + symtab->change_decl_assembler_name (ifunc_alias_decl, + get_identifier (ifunc_name)); + XDELETEVEC (ifunc_name); + } + + resolver_name = make_unique_name (default_decl, "resolver", false); + + /* The resolver function should return a (void *). */ + type = build_function_type_list (ptr_type_node, NULL_TREE); + + decl = build_fn_decl (resolver_name, type); + decl_name = get_identifier (resolver_name); + SET_DECL_ASSEMBLER_NAME (decl, decl_name); + + DECL_NAME (decl) = decl_name; + TREE_USED (decl) = 1; + DECL_ARTIFICIAL (decl) = 1; + DECL_IGNORED_P (decl) = 1; + TREE_PUBLIC (decl) = 0; + DECL_UNINLINABLE (decl) = 1; + + /* Resolver is not external, body is generated. */ + DECL_EXTERNAL (decl) = 0; + DECL_EXTERNAL (ifunc_alias_decl) = 0; + + DECL_CONTEXT (decl) = NULL_TREE; + DECL_INITIAL (decl) = make_node (BLOCK); + DECL_STATIC_CONSTRUCTOR (decl) = 0; + + if (DECL_COMDAT_GROUP (default_decl) + || TREE_PUBLIC (default_decl)) + { + /* In this case, each translation unit with a call to this + versioned function will put out a resolver. Ensure it + is comdat to keep just one copy. */ + DECL_COMDAT (decl) = 1; + make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl)); + } + /* Build result decl and add to function_decl. */ + t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node); + DECL_CONTEXT (t) = decl; + DECL_ARTIFICIAL (t) = 1; + DECL_IGNORED_P (t) = 1; + DECL_RESULT (decl) = t; + + gimplify_function_tree (decl); + push_cfun (DECL_STRUCT_FUNCTION (decl)); + *empty_bb = init_lowered_empty_function (decl, false, + profile_count::uninitialized ()); + + cgraph_node::add_new_function (decl, true); + symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl)); + + pop_cfun (); + + gcc_assert (ifunc_alias_decl != NULL); + /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */ + DECL_ATTRIBUTES (ifunc_alias_decl) + = make_attribute ("ifunc", resolver_name, + DECL_ATTRIBUTES (ifunc_alias_decl)); + + /* Create the alias for dispatch to resolver here. */ + cgraph_node::create_same_body_alias (ifunc_alias_decl, decl); + XDELETEVEC (resolver_name); + return decl; +} + +/* Generate the dispatching code body to dispatch multi-versioned function + DECL. The target hook is called to process the "target" attributes and + provide the code to dispatch the right function at run-time. NODE points + to the dispatcher decl whose body will be created. */ + +tree +ix86_generate_version_dispatcher_body (void *node_p) +{ + tree resolver_decl; + basic_block empty_bb; + tree default_ver_decl; + struct cgraph_node *versn; + struct cgraph_node *node; + + struct cgraph_function_version_info *node_version_info = NULL; + struct cgraph_function_version_info *versn_info = NULL; + + node = (cgraph_node *)node_p; + + node_version_info = node->function_version (); + gcc_assert (node->dispatcher_function + && node_version_info != NULL); + + if (node_version_info->dispatcher_resolver) + return node_version_info->dispatcher_resolver; + + /* The first version in the chain corresponds to the default version. */ + default_ver_decl = node_version_info->next->this_node->decl; + + /* node is going to be an alias, so remove the finalized bit. 
*/ + node->definition = false; + + resolver_decl = make_resolver_func (default_ver_decl, + node->decl, &empty_bb); + + node_version_info->dispatcher_resolver = resolver_decl; + + push_cfun (DECL_STRUCT_FUNCTION (resolver_decl)); + + auto_vec fn_ver_vec; + + for (versn_info = node_version_info->next; versn_info; + versn_info = versn_info->next) + { + versn = versn_info->this_node; + /* Check for virtual functions here again, as by this time it should + have been determined if this function needs a vtable index or + not. This happens for methods in derived classes that override + virtual methods in base classes but are not explicitly marked as + virtual. */ + if (DECL_VINDEX (versn->decl)) + sorry ("virtual function multiversioning not supported"); + + fn_ver_vec.safe_push (versn->decl); + } + + dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb); + cgraph_edge::rebuild_edges (); + pop_cfun (); + return resolver_decl; +} + + diff --git a/gcc/config/i386/i386-features.h b/gcc/config/i386/i386-features.h new file mode 100644 index 00000000000..35812224997 --- /dev/null +++ b/gcc/config/i386/i386-features.h @@ -0,0 +1,201 @@ +/* Copyright (C) 1988-2019 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#ifndef GCC_I386_FEATURES_H +#define GCC_I386_FEATURES_H + +enum xlogue_stub { + XLOGUE_STUB_SAVE, + XLOGUE_STUB_RESTORE, + XLOGUE_STUB_RESTORE_TAIL, + XLOGUE_STUB_SAVE_HFP, + XLOGUE_STUB_RESTORE_HFP, + XLOGUE_STUB_RESTORE_HFP_TAIL, + + XLOGUE_STUB_COUNT +}; + +enum xlogue_stub_sets { + XLOGUE_SET_ALIGNED, + XLOGUE_SET_ALIGNED_PLUS_8, + XLOGUE_SET_HFP_ALIGNED_OR_REALIGN, + XLOGUE_SET_HFP_ALIGNED_PLUS_8, + + XLOGUE_SET_COUNT +}; + +/* Register save/restore layout used by out-of-line stubs. */ +class xlogue_layout { +public: + struct reginfo + { + unsigned regno; + HOST_WIDE_INT offset; /* Offset used by stub base pointer (rax or + rsi) to where each register is stored. */ + }; + + unsigned get_nregs () const {return m_nregs;} + HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;} + + const reginfo &get_reginfo (unsigned reg) const + { + gcc_assert (reg < m_nregs); + return m_regs[reg]; + } + + static const char *get_stub_name (enum xlogue_stub stub, + unsigned n_extra_args); + + /* Returns an rtx for the stub's symbol based upon + 1.) the specified stub (save, restore or restore_ret) and + 2.) the value of cfun->machine->call_ms2sysv_extra_regs and + 3.) rather or not stack alignment is being performed. */ + static rtx get_stub_rtx (enum xlogue_stub stub); + + /* Returns the amount of stack space (including padding) that the stub + needs to store registers based upon data in the machine_function. 
*/ + HOST_WIDE_INT get_stack_space_used () const + { + const struct machine_function *m = cfun->machine; + unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1; + + gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS); + return m_regs[last_reg].offset + STUB_INDEX_OFFSET; + } + + /* Returns the offset for the base pointer used by the stub. */ + HOST_WIDE_INT get_stub_ptr_offset () const + { + return STUB_INDEX_OFFSET + m_stack_align_off_in; + } + + static const struct xlogue_layout &get_instance (); + static unsigned count_stub_managed_regs (); + static bool is_stub_managed_reg (unsigned regno, unsigned count); + + static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70; + static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS; + static const unsigned MAX_REGS = 18; + static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS; + static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1; + static const unsigned STUB_NAME_MAX_LEN = 20; + static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT]; + static const unsigned REG_ORDER[MAX_REGS]; + static const unsigned REG_ORDER_REALIGN[MAX_REGS]; + +private: + xlogue_layout (); + xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp); + xlogue_layout (const xlogue_layout &); + + /* True if hard frame pointer is used. */ + bool m_hfp; + + /* Max number of register this layout manages. */ + unsigned m_nregs; + + /* Incoming offset from 16-byte alignment. */ + HOST_WIDE_INT m_stack_align_off_in; + + /* Register order and offsets. */ + struct reginfo m_regs[MAX_REGS]; + + /* Lazy-inited cache of symbol names for stubs. */ + static char s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT] + [STUB_NAME_MAX_LEN]; + + static const xlogue_layout s_instances[XLOGUE_SET_COUNT]; +}; + +namespace { + +class scalar_chain +{ + public: + scalar_chain (); + virtual ~scalar_chain (); + + static unsigned max_id; + + /* ID of a chain. */ + unsigned int chain_id; + /* A queue of instructions to be included into a chain. */ + bitmap queue; + /* Instructions included into a chain. */ + bitmap insns; + /* All registers defined by a chain. */ + bitmap defs; + /* Registers used in both vector and sclar modes. */ + bitmap defs_conv; + + void build (bitmap candidates, unsigned insn_uid); + virtual int compute_convert_gain () = 0; + int convert (); + + protected: + void add_to_queue (unsigned insn_uid); + void emit_conversion_insns (rtx insns, rtx_insn *pos); + + private: + void add_insn (bitmap candidates, unsigned insn_uid); + void analyze_register_chain (bitmap candidates, df_ref ref); + virtual void mark_dual_mode_def (df_ref def) = 0; + virtual void convert_insn (rtx_insn *insn) = 0; + virtual void convert_registers () = 0; +}; + +class dimode_scalar_chain : public scalar_chain +{ + public: + int compute_convert_gain (); + private: + void mark_dual_mode_def (df_ref def); + rtx replace_with_subreg (rtx x, rtx reg, rtx subreg); + void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg); + void convert_insn (rtx_insn *insn); + void convert_op (rtx *op, rtx_insn *insn); + void convert_reg (unsigned regno); + void make_vector_copies (unsigned regno); + void convert_registers (); + int vector_const_cost (rtx exp); +}; + +class timode_scalar_chain : public scalar_chain +{ + public: + /* Convert from TImode to V1TImode is always faster. 
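+
+     E.g. (illustrative) a 128-bit copy such as
+
+       __int128 tmp = *src;
+       *dst = tmp;
+
+     needs two 64-bit loads and two 64-bit stores in TImode, but only a
+     single SSE load/store pair once the chain is converted to V1TImode,
+     so the gain below is simply taken to be positive.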
*/ + int compute_convert_gain () { return 1; } + + private: + void mark_dual_mode_def (df_ref def); + void fix_debug_reg_uses (rtx reg); + void convert_insn (rtx_insn *insn); + /* We don't convert registers to difference size. */ + void convert_registers () {} +}; + +} // anon namespace + +bool ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined); +int ix86_compare_version_priority (tree decl1, tree decl2); +tree ix86_generate_version_dispatcher_body (void *node_p); +tree ix86_get_function_versions_dispatcher (void *decl); +tree ix86_mangle_decl_assembler_name (tree decl, tree id); + + +#endif /* GCC_I386_FEATURES_H */ diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c new file mode 100644 index 00000000000..1a673d278ee --- /dev/null +++ b/gcc/config/i386/i386-options.c @@ -0,0 +1,3688 @@ +/* Copyright (C) 1988-2019 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#define IN_TARGET_CODE 1 + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "rtl.h" +#include "tree.h" +#include "memmodel.h" +#include "gimple.h" +#include "cfghooks.h" +#include "cfgloop.h" +#include "df.h" +#include "tm_p.h" +#include "stringpool.h" +#include "expmed.h" +#include "optabs.h" +#include "regs.h" +#include "emit-rtl.h" +#include "recog.h" +#include "cgraph.h" +#include "diagnostic.h" +#include "cfgbuild.h" +#include "alias.h" +#include "fold-const.h" +#include "attribs.h" +#include "calls.h" +#include "stor-layout.h" +#include "varasm.h" +#include "output.h" +#include "insn-attr.h" +#include "flags.h" +#include "except.h" +#include "explow.h" +#include "expr.h" +#include "cfgrtl.h" +#include "common/common-target.h" +#include "langhooks.h" +#include "reload.h" +#include "gimplify.h" +#include "dwarf2.h" +#include "tm-constrs.h" +#include "params.h" +#include "cselib.h" +#include "sched-int.h" +#include "opts.h" +#include "tree-pass.h" +#include "context.h" +#include "pass_manager.h" +#include "target-globals.h" +#include "gimple-iterator.h" +#include "tree-vectorizer.h" +#include "shrink-wrap.h" +#include "builtins.h" +#include "rtl-iter.h" +#include "tree-iterator.h" +#include "dbgcnt.h" +#include "case-cfn-macros.h" +#include "dojump.h" +#include "fold-const-call.h" +#include "tree-vrp.h" +#include "tree-ssanames.h" +#include "selftest.h" +#include "selftest-rtl.h" +#include "print-rtl.h" +#include "intl.h" +#include "ifcvt.h" +#include "symbol-summary.h" +#include "ipa-prop.h" +#include "ipa-fnsummary.h" +#include "wide-int-bitmask.h" +#include "tree-vector-builder.h" +#include "debug.h" +#include "dwarf2out.h" +#include "i386-options.h" + +#include "x86-tune-costs.h" + +#ifndef SUBTARGET32_DEFAULT_CPU +#define SUBTARGET32_DEFAULT_CPU "i386" +#endif + +/* Processor feature/optimization bitmasks. 
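+
+   Each m_* mask below is the single bit HOST_WIDE_INT_1U shifted left
+   by the corresponding processor_type value, so membership of the
+   current -march or -mtune processor in a feature table is a plain
+   bitwise test, e.g. (mirroring the code further down in this file):
+
+     ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch;
+     ix86_arch_features[i]
+       = !!(initial_ix86_arch_features[i] & ix86_arch_mask);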
*/ +#define m_386 (HOST_WIDE_INT_1U< 70) + { + *ptr++ = '\\'; + *ptr++ = '\n'; + line_len = 0; + } + } + + for (j = 0; j < 2; j++) + if (opts[i][j]) + { + memcpy (ptr, opts[i][j], len2[j]); + ptr += len2[j]; + line_len += len2[j]; + } + } + + *ptr = '\0'; + gcc_assert (ret + len >= ptr); + + return ret; +} + +/* Function that is callable from the debugger to print the current + options. */ +void ATTRIBUTE_UNUSED +ix86_debug_options (void) +{ + char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2, + target_flags, ix86_target_flags, + ix86_arch_string,ix86_tune_string, + ix86_fpmath, true, true); + + if (opts) + { + fprintf (stderr, "%s\n\n", opts); + free (opts); + } + else + fputs ("\n\n", stderr); + + return; +} + +/* Save the current options */ + +void +ix86_function_specific_save (struct cl_target_option *ptr, + struct gcc_options *opts) +{ + ptr->arch = ix86_arch; + ptr->schedule = ix86_schedule; + ptr->prefetch_sse = x86_prefetch_sse; + ptr->tune = ix86_tune; + ptr->branch_cost = ix86_branch_cost; + ptr->tune_defaulted = ix86_tune_defaulted; + ptr->arch_specified = ix86_arch_specified; + ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit; + ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit; + ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit; + ptr->x_ix86_arch_string = opts->x_ix86_arch_string; + ptr->x_ix86_tune_string = opts->x_ix86_tune_string; + ptr->x_ix86_cmodel = opts->x_ix86_cmodel; + ptr->x_ix86_abi = opts->x_ix86_abi; + ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect; + ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost; + ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes; + ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer; + ptr->x_ix86_force_drap = opts->x_ix86_force_drap; + ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg; + ptr->x_ix86_pmode = opts->x_ix86_pmode; + ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg; + ptr->x_ix86_recip_name = opts->x_ix86_recip_name; + ptr->x_ix86_regparm = opts->x_ix86_regparm; + ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold; + ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx; + ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard; + ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg; + ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect; + ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string; + ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy; + ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy; + ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default; + ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type; + + /* The fields are char but the variables are not; make sure the + values fit in the fields. */ + gcc_assert (ptr->arch == ix86_arch); + gcc_assert (ptr->schedule == ix86_schedule); + gcc_assert (ptr->tune == ix86_tune); + gcc_assert (ptr->branch_cost == ix86_branch_cost); +} + +/* Feature tests against the various architecture variations, used to create + ix86_arch_features based on the processor mask. */ +static unsigned HOST_WIDE_INT initial_ix86_arch_features[X86_ARCH_LAST] = { + /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */ + ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6), + + /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */ + ~m_386, + + /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. 
*/ + ~(m_386 | m_486), + + /* X86_ARCH_XADD: Exchange and add was added for 80486. */ + ~m_386, + + /* X86_ARCH_BSWAP: Byteswap was added for 80486. */ + ~m_386, +}; + +/* This table must be in sync with enum processor_type in i386.h. */ +static const struct processor_costs *processor_cost_table[] = +{ + &generic_cost, + &i386_cost, + &i486_cost, + &pentium_cost, + &lakemont_cost, + &pentiumpro_cost, + &pentium4_cost, + &nocona_cost, + &core_cost, + &core_cost, + &core_cost, + &core_cost, + &atom_cost, + &slm_cost, + &slm_cost, + &slm_cost, + &slm_cost, + &slm_cost, + &slm_cost, + &skylake_cost, + &skylake_cost, + &skylake_cost, + &skylake_cost, + &skylake_cost, + &skylake_cost, + &intel_cost, + &geode_cost, + &k6_cost, + &athlon_cost, + &k8_cost, + &amdfam10_cost, + &bdver_cost, + &bdver_cost, + &bdver_cost, + &bdver_cost, + &btver1_cost, + &btver2_cost, + &znver1_cost, + &znver2_cost +}; + +/* Guarantee that the array is aligned with enum processor_type. */ +STATIC_ASSERT (ARRAY_SIZE (processor_cost_table) == PROCESSOR_max); + +static bool +ix86_option_override_internal (bool main_args_p, + struct gcc_options *opts, + struct gcc_options *opts_set); +static void +set_ix86_tune_features (enum processor_type ix86_tune, bool dump); + +/* Restore the current options */ + +void +ix86_function_specific_restore (struct gcc_options *opts, + struct cl_target_option *ptr) +{ + enum processor_type old_tune = ix86_tune; + enum processor_type old_arch = ix86_arch; + unsigned HOST_WIDE_INT ix86_arch_mask; + int i; + + /* We don't change -fPIC. */ + opts->x_flag_pic = flag_pic; + + ix86_arch = (enum processor_type) ptr->arch; + ix86_schedule = (enum attr_cpu) ptr->schedule; + ix86_tune = (enum processor_type) ptr->tune; + x86_prefetch_sse = ptr->prefetch_sse; + opts->x_ix86_branch_cost = ptr->branch_cost; + ix86_tune_defaulted = ptr->tune_defaulted; + ix86_arch_specified = ptr->arch_specified; + opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit; + opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit; + opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit; + opts->x_ix86_arch_string = ptr->x_ix86_arch_string; + opts->x_ix86_tune_string = ptr->x_ix86_tune_string; + opts->x_ix86_cmodel = ptr->x_ix86_cmodel; + opts->x_ix86_abi = ptr->x_ix86_abi; + opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect; + opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost; + opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes; + opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer; + opts->x_ix86_force_drap = ptr->x_ix86_force_drap; + opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg; + opts->x_ix86_pmode = ptr->x_ix86_pmode; + opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg; + opts->x_ix86_recip_name = ptr->x_ix86_recip_name; + opts->x_ix86_regparm = ptr->x_ix86_regparm; + opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold; + opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx; + opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard; + opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg; + opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect; + opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string; + opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy; + opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy; + opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default; + opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type; + ix86_tune_cost = 
processor_cost_table[ix86_tune]; + /* TODO: ix86_cost should be chosen at instruction or function granuality + so for cold code we use size_cost even in !optimize_size compilation. */ + if (opts->x_optimize_size) + ix86_cost = &ix86_size_cost; + else + ix86_cost = ix86_tune_cost; + + /* Recreate the arch feature tests if the arch changed */ + if (old_arch != ix86_arch) + { + ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch; + for (i = 0; i < X86_ARCH_LAST; ++i) + ix86_arch_features[i] + = !!(initial_ix86_arch_features[i] & ix86_arch_mask); + } + + /* Recreate the tune optimization tests */ + if (old_tune != ix86_tune) + set_ix86_tune_features (ix86_tune, false); +} + +/* Adjust target options after streaming them in. This is mainly about + reconciling them with global options. */ + +void +ix86_function_specific_post_stream_in (struct cl_target_option *ptr) +{ + /* flag_pic is a global option, but ix86_cmodel is target saved option + partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel + for PIC, or error out. */ + if (flag_pic) + switch (ptr->x_ix86_cmodel) + { + case CM_SMALL: + ptr->x_ix86_cmodel = CM_SMALL_PIC; + break; + + case CM_MEDIUM: + ptr->x_ix86_cmodel = CM_MEDIUM_PIC; + break; + + case CM_LARGE: + ptr->x_ix86_cmodel = CM_LARGE_PIC; + break; + + case CM_KERNEL: + error ("code model %s does not support PIC mode", "kernel"); + break; + + default: + break; + } + else + switch (ptr->x_ix86_cmodel) + { + case CM_SMALL_PIC: + ptr->x_ix86_cmodel = CM_SMALL; + break; + + case CM_MEDIUM_PIC: + ptr->x_ix86_cmodel = CM_MEDIUM; + break; + + case CM_LARGE_PIC: + ptr->x_ix86_cmodel = CM_LARGE; + break; + + default: + break; + } +} + +/* Print the current options */ + +void +ix86_function_specific_print (FILE *file, int indent, + struct cl_target_option *ptr) +{ + char *target_string + = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2, + ptr->x_target_flags, ptr->x_ix86_target_flags, + NULL, NULL, ptr->x_ix86_fpmath, false, true); + + gcc_assert (ptr->arch < PROCESSOR_max); + fprintf (file, "%*sarch = %d (%s)\n", + indent, "", + ptr->arch, processor_names[ptr->arch]); + + gcc_assert (ptr->tune < PROCESSOR_max); + fprintf (file, "%*stune = %d (%s)\n", + indent, "", + ptr->tune, processor_names[ptr->tune]); + + fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost); + + if (target_string) + { + fprintf (file, "%*s%s\n", indent, "", target_string); + free (target_string); + } +} + + +/* Inner function to process the attribute((target(...))), take an argument and + set the current options from the argument. If we have a list, recursively go + over the list. 
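+
+   For example (illustrative attribute strings, not an exhaustive list),
+   all of the following are parsed here:
+
+     __attribute__ ((target ("avx2,fma")))
+     __attribute__ ((target ("no-sse4a,arch=skylake,fpmath=sse")))
+
+   i.e. a comma separated list of ISA names from the table below
+   (optionally prefixed with "no-"), the string options "arch=" and
+   "tune=", the enum option "fpmath=" and flag options such as "recip".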
*/ + +static bool +ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], + struct gcc_options *opts, + struct gcc_options *opts_set, + struct gcc_options *enum_opts_set, + bool target_clone_attr) +{ + char *next_optstr; + bool ret = true; + +#define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 } +#define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 } +#define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 } +#define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M } +#define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M } + + enum ix86_opt_type + { + ix86_opt_unknown, + ix86_opt_yes, + ix86_opt_no, + ix86_opt_str, + ix86_opt_enum, + ix86_opt_isa + }; + + static const struct + { + const char *string; + size_t len; + enum ix86_opt_type type; + int opt; + int mask; + } attrs[] = { + /* isa options */ + IX86_ATTR_ISA ("pconfig", OPT_mpconfig), + IX86_ATTR_ISA ("wbnoinvd", OPT_mwbnoinvd), + IX86_ATTR_ISA ("sgx", OPT_msgx), + IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps), + IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw), + IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq), + IX86_ATTR_ISA ("avx512vbmi2", OPT_mavx512vbmi2), + IX86_ATTR_ISA ("avx512vnni", OPT_mavx512vnni), + IX86_ATTR_ISA ("avx512bitalg", OPT_mavx512bitalg), + + IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi), + IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma), + IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl), + IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw), + IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq), + IX86_ATTR_ISA ("avx512er", OPT_mavx512er), + IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf), + IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd), + IX86_ATTR_ISA ("avx512f", OPT_mavx512f), + IX86_ATTR_ISA ("avx2", OPT_mavx2), + IX86_ATTR_ISA ("fma", OPT_mfma), + IX86_ATTR_ISA ("xop", OPT_mxop), + IX86_ATTR_ISA ("fma4", OPT_mfma4), + IX86_ATTR_ISA ("f16c", OPT_mf16c), + IX86_ATTR_ISA ("avx", OPT_mavx), + IX86_ATTR_ISA ("sse4", OPT_msse4), + IX86_ATTR_ISA ("sse4.2", OPT_msse4_2), + IX86_ATTR_ISA ("sse4.1", OPT_msse4_1), + IX86_ATTR_ISA ("sse4a", OPT_msse4a), + IX86_ATTR_ISA ("ssse3", OPT_mssse3), + IX86_ATTR_ISA ("sse3", OPT_msse3), + IX86_ATTR_ISA ("aes", OPT_maes), + IX86_ATTR_ISA ("sha", OPT_msha), + IX86_ATTR_ISA ("pclmul", OPT_mpclmul), + IX86_ATTR_ISA ("sse2", OPT_msse2), + IX86_ATTR_ISA ("sse", OPT_msse), + IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa), + IX86_ATTR_ISA ("3dnow", OPT_m3dnow), + IX86_ATTR_ISA ("mmx", OPT_mmmx), + IX86_ATTR_ISA ("rtm", OPT_mrtm), + IX86_ATTR_ISA ("prfchw", OPT_mprfchw), + IX86_ATTR_ISA ("rdseed", OPT_mrdseed), + IX86_ATTR_ISA ("adx", OPT_madx), + IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1), + IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt), + IX86_ATTR_ISA ("xsaves", OPT_mxsaves), + IX86_ATTR_ISA ("xsavec", OPT_mxsavec), + IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt), + IX86_ATTR_ISA ("xsave", OPT_mxsave), + IX86_ATTR_ISA ("abm", OPT_mabm), + IX86_ATTR_ISA ("bmi", OPT_mbmi), + IX86_ATTR_ISA ("bmi2", OPT_mbmi2), + IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt), + IX86_ATTR_ISA ("tbm", OPT_mtbm), + IX86_ATTR_ISA ("popcnt", OPT_mpopcnt), + IX86_ATTR_ISA ("cx16", OPT_mcx16), + IX86_ATTR_ISA ("sahf", OPT_msahf), + IX86_ATTR_ISA ("movbe", OPT_mmovbe), + IX86_ATTR_ISA ("crc32", OPT_mcrc32), + IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase), + IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd), + IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx), + IX86_ATTR_ISA ("clzero", OPT_mclzero), + IX86_ATTR_ISA ("pku", OPT_mpku), + IX86_ATTR_ISA ("lwp", OPT_mlwp), + IX86_ATTR_ISA ("hle", 
OPT_mhle), + IX86_ATTR_ISA ("fxsr", OPT_mfxsr), + IX86_ATTR_ISA ("clwb", OPT_mclwb), + IX86_ATTR_ISA ("rdpid", OPT_mrdpid), + IX86_ATTR_ISA ("gfni", OPT_mgfni), + IX86_ATTR_ISA ("shstk", OPT_mshstk), + IX86_ATTR_ISA ("vaes", OPT_mvaes), + IX86_ATTR_ISA ("vpclmulqdq", OPT_mvpclmulqdq), + IX86_ATTR_ISA ("movdiri", OPT_mmovdiri), + IX86_ATTR_ISA ("movdir64b", OPT_mmovdir64b), + IX86_ATTR_ISA ("waitpkg", OPT_mwaitpkg), + IX86_ATTR_ISA ("cldemote", OPT_mcldemote), + IX86_ATTR_ISA ("ptwrite", OPT_mptwrite), + + /* enum options */ + IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_), + + /* string options */ + IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH), + IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE), + + /* flag options */ + IX86_ATTR_YES ("cld", + OPT_mcld, + MASK_CLD), + + IX86_ATTR_NO ("fancy-math-387", + OPT_mfancy_math_387, + MASK_NO_FANCY_MATH_387), + + IX86_ATTR_YES ("ieee-fp", + OPT_mieee_fp, + MASK_IEEE_FP), + + IX86_ATTR_YES ("inline-all-stringops", + OPT_minline_all_stringops, + MASK_INLINE_ALL_STRINGOPS), + + IX86_ATTR_YES ("inline-stringops-dynamically", + OPT_minline_stringops_dynamically, + MASK_INLINE_STRINGOPS_DYNAMICALLY), + + IX86_ATTR_NO ("align-stringops", + OPT_mno_align_stringops, + MASK_NO_ALIGN_STRINGOPS), + + IX86_ATTR_YES ("recip", + OPT_mrecip, + MASK_RECIP), + }; + + location_t loc + = fndecl == NULL ? UNKNOWN_LOCATION : DECL_SOURCE_LOCATION (fndecl); + const char *attr_name = target_clone_attr ? "target_clone" : "target"; + + /* If this is a list, recurse to get the options. */ + if (TREE_CODE (args) == TREE_LIST) + { + bool ret = true; + + for (; args; args = TREE_CHAIN (args)) + if (TREE_VALUE (args) + && !ix86_valid_target_attribute_inner_p (fndecl, TREE_VALUE (args), + p_strings, opts, opts_set, + enum_opts_set, + target_clone_attr)) + ret = false; + + return ret; + } + + else if (TREE_CODE (args) != STRING_CST) + { + error_at (loc, "attribute %qs argument is not a string", attr_name); + return false; + } + + /* Handle multiple arguments separated by commas. */ + next_optstr = ASTRDUP (TREE_STRING_POINTER (args)); + + while (next_optstr && *next_optstr != '\0') + { + char *p = next_optstr; + char *orig_p = p; + char *comma = strchr (next_optstr, ','); + size_t len, opt_len; + int opt; + bool opt_set_p; + char ch; + unsigned i; + enum ix86_opt_type type = ix86_opt_unknown; + int mask = 0; + + if (comma) + { + *comma = '\0'; + len = comma - next_optstr; + next_optstr = comma + 1; + } + else + { + len = strlen (p); + next_optstr = NULL; + } + + /* Recognize no-xxx. */ + if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-') + { + opt_set_p = false; + p += 3; + len -= 3; + } + else + opt_set_p = true; + + /* Find the option. */ + ch = *p; + opt = N_OPTS; + for (i = 0; i < ARRAY_SIZE (attrs); i++) + { + type = attrs[i].type; + opt_len = attrs[i].len; + if (ch == attrs[i].string[0] + && ((type != ix86_opt_str && type != ix86_opt_enum) + ? len == opt_len + : len > opt_len) + && memcmp (p, attrs[i].string, opt_len) == 0) + { + opt = attrs[i].opt; + mask = attrs[i].mask; + break; + } + } + + /* Process the option. 
*/ + if (opt == N_OPTS) + { + error_at (loc, "attribute %qs argument %qs is unknown", + orig_p, attr_name); + ret = false; + } + + else if (type == ix86_opt_isa) + { + struct cl_decoded_option decoded; + + generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded); + ix86_handle_option (opts, opts_set, + &decoded, input_location); + } + + else if (type == ix86_opt_yes || type == ix86_opt_no) + { + if (type == ix86_opt_no) + opt_set_p = !opt_set_p; + + if (opt_set_p) + opts->x_target_flags |= mask; + else + opts->x_target_flags &= ~mask; + } + + else if (type == ix86_opt_str) + { + if (p_strings[opt]) + { + error_at (loc, "attribute value %qs was already specified " + "in %qs attribute", orig_p, attr_name); + ret = false; + } + else + p_strings[opt] = xstrdup (p + opt_len); + } + + else if (type == ix86_opt_enum) + { + bool arg_ok; + int value; + + arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET); + if (arg_ok) + set_option (opts, enum_opts_set, opt, value, + p + opt_len, DK_UNSPECIFIED, input_location, + global_dc); + else + { + error_at (loc, "attribute value %qs is unknown in %qs attribute", + orig_p, attr_name); + ret = false; + } + } + + else + gcc_unreachable (); + } + + return ret; +} + +/* Release allocated strings. */ +static void +release_options_strings (char **option_strings) +{ + /* Free up memory allocated to hold the strings */ + for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++) + free (option_strings[i]); +} + +/* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */ + +tree +ix86_valid_target_attribute_tree (tree fndecl, tree args, + struct gcc_options *opts, + struct gcc_options *opts_set, + bool target_clone_attr) +{ + const char *orig_arch_string = opts->x_ix86_arch_string; + const char *orig_tune_string = opts->x_ix86_tune_string; + enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath; + int orig_tune_defaulted = ix86_tune_defaulted; + int orig_arch_specified = ix86_arch_specified; + char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL }; + tree t = NULL_TREE; + struct cl_target_option *def + = TREE_TARGET_OPTION (target_option_default_node); + struct gcc_options enum_opts_set; + + memset (&enum_opts_set, 0, sizeof (enum_opts_set)); + + /* Process each of the options on the chain. */ + if (!ix86_valid_target_attribute_inner_p (fndecl, args, option_strings, opts, + opts_set, &enum_opts_set, + target_clone_attr)) + return error_mark_node; + + /* If the changed options are different from the default, rerun + ix86_option_override_internal, and then save the options away. + The string options are attribute options, and will be undone + when we copy the save structure. */ + if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags + || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2 + || opts->x_target_flags != def->x_target_flags + || option_strings[IX86_FUNCTION_SPECIFIC_ARCH] + || option_strings[IX86_FUNCTION_SPECIFIC_TUNE] + || enum_opts_set.x_ix86_fpmath) + { + /* If we are using the default tune= or arch=, undo the string assigned, + and use the default. */ + if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]) + { + opts->x_ix86_arch_string + = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]); + + /* If arch= is set, clear all bits in x_ix86_isa_flags, + except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. 
*/ + opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT + | OPTION_MASK_ABI_64 + | OPTION_MASK_ABI_X32 + | OPTION_MASK_CODE16); + opts->x_ix86_isa_flags2 = 0; + } + else if (!orig_arch_specified) + opts->x_ix86_arch_string = NULL; + + if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]) + opts->x_ix86_tune_string + = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]); + else if (orig_tune_defaulted) + opts->x_ix86_tune_string = NULL; + + /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */ + if (enum_opts_set.x_ix86_fpmath) + opts_set->x_ix86_fpmath = (enum fpmath_unit) 1; + + /* Do any overrides, such as arch=xxx, or tune=xxx support. */ + bool r = ix86_option_override_internal (false, opts, opts_set); + if (!r) + { + release_options_strings (option_strings); + return error_mark_node; + } + + /* Add any builtin functions with the new isa if any. */ + ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2); + + /* Save the current options unless we are validating options for + #pragma. */ + t = build_target_option_node (opts); + + opts->x_ix86_arch_string = orig_arch_string; + opts->x_ix86_tune_string = orig_tune_string; + opts_set->x_ix86_fpmath = orig_fpmath_set; + + release_options_strings (option_strings); + } + + return t; +} + +/* Hook to validate attribute((target("string"))). */ + +bool +ix86_valid_target_attribute_p (tree fndecl, + tree ARG_UNUSED (name), + tree args, + int flags) +{ + struct gcc_options func_options; + tree new_target, new_optimize; + bool ret = true; + + /* attribute((target("default"))) does nothing, beyond + affecting multi-versioning. */ + if (TREE_VALUE (args) + && TREE_CODE (TREE_VALUE (args)) == STRING_CST + && TREE_CHAIN (args) == NULL_TREE + && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0) + return true; + + tree old_optimize = build_optimization_node (&global_options); + + /* Get the optimization options of the current function. */ + tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl); + + if (!func_optimize) + func_optimize = old_optimize; + + /* Init func_options. */ + memset (&func_options, 0, sizeof (func_options)); + init_options_struct (&func_options, NULL); + lang_hooks.init_options_struct (&func_options); + + cl_optimization_restore (&func_options, + TREE_OPTIMIZATION (func_optimize)); + + /* Initialize func_options to the default before its target options can + be set. */ + cl_target_option_restore (&func_options, + TREE_TARGET_OPTION (target_option_default_node)); + + /* FLAGS == 1 is used for target_clones attribute. */ + new_target + = ix86_valid_target_attribute_tree (fndecl, args, &func_options, + &global_options_set, flags == 1); + + new_optimize = build_optimization_node (&func_options); + + if (new_target == error_mark_node) + ret = false; + + else if (fndecl && new_target) + { + DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target; + + if (old_optimize != new_optimize) + DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize; + } + + finalize_options_struct (&func_options); + + return ret; +} + +const char *stringop_alg_names[] = { +#define DEF_ENUM +#define DEF_ALG(alg, name) #name, +#include "stringop.def" +#undef DEF_ENUM +#undef DEF_ALG +}; + +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=. 
+ The string is of the following form (or comma separated list of it): + + strategy_alg:max_size:[align|noalign] + + where the full size range for the strategy is either [0, max_size] or + [min_size, max_size], in which min_size is the max_size + 1 of the + preceding range. The last size range must have max_size == -1. + + Examples: + + 1. + -mmemcpy-strategy=libcall:-1:noalign + + this is equivalent to (for known size memcpy) -mstringop-strategy=libcall + + + 2. + -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign + + This is to tell the compiler to use the following strategy for memset + 1) when the expected size is between [1, 16], use rep_8byte strategy; + 2) when the size is between [17, 2048], use vector_loop; + 3) when the size is > 2048, use libcall. */ + +struct stringop_size_range +{ + int max; + stringop_alg alg; + bool noalign; +}; + +static void +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) +{ + const struct stringop_algs *default_algs; + stringop_size_range input_ranges[MAX_STRINGOP_ALGS]; + char *curr_range_str, *next_range_str; + const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="; + int i = 0, n = 0; + + if (is_memset) + default_algs = &ix86_cost->memset[TARGET_64BIT != 0]; + else + default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; + + curr_range_str = strategy_str; + + do + { + int maxs; + char alg_name[128]; + char align[16]; + next_range_str = strchr (curr_range_str, ','); + if (next_range_str) + *next_range_str++ = '\0'; + + if (sscanf (curr_range_str, "%20[^:]:%d:%10s", alg_name, &maxs, + align) != 3) + { + error ("wrong argument %qs to option %qs", curr_range_str, opt); + return; + } + + if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1)) + { + error ("size ranges of option %qs should be increasing", opt); + return; + } + + for (i = 0; i < last_alg; i++) + if (!strcmp (alg_name, stringop_alg_names[i])) + break; + + if (i == last_alg) + { + error ("wrong strategy name %qs specified for option %qs", + alg_name, opt); + + auto_vec candidates; + for (i = 0; i < last_alg; i++) + if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT) + candidates.safe_push (stringop_alg_names[i]); + + char *s; + const char *hint + = candidates_list_and_hint (alg_name, s, candidates); + if (hint) + inform (input_location, + "valid arguments to %qs are: %s; did you mean %qs?", + opt, s, hint); + else + inform (input_location, "valid arguments to %qs are: %s", + opt, s); + XDELETEVEC (s); + return; + } + + if ((stringop_alg) i == rep_prefix_8_byte + && !TARGET_64BIT) + { + /* rep; movq isn't available in 32-bit code. */ + error ("strategy name %qs specified for option %qs " + "not supported for 32-bit code", alg_name, opt); + return; + } + + input_ranges[n].max = maxs; + input_ranges[n].alg = (stringop_alg) i; + if (!strcmp (align, "align")) + input_ranges[n].noalign = false; + else if (!strcmp (align, "noalign")) + input_ranges[n].noalign = true; + else + { + error ("unknown alignment %qs specified for option %qs", align, opt); + return; + } + n++; + curr_range_str = next_range_str; + } + while (curr_range_str); + + if (input_ranges[n - 1].max != -1) + { + error ("the max value for the last size range should be -1" + " for option %qs", opt); + return; + } + + if (n > MAX_STRINGOP_ALGS) + { + error ("too many size ranges specified in option %qs", opt); + return; + } + + /* Now override the default algs array. 
*/ + for (i = 0; i < n; i++) + { + *const_cast(&default_algs->size[i].max) = input_ranges[i].max; + *const_cast(&default_algs->size[i].alg) + = input_ranges[i].alg; + *const_cast(&default_algs->size[i].noalign) + = input_ranges[i].noalign; + } +} + + +/* parse -mtune-ctrl= option. When DUMP is true, + print the features that are explicitly set. */ + +static void +parse_mtune_ctrl_str (bool dump) +{ + if (!ix86_tune_ctrl_string) + return; + + char *next_feature_string = NULL; + char *curr_feature_string = xstrdup (ix86_tune_ctrl_string); + char *orig = curr_feature_string; + int i; + do + { + bool clear = false; + + next_feature_string = strchr (curr_feature_string, ','); + if (next_feature_string) + *next_feature_string++ = '\0'; + if (*curr_feature_string == '^') + { + curr_feature_string++; + clear = true; + } + for (i = 0; i < X86_TUNE_LAST; i++) + { + if (!strcmp (curr_feature_string, ix86_tune_feature_names[i])) + { + ix86_tune_features[i] = !clear; + if (dump) + fprintf (stderr, "Explicitly %s feature %s\n", + clear ? "clear" : "set", ix86_tune_feature_names[i]); + break; + } + } + if (i == X86_TUNE_LAST) + error ("unknown parameter to option %<-mtune-ctrl%>: %s", + clear ? curr_feature_string - 1 : curr_feature_string); + curr_feature_string = next_feature_string; + } + while (curr_feature_string); + free (orig); +} + +/* Helper function to set ix86_tune_features. IX86_TUNE is the + processor type. */ + +static void +set_ix86_tune_features (enum processor_type ix86_tune, bool dump) +{ + unsigned HOST_WIDE_INT ix86_tune_mask = HOST_WIDE_INT_1U << ix86_tune; + int i; + + for (i = 0; i < X86_TUNE_LAST; ++i) + { + if (ix86_tune_no_default) + ix86_tune_features[i] = 0; + else + ix86_tune_features[i] + = !!(initial_ix86_tune_features[i] & ix86_tune_mask); + } + + if (dump) + { + fprintf (stderr, "List of x86 specific tuning parameter names:\n"); + for (i = 0; i < X86_TUNE_LAST; i++) + fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i], + ix86_tune_features[i] ? "on" : "off"); + } + + parse_mtune_ctrl_str (dump); +} + + +/* Default align_* from the processor table. */ + +static void +ix86_default_align (struct gcc_options *opts) +{ + /* -falign-foo without argument: supply one. */ + if (opts->x_flag_align_loops && !opts->x_str_align_loops) + opts->x_str_align_loops = processor_cost_table[ix86_tune]->align_loop; + if (opts->x_flag_align_jumps && !opts->x_str_align_jumps) + opts->x_str_align_jumps = processor_cost_table[ix86_tune]->align_jump; + if (opts->x_flag_align_labels && !opts->x_str_align_labels) + opts->x_str_align_labels = processor_cost_table[ix86_tune]->align_label; + if (opts->x_flag_align_functions && !opts->x_str_align_functions) + opts->x_str_align_functions = processor_cost_table[ix86_tune]->align_func; +} + +/* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */ + +void +ix86_override_options_after_change (void) +{ + ix86_default_align (&global_options); +} + +/* Clear stack slot assignments remembered from previous functions. + This is called from INIT_EXPANDERS once before RTL is emitted for each + function. */ + +static struct machine_function * +ix86_init_machine_status (void) +{ + struct machine_function *f; + + f = ggc_cleared_alloc (); + f->call_abi = ix86_abi; + + return f; +} + +/* Override various settings based on options. If MAIN_ARGS_P, the + options are from the command line, otherwise they are from + attributes. Return true if there's an error related to march + option. 
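+
+   Illustrative examples of the two entry points (the option values are
+   arbitrary): with MAIN_ARGS_P set this validates a command line such
+   as
+
+     gcc -O2 -march=x86-64 -mtune=generic foo.c
+
+   while ix86_valid_target_attribute_tree above re-runs it with
+   MAIN_ARGS_P clear for attributes such as
+
+     __attribute__ ((target ("arch=znver1")))
+
+   which is why several diagnostics below are worded differently for
+   the switch and attribute forms.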
*/ + +static bool +ix86_option_override_internal (bool main_args_p, + struct gcc_options *opts, + struct gcc_options *opts_set) +{ + int i; + unsigned HOST_WIDE_INT ix86_arch_mask; + const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL); + + /* -mrecip options. */ + static struct + { + const char *string; /* option name */ + unsigned int mask; /* mask bits to set */ + } + const recip_options[] = + { + { "all", RECIP_MASK_ALL }, + { "none", RECIP_MASK_NONE }, + { "div", RECIP_MASK_DIV }, + { "sqrt", RECIP_MASK_SQRT }, + { "vec-div", RECIP_MASK_VEC_DIV }, + { "vec-sqrt", RECIP_MASK_VEC_SQRT }, + }; + + + /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if + TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */ + if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32); +#ifdef TARGET_BI_ARCH + else + { +#if TARGET_BI_ARCH == 1 + /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64 + is on and OPTION_MASK_ABI_X32 is off. We turn off + OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by + -mx32. */ + if (TARGET_X32_P (opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64; +#else + /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is + on and OPTION_MASK_ABI_64 is off. We turn off + OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by + -m64 or OPTION_MASK_CODE16 is turned on by -m16. */ + if (TARGET_LP64_P (opts->x_ix86_isa_flags) + || TARGET_16BIT_P (opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32; +#endif + if (TARGET_64BIT_P (opts->x_ix86_isa_flags) + && TARGET_IAMCU_P (opts->x_target_flags)) + sorry ("Intel MCU psABI isn%'t supported in %s mode", + TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit"); + } +#endif + + if (TARGET_X32_P (opts->x_ix86_isa_flags)) + { + /* Always turn on OPTION_MASK_ISA_64BIT and turn off + OPTION_MASK_ABI_64 for TARGET_X32. */ + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT; + opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64; + } + else if (TARGET_16BIT_P (opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT + | OPTION_MASK_ABI_X32 + | OPTION_MASK_ABI_64); + else if (TARGET_LP64_P (opts->x_ix86_isa_flags)) + { + /* Always turn on OPTION_MASK_ISA_64BIT and turn off + OPTION_MASK_ABI_X32 for TARGET_LP64. */ + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT; + opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32; + } + +#ifdef SUBTARGET_OVERRIDE_OPTIONS + SUBTARGET_OVERRIDE_OPTIONS; +#endif + +#ifdef SUBSUBTARGET_OVERRIDE_OPTIONS + SUBSUBTARGET_OVERRIDE_OPTIONS; +#endif + + /* -fPIC is the default for x86_64. */ + if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags)) + opts->x_flag_pic = 2; + + /* Need to check -mtune=generic first. */ + if (opts->x_ix86_tune_string) + { + /* As special support for cross compilers we read -mtune=native + as -mtune=generic. With native compilers we won't see the + -mtune=native, as it was changed by the driver. */ + if (!strcmp (opts->x_ix86_tune_string, "native")) + { + opts->x_ix86_tune_string = "generic"; + } + else if (!strcmp (opts->x_ix86_tune_string, "x86-64")) + warning (OPT_Wdeprecated, + main_args_p + ? 
G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> " + "or %<-mtune=generic%> instead as appropriate") + : G_("% is deprecated; use " + "% or %" + " instead as appropriate")); + } + else + { + if (opts->x_ix86_arch_string) + opts->x_ix86_tune_string = opts->x_ix86_arch_string; + if (!opts->x_ix86_tune_string) + { + opts->x_ix86_tune_string = processor_names[TARGET_CPU_DEFAULT]; + ix86_tune_defaulted = 1; + } + + /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string + or defaulted. We need to use a sensible tune option. */ + if (!strcmp (opts->x_ix86_tune_string, "x86-64")) + { + opts->x_ix86_tune_string = "generic"; + } + } + + if (opts->x_ix86_stringop_alg == rep_prefix_8_byte + && !TARGET_64BIT_P (opts->x_ix86_isa_flags)) + { + /* rep; movq isn't available in 32-bit code. */ + error ("%<-mstringop-strategy=rep_8byte%> not supported for 32-bit code"); + opts->x_ix86_stringop_alg = no_stringop; + } + + if (!opts->x_ix86_arch_string) + opts->x_ix86_arch_string + = TARGET_64BIT_P (opts->x_ix86_isa_flags) + ? "x86-64" : SUBTARGET32_DEFAULT_CPU; + else + ix86_arch_specified = 1; + + if (opts_set->x_ix86_pmode) + { + if ((TARGET_LP64_P (opts->x_ix86_isa_flags) + && opts->x_ix86_pmode == PMODE_SI) + || (!TARGET_64BIT_P (opts->x_ix86_isa_flags) + && opts->x_ix86_pmode == PMODE_DI)) + error ("address mode %qs not supported in the %s bit mode", + TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long", + TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32"); + } + else + opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags) + ? PMODE_DI : PMODE_SI; + + if (!opts_set->x_ix86_abi) + opts->x_ix86_abi = DEFAULT_ABI; + + if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags)) + error ("%<-mabi=ms%> not supported with X32 ABI"); + gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI); + + if ((opts->x_flag_sanitize & SANITIZE_USER_ADDRESS) && opts->x_ix86_abi == MS_ABI) + error ("%<-mabi=ms%> not supported with %<-fsanitize=address%>"); + if ((opts->x_flag_sanitize & SANITIZE_KERNEL_ADDRESS) && opts->x_ix86_abi == MS_ABI) + error ("%<-mabi=ms%> not supported with %<-fsanitize=kernel-address%>"); + if ((opts->x_flag_sanitize & SANITIZE_THREAD) && opts->x_ix86_abi == MS_ABI) + error ("%<-mabi=ms%> not supported with %<-fsanitize=thread%>"); + + /* For targets using ms ABI enable ms-extensions, if not + explicit turned off. For non-ms ABI we turn off this + option. 
*/ + if (!opts_set->x_flag_ms_extensions) + opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI); + + if (opts_set->x_ix86_cmodel) + { + switch (opts->x_ix86_cmodel) + { + case CM_SMALL: + case CM_SMALL_PIC: + if (opts->x_flag_pic) + opts->x_ix86_cmodel = CM_SMALL_PIC; + if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) + error ("code model %qs not supported in the %s bit mode", + "small", "32"); + break; + + case CM_MEDIUM: + case CM_MEDIUM_PIC: + if (opts->x_flag_pic) + opts->x_ix86_cmodel = CM_MEDIUM_PIC; + if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) + error ("code model %qs not supported in the %s bit mode", + "medium", "32"); + else if (TARGET_X32_P (opts->x_ix86_isa_flags)) + error ("code model %qs not supported in x32 mode", + "medium"); + break; + + case CM_LARGE: + case CM_LARGE_PIC: + if (opts->x_flag_pic) + opts->x_ix86_cmodel = CM_LARGE_PIC; + if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) + error ("code model %qs not supported in the %s bit mode", + "large", "32"); + else if (TARGET_X32_P (opts->x_ix86_isa_flags)) + error ("code model %qs not supported in x32 mode", + "large"); + break; + + case CM_32: + if (opts->x_flag_pic) + error ("code model %s does not support PIC mode", "32"); + if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) + error ("code model %qs not supported in the %s bit mode", + "32", "64"); + break; + + case CM_KERNEL: + if (opts->x_flag_pic) + { + error ("code model %s does not support PIC mode", "kernel"); + opts->x_ix86_cmodel = CM_32; + } + if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) + error ("code model %qs not supported in the %s bit mode", + "kernel", "32"); + break; + + default: + gcc_unreachable (); + } + } + else + { + /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the + use of rip-relative addressing. This eliminates fixups that + would otherwise be needed if this object is to be placed in a + DLL, and is essentially just as efficient as direct addressing. */ + if (TARGET_64BIT_P (opts->x_ix86_isa_flags) + && (TARGET_RDOS || TARGET_PECOFF)) + opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1; + else if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) + opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL; + else + opts->x_ix86_cmodel = CM_32; + } + if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL) + { + error ("%<-masm=intel%> not supported in this configuration"); + opts->x_ix86_asm_dialect = ASM_ATT; + } + if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0) + != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0)) + sorry ("%i-bit mode not compiled in", + (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32); + + for (i = 0; i < pta_size; i++) + if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name)) + { + if (!strcmp (opts->x_ix86_arch_string, "generic")) + { + error (main_args_p + ? G_("% CPU can be used only for %<-mtune=%> " + "switch") + : G_("% CPU can be used only for " + "% attribute")); + return false; + } + else if (!strcmp (opts->x_ix86_arch_string, "intel")) + { + error (main_args_p + ? G_("% CPU can be used only for %<-mtune=%> " + "switch") + : G_("% CPU can be used only for " + "% attribute")); + return false; + } + + if (TARGET_64BIT_P (opts->x_ix86_isa_flags) + && !((processor_alias_table[i].flags & PTA_64BIT) != 0)) + { + error ("CPU you selected does not support x86-64 " + "instruction set"); + return false; + } + + ix86_schedule = processor_alias_table[i].schedule; + ix86_arch = processor_alias_table[i].processor; + /* Default cpu tuning to the architecture. 
*/ + ix86_tune = ix86_arch; + + if (((processor_alias_table[i].flags & PTA_MMX) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX; + if (((processor_alias_table[i].flags & PTA_3DNOW) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW; + if (((processor_alias_table[i].flags & PTA_3DNOW_A) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A; + if (((processor_alias_table[i].flags & PTA_SSE) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE; + if (((processor_alias_table[i].flags & PTA_SSE2) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2; + if (((processor_alias_table[i].flags & PTA_SSE3) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3; + if (((processor_alias_table[i].flags & PTA_SSSE3) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3; + if (((processor_alias_table[i].flags & PTA_SSE4_1) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1; + if (((processor_alias_table[i].flags & PTA_SSE4_2) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2; + if (((processor_alias_table[i].flags & PTA_AVX) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX; + if (((processor_alias_table[i].flags & PTA_AVX2) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2; + if (((processor_alias_table[i].flags & PTA_FMA) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA; + if (((processor_alias_table[i].flags & PTA_SSE4A) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A; + if (((processor_alias_table[i].flags & PTA_FMA4) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4; + if (((processor_alias_table[i].flags & PTA_XOP) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP; + if (((processor_alias_table[i].flags & PTA_LWP) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP; + if (((processor_alias_table[i].flags & PTA_ABM) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM; + if (((processor_alias_table[i].flags & PTA_BMI) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI; + if (((processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT; + if (((processor_alias_table[i].flags & PTA_TBM) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM; + if (((processor_alias_table[i].flags & PTA_BMI2) != 0) + && 
!(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2; + if (((processor_alias_table[i].flags & PTA_CX16) != 0) + && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CX16)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CX16; + if (((processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT; + if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags) + && ((processor_alias_table[i].flags & PTA_NO_SAHF) != 0)) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF; + if (((processor_alias_table[i].flags & PTA_MOVBE) != 0) + && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MOVBE)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MOVBE; + if (((processor_alias_table[i].flags & PTA_AES) != 0) + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES)) + ix86_isa_flags |= OPTION_MASK_ISA_AES; + if (((processor_alias_table[i].flags & PTA_SHA) != 0) + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA)) + ix86_isa_flags |= OPTION_MASK_ISA_SHA; + if (((processor_alias_table[i].flags & PTA_PCLMUL) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL; + if (((processor_alias_table[i].flags & PTA_FSGSBASE) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE; + if (((processor_alias_table[i].flags & PTA_RDRND) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND; + if (((processor_alias_table[i].flags & PTA_F16C) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C; + if (((processor_alias_table[i].flags & PTA_RTM) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM; + if (((processor_alias_table[i].flags & PTA_HLE) != 0) + && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_HLE)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_HLE; + if (((processor_alias_table[i].flags & PTA_PRFCHW) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW; + if (((processor_alias_table[i].flags & PTA_RDSEED) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED; + if (((processor_alias_table[i].flags & PTA_ADX) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX; + if (((processor_alias_table[i].flags & PTA_FXSR) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR; + if (((processor_alias_table[i].flags & PTA_XSAVE) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE; + if (((processor_alias_table[i].flags & PTA_XSAVEOPT) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT; + if (((processor_alias_table[i].flags & PTA_AVX512F) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F; + if (((processor_alias_table[i].flags & PTA_AVX512ER) != 0) + && !(opts->x_ix86_isa_flags_explicit & 
OPTION_MASK_ISA_AVX512ER)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER; + if (((processor_alias_table[i].flags & PTA_AVX512PF) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF; + if (((processor_alias_table[i].flags & PTA_AVX512CD) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD; + if (((processor_alias_table[i].flags & PTA_PREFETCHWT1) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1; + if (((processor_alias_table[i].flags & PTA_CLWB) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB; + if (((processor_alias_table[i].flags & PTA_CLFLUSHOPT) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT; + if (((processor_alias_table[i].flags & PTA_CLZERO) != 0) + && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CLZERO)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CLZERO; + if (((processor_alias_table[i].flags & PTA_XSAVEC) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC; + if (((processor_alias_table[i].flags & PTA_XSAVES) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES; + if (((processor_alias_table[i].flags & PTA_AVX512DQ) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ; + if (((processor_alias_table[i].flags & PTA_AVX512BW) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW; + if (((processor_alias_table[i].flags & PTA_AVX512VL) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL; + if (((processor_alias_table[i].flags & PTA_AVX512VBMI) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI; + if (((processor_alias_table[i].flags & PTA_AVX512IFMA) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA; + if (((processor_alias_table[i].flags & PTA_AVX512VNNI) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VNNI)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VNNI; + if (((processor_alias_table[i].flags & PTA_GFNI) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_GFNI)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_GFNI; + if (((processor_alias_table[i].flags & PTA_AVX512VBMI2) != 0) + && !(opts->x_ix86_isa_flags_explicit + & OPTION_MASK_ISA_AVX512VBMI2)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI2; + if (((processor_alias_table[i].flags & PTA_VPCLMULQDQ) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_VPCLMULQDQ)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_VPCLMULQDQ; + if (((processor_alias_table[i].flags & PTA_AVX512BITALG) != 0) + && !(opts->x_ix86_isa_flags_explicit + & OPTION_MASK_ISA_AVX512BITALG)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BITALG; + + if (((processor_alias_table[i].flags & PTA_AVX5124VNNIW) != 0) + && !(opts->x_ix86_isa_flags2_explicit + & OPTION_MASK_ISA_AVX5124VNNIW)) + 
opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW; + if (((processor_alias_table[i].flags & PTA_AVX5124FMAPS) != 0) + && !(opts->x_ix86_isa_flags2_explicit + & OPTION_MASK_ISA_AVX5124FMAPS)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS; + if (((processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ) != 0) + && !(opts->x_ix86_isa_flags_explicit + & OPTION_MASK_ISA_AVX512VPOPCNTDQ)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VPOPCNTDQ; + if (((processor_alias_table[i].flags & PTA_SGX) != 0) + && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX; + if (((processor_alias_table[i].flags & PTA_VAES) != 0) + && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_VAES)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_VAES; + if (((processor_alias_table[i].flags & PTA_RDPID) != 0) + && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_RDPID)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_RDPID; + if (((processor_alias_table[i].flags & PTA_PCONFIG) != 0) + && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_PCONFIG)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_PCONFIG; + if (((processor_alias_table[i].flags & PTA_WBNOINVD) != 0) + && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_WBNOINVD)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_WBNOINVD; + if (((processor_alias_table[i].flags & PTA_PTWRITE) != 0) + && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_PTWRITE)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_PTWRITE; + + if ((processor_alias_table[i].flags + & (PTA_PREFETCH_SSE | PTA_SSE)) != 0) + x86_prefetch_sse = true; + if (((processor_alias_table[i].flags & PTA_MWAITX) != 0) + && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MWAITX)) + opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MWAITX; + if (((processor_alias_table[i].flags & PTA_PKU) != 0) + && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU)) + opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU; + + /* Don't enable x87 instructions if only + general registers are allowed. */ + if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY) + && !(opts_set->x_target_flags & MASK_80387)) + { + if (((processor_alias_table[i].flags & PTA_NO_80387) != 0)) + opts->x_target_flags &= ~MASK_80387; + else + opts->x_target_flags |= MASK_80387; + } + break; + } + + if (i == pta_size) + { + error (main_args_p + ? G_("bad value (%qs) for %<-march=%> switch") + : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"), + opts->x_ix86_arch_string); + + auto_vec <const char *> candidates; + for (i = 0; i < pta_size; i++) + if (strcmp (processor_alias_table[i].name, "generic") + && strcmp (processor_alias_table[i].name, "intel") + && (!TARGET_64BIT_P (opts->x_ix86_isa_flags) + || ((processor_alias_table[i].flags & PTA_64BIT) != 0))) + candidates.safe_push (processor_alias_table[i].name); + +#ifdef HAVE_LOCAL_CPU_DETECT + /* Add also "native" as possible value. */ + candidates.safe_push ("native"); +#endif + + char *s; + const char *hint + = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates); + if (hint) + inform (input_location, + main_args_p + ? G_("valid arguments to %<-march=%> switch are: " + "%s; did you mean %qs?") + : G_("valid arguments to %<target(\"arch=\")%> attribute are: " + "%s; did you mean %qs?"), s, hint); + else + inform (input_location, + main_args_p + ? 
G_("valid arguments to %<-march=%> switch are: %s") + : G_("valid arguments to % attribute " + "are: %s"), s); + XDELETEVEC (s); + } + + ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch; + for (i = 0; i < X86_ARCH_LAST; ++i) + ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask); + + for (i = 0; i < pta_size; i++) + if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name)) + { + ix86_schedule = processor_alias_table[i].schedule; + ix86_tune = processor_alias_table[i].processor; + if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) + { + if (!((processor_alias_table[i].flags & PTA_64BIT) != 0)) + { + if (ix86_tune_defaulted) + { + opts->x_ix86_tune_string = "x86-64"; + for (i = 0; i < pta_size; i++) + if (! strcmp (opts->x_ix86_tune_string, + processor_alias_table[i].name)) + break; + ix86_schedule = processor_alias_table[i].schedule; + ix86_tune = processor_alias_table[i].processor; + } + else + error ("CPU you selected does not support x86-64 " + "instruction set"); + } + } + /* Intel CPUs have always interpreted SSE prefetch instructions as + NOPs; so, we can enable SSE prefetch instructions even when + -mtune (rather than -march) points us to a processor that has them. + However, the VIA C3 gives a SIGILL, so we only do that for i686 and + higher processors. */ + if (TARGET_CMOV + && ((processor_alias_table[i].flags + & (PTA_PREFETCH_SSE | PTA_SSE)) != 0)) + x86_prefetch_sse = true; + break; + } + + if (ix86_tune_specified && i == pta_size) + { + error (main_args_p + ? G_("bad value (%qs) for %<-mtune=%> switch") + : G_("bad value (%qs) for % attribute"), + opts->x_ix86_tune_string); + + auto_vec candidates; + for (i = 0; i < pta_size; i++) + if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) + || ((processor_alias_table[i].flags & PTA_64BIT) != 0)) + candidates.safe_push (processor_alias_table[i].name); + +#ifdef HAVE_LOCAL_CPU_DETECT + /* Add also "native" as possible value. */ + candidates.safe_push ("native"); +#endif + + char *s; + const char *hint + = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates); + if (hint) + inform (input_location, + main_args_p + ? G_("valid arguments to %<-mtune=%> switch are: " + "%s; did you mean %qs?") + : G_("valid arguments to % attribute are: " + "%s; did you mean %qs?"), s, hint); + else + inform (input_location, + main_args_p + ? G_("valid arguments to %<-mtune=%> switch are: %s") + : G_("valid arguments to % attribute " + "are: %s"), s); + XDELETEVEC (s); + } + + set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes); + +#ifndef USE_IX86_FRAME_POINTER +#define USE_IX86_FRAME_POINTER 0 +#endif + +#ifndef USE_X86_64_FRAME_POINTER +#define USE_X86_64_FRAME_POINTER 0 +#endif + + /* Set the default values for switches whose default depends on TARGET_64BIT + in case they weren't overwritten by command line options. 
*/ + if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) + { + if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer) + opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER; + if (opts->x_flag_asynchronous_unwind_tables + && !opts_set->x_flag_unwind_tables + && TARGET_64BIT_MS_ABI) + opts->x_flag_unwind_tables = 1; + if (opts->x_flag_asynchronous_unwind_tables == 2) + opts->x_flag_unwind_tables + = opts->x_flag_asynchronous_unwind_tables = 1; + if (opts->x_flag_pcc_struct_return == 2) + opts->x_flag_pcc_struct_return = 0; + } + else + { + if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer) + opts->x_flag_omit_frame_pointer + = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size); + if (opts->x_flag_asynchronous_unwind_tables == 2) + opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER; + if (opts->x_flag_pcc_struct_return == 2) + { + /* Intel MCU psABI specifies that -freg-struct-return should + be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1, + we check -miamcu so that -freg-struct-return is always + turned on if -miamcu is used. */ + if (TARGET_IAMCU_P (opts->x_target_flags)) + opts->x_flag_pcc_struct_return = 0; + else + opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN; + } + } + + ix86_tune_cost = processor_cost_table[ix86_tune]; + /* TODO: ix86_cost should be chosen at instruction or function granuality + so for cold code we use size_cost even in !optimize_size compilation. */ + if (opts->x_optimize_size) + ix86_cost = &ix86_size_cost; + else + ix86_cost = ix86_tune_cost; + + /* Arrange to set up i386_stack_locals for all functions. */ + init_machine_status = ix86_init_machine_status; + + /* Validate -mregparm= value. */ + if (opts_set->x_ix86_regparm) + { + if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) + warning (0, "%<-mregparm%> is ignored in 64-bit mode"); + else if (TARGET_IAMCU_P (opts->x_target_flags)) + warning (0, "%<-mregparm%> is ignored for Intel MCU psABI"); + if (opts->x_ix86_regparm > REGPARM_MAX) + { + error ("%<-mregparm=%d%> is not between 0 and %d", + opts->x_ix86_regparm, REGPARM_MAX); + opts->x_ix86_regparm = 0; + } + } + if (TARGET_IAMCU_P (opts->x_target_flags) + || TARGET_64BIT_P (opts->x_ix86_isa_flags)) + opts->x_ix86_regparm = REGPARM_MAX; + + /* Default align_* from the processor table. */ + ix86_default_align (opts); + + /* Provide default for -mbranch-cost= value. */ + if (!opts_set->x_ix86_branch_cost) + opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost; + + if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) + { + opts->x_target_flags + |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags; + + if (!ix86_arch_specified) + opts->x_ix86_isa_flags + |= TARGET_SUBTARGET64_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit; + + if (TARGET_RTD_P (opts->x_target_flags)) + warning (0, + main_args_p + ? G_("%<-mrtd%> is ignored in 64bit mode") + : G_("% is ignored in 64bit mode")); + } + else + { + opts->x_target_flags + |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags; + + if (!ix86_arch_specified) + opts->x_ix86_isa_flags + |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit; + + /* i386 ABI does not specify red zone. It still makes sense to use it + when programmer takes care to stack from being destroyed. */ + if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE)) + opts->x_target_flags |= MASK_NO_RED_ZONE; + } + + /* Keep nonleaf frame pointers. 
*/ + if (opts->x_flag_omit_frame_pointer) + opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER; + else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags)) + opts->x_flag_omit_frame_pointer = 1; + + /* If we're doing fast math, we don't care about comparison order + wrt NaNs. This lets us use a shorter comparison sequence. */ + if (opts->x_flag_finite_math_only) + opts->x_target_flags &= ~MASK_IEEE_FP; + + /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387, + since the insns won't need emulation. */ + if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387]) + opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387; + + /* Likewise, if the target doesn't have a 387, or we've specified + software floating point, don't use 387 inline intrinsics. */ + if (!TARGET_80387_P (opts->x_target_flags)) + opts->x_target_flags |= MASK_NO_FANCY_MATH_387; + + /* Turn on MMX builtins for -msse. */ + if (TARGET_SSE_P (opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags + |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit; + + /* Enable SSE prefetch. */ + if (TARGET_SSE_P (opts->x_ix86_isa_flags) + || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags) + && !TARGET_3DNOW_P (opts->x_ix86_isa_flags)) + || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags)) + x86_prefetch_sse = true; + + /* Enable popcnt instruction for -msse4.2 or -mabm. */ + if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags) + || TARGET_ABM_P (opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags + |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit; + + /* Enable lzcnt instruction for -mabm. */ + if (TARGET_ABM_P(opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags + |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit; + + /* Disable BMI, BMI2 and TBM instructions for -m16. */ + if (TARGET_16BIT_P(opts->x_ix86_isa_flags)) + opts->x_ix86_isa_flags + &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM) + & ~opts->x_ix86_isa_flags_explicit); + + /* Validate -mpreferred-stack-boundary= value or default it to + PREFERRED_STACK_BOUNDARY_DEFAULT. */ + ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT; + if (opts_set->x_ix86_preferred_stack_boundary_arg) + { + int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2; + int max = TARGET_SEH ? 4 : 12; + + if (opts->x_ix86_preferred_stack_boundary_arg < min + || opts->x_ix86_preferred_stack_boundary_arg > max) + { + if (min == max) + error ("%<-mpreferred-stack-boundary%> is not supported " + "for this target"); + else + error ("%<-mpreferred-stack-boundary=%d%> is not between %d and %d", + opts->x_ix86_preferred_stack_boundary_arg, min, max); + } + else + ix86_preferred_stack_boundary + = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT; + } + + /* Set the default value for -mstackrealign. */ + if (!opts_set->x_ix86_force_align_arg_pointer) + opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT; + + ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY; + + /* Validate -mincoming-stack-boundary= value or default it to + MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */ + ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary; + if (opts_set->x_ix86_incoming_stack_boundary_arg) + { + int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 
3 : 2; + + if (opts->x_ix86_incoming_stack_boundary_arg < min + || opts->x_ix86_incoming_stack_boundary_arg > 12) + error ("%<-mincoming-stack-boundary=%d%> is not between %d and 12", + opts->x_ix86_incoming_stack_boundary_arg, min); + else + { + ix86_user_incoming_stack_boundary + = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT; + ix86_incoming_stack_boundary + = ix86_user_incoming_stack_boundary; + } + } + +#ifndef NO_PROFILE_COUNTERS + if (flag_nop_mcount) + error ("%<-mnop-mcount%> is not compatible with this target"); +#endif + if (flag_nop_mcount && flag_pic) + error ("%<-mnop-mcount%> is not implemented for %<-fPIC%>"); + + /* Accept -msseregparm only if at least SSE support is enabled. */ + if (TARGET_SSEREGPARM_P (opts->x_target_flags) + && ! TARGET_SSE_P (opts->x_ix86_isa_flags)) + error (main_args_p + ? G_("%<-msseregparm%> used without SSE enabled") + : G_("% used without SSE enabled")); + + if (opts_set->x_ix86_fpmath) + { + if (opts->x_ix86_fpmath & FPMATH_SSE) + { + if (!TARGET_SSE_P (opts->x_ix86_isa_flags)) + { + if (TARGET_80387_P (opts->x_target_flags)) + { + warning (0, "SSE instruction set disabled, using 387 arithmetics"); + opts->x_ix86_fpmath = FPMATH_387; + } + } + else if ((opts->x_ix86_fpmath & FPMATH_387) + && !TARGET_80387_P (opts->x_target_flags)) + { + warning (0, "387 instruction set disabled, using SSE arithmetics"); + opts->x_ix86_fpmath = FPMATH_SSE; + } + } + } + /* For all chips supporting SSE2, -mfpmath=sse performs better than + fpmath=387. The second is however default at many targets since the + extra 80bit precision of temporaries is considered to be part of ABI. + Overwrite the default at least for -ffast-math. + TODO: -mfpmath=both seems to produce same performing code with bit + smaller binaries. It is however not clear if register allocation is + ready for this setting. + Also -mfpmath=387 is overall a lot more compact (bout 4-5%) than SSE + codegen. We may switch to 387 with -ffast-math for size optimized + functions. */ + else if (fast_math_flags_set_p (&global_options) + && TARGET_SSE2_P (opts->x_ix86_isa_flags)) + opts->x_ix86_fpmath = FPMATH_SSE; + else + opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags); + + /* Use external vectorized library in vectorizing intrinsics. */ + if (opts_set->x_ix86_veclibabi_type) + switch (opts->x_ix86_veclibabi_type) + { + case ix86_veclibabi_type_svml: + ix86_veclib_handler = &ix86_veclibabi_svml; + break; + + case ix86_veclibabi_type_acml: + ix86_veclib_handler = &ix86_veclibabi_acml; + break; + + default: + gcc_unreachable (); + } + + if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS] + && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) + opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; + + /* If stack probes are required, the space used for large function + arguments on the stack must also be probed, so enable + -maccumulate-outgoing-args so this happens in the prologue. */ + if (TARGET_STACK_PROBE_P (opts->x_target_flags) + && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) + { + if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS) + warning (0, + main_args_p + ? G_("stack probing requires %<-maccumulate-outgoing-args%> " + "for correctness") + : G_("stack probing requires " + "% for " + "correctness")); + opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; + } + + /* Stack realignment without -maccumulate-outgoing-args requires %ebp, + so enable -maccumulate-outgoing-args when %ebp is fixed. 
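The preferred- and incoming-stack-boundary checks above share one piece of arithmetic: the option argument is a log2 byte count that is stored internally in bits. A small standalone illustration (the helper name is invented; BITS_PER_UNIT is 8 on this target):

   /* Sketch: -mpreferred-stack-boundary=4 -> 2**4 = 16 bytes = 128 bits.  */
   static unsigned int
   boundary_arg_to_bits (int arg)
   {
     const unsigned int bits_per_unit = 8;
     return (1u << arg) * bits_per_unit;   /* e.g. 2 -> 32, 3 -> 64, 4 -> 128 */
   }

128 bits is also the threshold the later STV check compares ix86_preferred_stack_boundary against.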
*/ + if (fixed_regs[BP_REG] + && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) + { + if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS) + warning (0, + main_args_p + ? G_("fixed ebp register requires " + "%<-maccumulate-outgoing-args%>") + : G_("fixed ebp register requires " + "%<target(\"accumulate-outgoing-args\")%>")); + opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; + } + + /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */ + { + char *p; + ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0); + p = strchr (internal_label_prefix, 'X'); + internal_label_prefix_len = p - internal_label_prefix; + *p = '\0'; + } + + /* When scheduling description is not available, disable scheduler pass + so it won't slow down the compilation and make x87 code slower. */ + if (!TARGET_SCHEDULE) + opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0; + + maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES, + ix86_tune_cost->simultaneous_prefetches, + opts->x_param_values, + opts_set->x_param_values); + maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, + ix86_tune_cost->prefetch_block, + opts->x_param_values, + opts_set->x_param_values); + maybe_set_param_value (PARAM_L1_CACHE_SIZE, + ix86_tune_cost->l1_cache_size, + opts->x_param_values, + opts_set->x_param_values); + maybe_set_param_value (PARAM_L2_CACHE_SIZE, + ix86_tune_cost->l2_cache_size, + opts->x_param_values, + opts_set->x_param_values); + + /* Enable sw prefetching at -O3 for CPUS that prefetching is helpful. */ + if (opts->x_flag_prefetch_loop_arrays < 0 + && HAVE_prefetch + && (opts->x_optimize >= 3 || opts->x_flag_profile_use) + && !opts->x_optimize_size + && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL) + opts->x_flag_prefetch_loop_arrays = 1; + + /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0) + can be opts->x_optimized to ap = __builtin_next_arg (0). 
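The internal_label_prefix block above generates a sample label from the string "LX", locates the first 'X' in the result, and keeps everything before it as the prefix. A standalone sketch of the same cut-at-'X' trick, with an invented stand-in for the target macro (the ".LX0" spelling here is only an example; the real decoration depends on the assembler):

   #include <stdio.h>
   #include <string.h>

   /* Sketch: recover a label prefix from a sample label containing 'X'.  */
   static size_t
   label_prefix_len (char *buf, size_t bufsz)
   {
     snprintf (buf, bufsz, ".%s%d", "LX", 0);   /* stand-in; yields ".LX0" here */
     char *p = strchr (buf, 'X');               /* cut point chosen by the caller */
     *p = '\0';                                 /* buf now holds the prefix ".L" */
     return (size_t) (p - buf);
   }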
*/ + if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack) + targetm.expand_builtin_va_start = NULL; + + if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) + { + ix86_gen_leave = gen_leave_rex64; + if (Pmode == DImode) + { + ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di; + ix86_gen_tls_local_dynamic_base_64 + = gen_tls_local_dynamic_base_64_di; + } + else + { + ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si; + ix86_gen_tls_local_dynamic_base_64 + = gen_tls_local_dynamic_base_64_si; + } + } + else + ix86_gen_leave = gen_leave; + + if (Pmode == DImode) + { + ix86_gen_add3 = gen_adddi3; + ix86_gen_sub3 = gen_subdi3; + ix86_gen_sub3_carry = gen_subdi3_carry; + ix86_gen_one_cmpl2 = gen_one_cmpldi2; + ix86_gen_andsp = gen_anddi3; + ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di; + ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi; + ix86_gen_probe_stack_range = gen_probe_stack_rangedi; + ix86_gen_monitor = gen_sse3_monitor_di; + ix86_gen_monitorx = gen_monitorx_di; + ix86_gen_clzero = gen_clzero_di; + } + else + { + ix86_gen_add3 = gen_addsi3; + ix86_gen_sub3 = gen_subsi3; + ix86_gen_sub3_carry = gen_subsi3_carry; + ix86_gen_one_cmpl2 = gen_one_cmplsi2; + ix86_gen_andsp = gen_andsi3; + ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si; + ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi; + ix86_gen_probe_stack_range = gen_probe_stack_rangesi; + ix86_gen_monitor = gen_sse3_monitor_si; + ix86_gen_monitorx = gen_monitorx_si; + ix86_gen_clzero = gen_clzero_si; + } + +#ifdef USE_IX86_CLD + /* Use -mcld by default for 32-bit code if configured with --enable-cld. */ + if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) + opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags; +#endif + + /* Set the default value for -mfentry. */ + if (!opts_set->x_flag_fentry) + opts->x_flag_fentry = TARGET_SEH; + else + { + if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic + && opts->x_flag_fentry) + sorry ("%<-mfentry%> isn%'t supported for 32-bit in combination " + "with %<-fpic%>"); + else if (TARGET_SEH && !opts->x_flag_fentry) + sorry ("%<-mno-fentry%> isn%'t compatible with SEH"); + } + + if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES) + sorry ("%<-mcall-ms2sysv-xlogues%> isn%'t currently supported with SEH"); + + if (!(opts_set->x_target_flags & MASK_VZEROUPPER) + && TARGET_EMIT_VZEROUPPER) + opts->x_target_flags |= MASK_VZEROUPPER; + if (!(opts_set->x_target_flags & MASK_STV)) + opts->x_target_flags |= MASK_STV; + /* Disable STV if -mpreferred-stack-boundary={2,3} or + -mincoming-stack-boundary={2,3} or -mstackrealign - the needed + stack realignment will be extra cost the pass doesn't take into + account and the pass can't realign the stack. */ + if (ix86_preferred_stack_boundary < 128 + || ix86_incoming_stack_boundary < 128 + || opts->x_ix86_force_align_arg_pointer) + opts->x_target_flags &= ~MASK_STV; + if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL] + && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD)) + opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD; + if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL] + && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE)) + opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE; + + /* Enable 128-bit AVX instruction generation + for the auto-vectorizer. 
*/ + if (TARGET_AVX128_OPTIMAL + && (opts_set->x_prefer_vector_width_type == PVW_NONE)) + opts->x_prefer_vector_width_type = PVW_AVX128; + + /* Use 256-bit AVX instruction generation + in the auto-vectorizer. */ + if (ix86_tune_features[X86_TUNE_AVX256_OPTIMAL] + && (opts_set->x_prefer_vector_width_type == PVW_NONE)) + opts->x_prefer_vector_width_type = PVW_AVX256; + + if (opts->x_ix86_recip_name) + { + char *p = ASTRDUP (opts->x_ix86_recip_name); + char *q; + unsigned int mask, i; + bool invert; + + while ((q = strtok (p, ",")) != NULL) + { + p = NULL; + if (*q == '!') + { + invert = true; + q++; + } + else + invert = false; + + if (!strcmp (q, "default")) + mask = RECIP_MASK_ALL; + else + { + for (i = 0; i < ARRAY_SIZE (recip_options); i++) + if (!strcmp (q, recip_options[i].string)) + { + mask = recip_options[i].mask; + break; + } + + if (i == ARRAY_SIZE (recip_options)) + { + error ("unknown option for %<-mrecip=%s%>", q); + invert = false; + mask = RECIP_MASK_NONE; + } + } + + opts->x_recip_mask_explicit |= mask; + if (invert) + opts->x_recip_mask &= ~mask; + else + opts->x_recip_mask |= mask; + } + } + + if (TARGET_RECIP_P (opts->x_target_flags)) + opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit; + else if (opts_set->x_target_flags & MASK_RECIP) + opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit); + + /* Default long double to 64-bit for 32-bit Bionic and to __float128 + for 64-bit Bionic. Also default long double to 64-bit for Intel + MCU psABI. */ + if ((TARGET_HAS_BIONIC || TARGET_IAMCU) + && !(opts_set->x_target_flags + & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128))) + opts->x_target_flags |= (TARGET_64BIT + ? MASK_LONG_DOUBLE_128 + : MASK_LONG_DOUBLE_64); + + /* Only one of them can be active. */ + gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0 + || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0); + + /* Handle stack protector */ + if (!opts_set->x_ix86_stack_protector_guard) + { +#ifdef TARGET_THREAD_SSP_OFFSET + if (!TARGET_HAS_BIONIC) + opts->x_ix86_stack_protector_guard = SSP_TLS; + else +#endif + opts->x_ix86_stack_protector_guard = SSP_GLOBAL; + } + + if (opts_set->x_ix86_stack_protector_guard_offset_str) + { + char *endp; + const char *str = opts->x_ix86_stack_protector_guard_offset_str; + + errno = 0; + int64_t offset; + +#if defined(INT64_T_IS_LONG) + offset = strtol (str, &endp, 0); +#else + offset = strtoll (str, &endp, 0); +#endif + + if (!*str || *endp || errno) + error ("%qs is not a valid number " + "in %<-mstack-protector-guard-offset=%>", str); + + if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000), + HOST_WIDE_INT_C (0x7fffffff))) + error ("%qs is not a valid offset " + "in %<-mstack-protector-guard-offset=%>", str); + + opts->x_ix86_stack_protector_guard_offset = offset; + } +#ifdef TARGET_THREAD_SSP_OFFSET + else + opts->x_ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET; +#endif + + if (opts_set->x_ix86_stack_protector_guard_reg_str) + { + const char *str = opts->x_ix86_stack_protector_guard_reg_str; + addr_space_t seg = ADDR_SPACE_GENERIC; + + /* Discard optional register prefix. 
*/ + if (str[0] == '%') + str++; + + if (strlen (str) == 2 && str[1] == 's') + { + if (str[0] == 'f') + seg = ADDR_SPACE_SEG_FS; + else if (str[0] == 'g') + seg = ADDR_SPACE_SEG_GS; + } + + if (seg == ADDR_SPACE_GENERIC) + error ("%qs is not a valid base register " + "in %<-mstack-protector-guard-reg=%>", + opts->x_ix86_stack_protector_guard_reg_str); + + opts->x_ix86_stack_protector_guard_reg = seg; + } + else + { + opts->x_ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG; + + /* The kernel uses a different segment register for performance + reasons; a system call would not have to trash the userspace + segment register, which would be expensive. */ + if (opts->x_ix86_cmodel == CM_KERNEL) + opts->x_ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS; + } + + /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ + if (opts->x_ix86_tune_memcpy_strategy) + { + char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy); + ix86_parse_stringop_strategy_string (str, false); + free (str); + } + + if (opts->x_ix86_tune_memset_strategy) + { + char *str = xstrdup (opts->x_ix86_tune_memset_strategy); + ix86_parse_stringop_strategy_string (str, true); + free (str); + } + + /* Save the initial options in case the user does function specific + options. */ + if (main_args_p) + target_option_default_node = target_option_current_node + = build_target_option_node (opts); + + if (opts->x_flag_cf_protection != CF_NONE) + opts->x_flag_cf_protection + = (cf_protection_level) (opts->x_flag_cf_protection | CF_SET); + + if (ix86_tune_features [X86_TUNE_AVOID_128FMA_CHAINS]) + maybe_set_param_value (PARAM_AVOID_FMA_MAX_BITS, 128, + opts->x_param_values, + opts_set->x_param_values); + + /* PR86952: jump table usage with retpolines is slow. + The PR provides some numbers about the slowness. */ + if (ix86_indirect_branch != indirect_branch_keep + && !opts_set->x_flag_jump_tables) + opts->x_flag_jump_tables = 0; + + return true; +} + +/* Implement the TARGET_OPTION_OVERRIDE hook. */ + +void +ix86_option_override (void) +{ + ix86_option_override_internal (true, &global_options, &global_options_set); +} + +/* Remember the last target of ix86_set_current_function. */ +static GTY(()) tree ix86_previous_fndecl; + +/* Set targets globals to the default (or current #pragma GCC target + if active). Invalidate ix86_previous_fndecl cache. */ + +void +ix86_reset_previous_fndecl (void) +{ + tree new_tree = target_option_current_node; + cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree)); + if (TREE_TARGET_GLOBALS (new_tree)) + restore_target_globals (TREE_TARGET_GLOBALS (new_tree)); + else if (new_tree == target_option_default_node) + restore_target_globals (&default_target_globals); + else + TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts (); + ix86_previous_fndecl = NULL_TREE; +} + +/* Add target attribute to SIMD clone NODE if needed. */ + +void +ix86_simd_clone_adjust (struct cgraph_node *node) +{ + const char *str = NULL; + + /* Attributes need to be adjusted for definitions, not declarations. 
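Earlier in this hunk, the -mstack-protector-guard-reg= argument is parsed by skipping an optional "%" and then accepting exactly "fs" or "gs", each mapped to a segment address space. A compact standalone restatement of that parse (the enum and function name are invented; the real code yields ADDR_SPACE_SEG_FS / ADDR_SPACE_SEG_GS and reports an error for anything else):

   #include <string.h>

   enum guard_seg { GUARD_SEG_NONE, GUARD_SEG_FS, GUARD_SEG_GS };

   /* Sketch of the -mstack-protector-guard-reg= parse.  */
   static enum guard_seg
   parse_guard_reg (const char *str)
   {
     if (str[0] == '%')        /* optional register prefix, e.g. "%gs" */
       str++;
     if (strcmp (str, "fs") == 0)
       return GUARD_SEG_FS;
     if (strcmp (str, "gs") == 0)
       return GUARD_SEG_GS;
     return GUARD_SEG_NONE;    /* caller treats this as an invalid base register */
   }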
*/ + if (!node->definition) + return; + + gcc_assert (node->decl == cfun->decl); + switch (node->simdclone->vecsize_mangle) + { + case 'b': + if (!TARGET_SSE2) + str = "sse2"; + break; + case 'c': + if (!TARGET_AVX) + str = "avx"; + break; + case 'd': + if (!TARGET_AVX2) + str = "avx2"; + break; + case 'e': + if (!TARGET_AVX512F) + str = "avx512f"; + break; + default: + gcc_unreachable (); + } + if (str == NULL) + return; + push_cfun (NULL); + tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str)); + bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0); + gcc_assert (ok); + pop_cfun (); + ix86_reset_previous_fndecl (); + ix86_set_current_function (node->decl); +} + + + +/* Set the func_type field from the function FNDECL. */ + +static void +ix86_set_func_type (tree fndecl) +{ + if (cfun->machine->func_type == TYPE_UNKNOWN) + { + if (lookup_attribute ("interrupt", + TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))) + { + if (ix86_function_naked (fndecl)) + error_at (DECL_SOURCE_LOCATION (fndecl), + "interrupt and naked attributes are not compatible"); + + int nargs = 0; + for (tree arg = DECL_ARGUMENTS (fndecl); + arg; + arg = TREE_CHAIN (arg)) + nargs++; + cfun->machine->no_caller_saved_registers = true; + cfun->machine->func_type + = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT; + + ix86_optimize_mode_switching[X86_DIRFLAG] = 1; + + /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */ + if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG) + sorry ("only DWARF debug format is supported for interrupt " + "service routine"); + } + else + { + cfun->machine->func_type = TYPE_NORMAL; + if (lookup_attribute ("no_caller_saved_registers", + TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))) + cfun->machine->no_caller_saved_registers = true; + } + } +} + +/* Set the indirect_branch_type field from the function FNDECL. */ + +static void +ix86_set_indirect_branch_type (tree fndecl) +{ + if (cfun->machine->indirect_branch_type == indirect_branch_unset) + { + tree attr = lookup_attribute ("indirect_branch", + DECL_ATTRIBUTES (fndecl)); + if (attr != NULL) + { + tree args = TREE_VALUE (attr); + if (args == NULL) + gcc_unreachable (); + tree cst = TREE_VALUE (args); + if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0) + cfun->machine->indirect_branch_type = indirect_branch_keep; + else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0) + cfun->machine->indirect_branch_type = indirect_branch_thunk; + else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0) + cfun->machine->indirect_branch_type = indirect_branch_thunk_inline; + else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0) + cfun->machine->indirect_branch_type = indirect_branch_thunk_extern; + else + gcc_unreachable (); + } + else + cfun->machine->indirect_branch_type = ix86_indirect_branch; + + /* -mcmodel=large is not compatible with -mindirect-branch=thunk + nor -mindirect-branch=thunk-extern. */ + if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC) + && ((cfun->machine->indirect_branch_type + == indirect_branch_thunk_extern) + || (cfun->machine->indirect_branch_type + == indirect_branch_thunk))) + error ("%<-mindirect-branch=%s%> and %<-mcmodel=large%> are not " + "compatible", + ((cfun->machine->indirect_branch_type + == indirect_branch_thunk_extern) + ? 
"thunk-extern" : "thunk")); + + if (cfun->machine->indirect_branch_type != indirect_branch_keep + && (flag_cf_protection & CF_RETURN)) + error ("%<-mindirect-branch%> and %<-fcf-protection%> are not " + "compatible"); + } + + if (cfun->machine->function_return_type == indirect_branch_unset) + { + tree attr = lookup_attribute ("function_return", + DECL_ATTRIBUTES (fndecl)); + if (attr != NULL) + { + tree args = TREE_VALUE (attr); + if (args == NULL) + gcc_unreachable (); + tree cst = TREE_VALUE (args); + if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0) + cfun->machine->function_return_type = indirect_branch_keep; + else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0) + cfun->machine->function_return_type = indirect_branch_thunk; + else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0) + cfun->machine->function_return_type = indirect_branch_thunk_inline; + else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0) + cfun->machine->function_return_type = indirect_branch_thunk_extern; + else + gcc_unreachable (); + } + else + cfun->machine->function_return_type = ix86_function_return; + + /* -mcmodel=large is not compatible with -mfunction-return=thunk + nor -mfunction-return=thunk-extern. */ + if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC) + && ((cfun->machine->function_return_type + == indirect_branch_thunk_extern) + || (cfun->machine->function_return_type + == indirect_branch_thunk))) + error ("%<-mfunction-return=%s%> and %<-mcmodel=large%> are not " + "compatible", + ((cfun->machine->function_return_type + == indirect_branch_thunk_extern) + ? "thunk-extern" : "thunk")); + + if (cfun->machine->function_return_type != indirect_branch_keep + && (flag_cf_protection & CF_RETURN)) + error ("%<-mfunction-return%> and %<-fcf-protection%> are not " + "compatible"); + } +} + +/* Establish appropriate back-end context for processing the function + FNDECL. The argument might be NULL to indicate processing at top + level, outside of any function scope. */ +void +ix86_set_current_function (tree fndecl) +{ + /* Only change the context if the function changes. This hook is called + several times in the course of compiling a function, and we don't want to + slow things down too much or call target_reinit when it isn't safe. */ + if (fndecl == ix86_previous_fndecl) + { + /* There may be 2 function bodies for the same function FNDECL, + one is extern inline and one isn't. Call ix86_set_func_type + to set the func_type field. 
*/ + if (fndecl != NULL_TREE) + { + ix86_set_func_type (fndecl); + ix86_set_indirect_branch_type (fndecl); + } + return; + } + + tree old_tree; + if (ix86_previous_fndecl == NULL_TREE) + old_tree = target_option_current_node; + else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)) + old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl); + else + old_tree = target_option_default_node; + + if (fndecl == NULL_TREE) + { + if (old_tree != target_option_current_node) + ix86_reset_previous_fndecl (); + return; + } + + ix86_set_func_type (fndecl); + ix86_set_indirect_branch_type (fndecl); + + tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl); + if (new_tree == NULL_TREE) + new_tree = target_option_default_node; + + if (old_tree != new_tree) + { + cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree)); + if (TREE_TARGET_GLOBALS (new_tree)) + restore_target_globals (TREE_TARGET_GLOBALS (new_tree)); + else if (new_tree == target_option_default_node) + restore_target_globals (&default_target_globals); + else + TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts (); + } + ix86_previous_fndecl = fndecl; + + static bool prev_no_caller_saved_registers; + + /* 64-bit MS and SYSV ABI have different set of call used registers. + Avoid expensive re-initialization of init_regs each time we switch + function context. */ + if (TARGET_64BIT + && (call_used_regs[SI_REG] + == (cfun->machine->call_abi == MS_ABI))) + reinit_regs (); + /* Need to re-initialize init_regs if caller-saved registers are + changed. */ + else if (prev_no_caller_saved_registers + != cfun->machine->no_caller_saved_registers) + reinit_regs (); + + if (cfun->machine->func_type != TYPE_NORMAL + || cfun->machine->no_caller_saved_registers) + { + /* Don't allow SSE, MMX nor x87 instructions since they + may change processor state. */ + const char *isa; + if (TARGET_SSE) + isa = "SSE"; + else if (TARGET_MMX) + isa = "MMX/3Dnow"; + else if (TARGET_80387) + isa = "80387"; + else + isa = NULL; + if (isa != NULL) + { + if (cfun->machine->func_type != TYPE_NORMAL) + sorry (cfun->machine->func_type == TYPE_EXCEPTION + ? G_("%s instructions aren%'t allowed in an" + " exception service routine") + : G_("%s instructions aren%'t allowed in an" + " interrupt service routine"), + isa); + else + sorry ("%s instructions aren%'t allowed in a function with " + "the %<no_caller_saved_registers%> attribute", isa); + /* Don't issue the same error twice. */ + cfun->machine->func_type = TYPE_NORMAL; + cfun->machine->no_caller_saved_registers = false; + } + } + + prev_no_caller_saved_registers + = cfun->machine->no_caller_saved_registers; +} + +/* Implement the TARGET_OFFLOAD_OPTIONS hook. */ +char * +ix86_offload_options (void) +{ + if (TARGET_LP64) + return xstrdup ("-foffload-abi=lp64"); + return xstrdup ("-foffload-abi=ilp32"); +} + +/* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall", + and "sseregparm" calling convention attributes; + arguments as in struct attribute_spec.handler. */ + +static tree +ix86_handle_cconv_attribute (tree *node, tree name, tree args, int, + bool *no_add_attrs) +{ + if (TREE_CODE (*node) != FUNCTION_TYPE + && TREE_CODE (*node) != METHOD_TYPE + && TREE_CODE (*node) != FIELD_DECL + && TREE_CODE (*node) != TYPE_DECL) + { + warning (OPT_Wattributes, "%qE attribute only applies to functions", + name); + *no_add_attrs = true; + return NULL_TREE; + } + + /* Can combine regparm with all attributes but fastcall, and thiscall. 
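The sorry() calls above mean that an interrupt or exception handler (and any function marked no_caller_saved_registers) has to be compiled with SSE, MMX and x87 disabled, for instance via -mgeneral-regs-only. Illustrative prototypes matching the shape checked by ix86_set_func_type and by the interrupt attribute handler later in this file (struct interrupt_frame stands in for the type user code normally supplies; the error-code argument is word-mode, so its exact integer type is target-dependent):

   struct interrupt_frame;

   /* One pointer argument -> TYPE_INTERRUPT.  */
   __attribute__ ((interrupt))
   void timer_isr (struct interrupt_frame *frame);

   /* Pointer plus word-mode error code -> TYPE_EXCEPTION (64-bit shown).  */
   __attribute__ ((interrupt))
   void page_fault_isr (struct interrupt_frame *frame, unsigned long int error_code);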
*/ + if (is_attribute_p ("regparm", name)) + { + tree cst; + + if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and regparm attributes are not compatible"); + } + + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) + { + error ("regparam and thiscall attributes are not compatible"); + } + + cst = TREE_VALUE (args); + if (TREE_CODE (cst) != INTEGER_CST) + { + warning (OPT_Wattributes, + "%qE attribute requires an integer constant argument", + name); + *no_add_attrs = true; + } + else if (compare_tree_int (cst, REGPARM_MAX) > 0) + { + warning (OPT_Wattributes, "argument to %qE attribute larger than %d", + name, REGPARM_MAX); + *no_add_attrs = true; + } + + return NULL_TREE; + } + + if (TARGET_64BIT) + { + /* Do not warn when emulating the MS ABI. */ + if ((TREE_CODE (*node) != FUNCTION_TYPE + && TREE_CODE (*node) != METHOD_TYPE) + || ix86_function_type_abi (*node) != MS_ABI) + warning (OPT_Wattributes, "%qE attribute ignored", + name); + *no_add_attrs = true; + return NULL_TREE; + } + + /* Can combine fastcall with stdcall (redundant) and sseregparm. */ + if (is_attribute_p ("fastcall", name)) + { + if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and cdecl attributes are not compatible"); + } + if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and stdcall attributes are not compatible"); + } + if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and regparm attributes are not compatible"); + } + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and thiscall attributes are not compatible"); + } + } + + /* Can combine stdcall with fastcall (redundant), regparm and + sseregparm. */ + else if (is_attribute_p ("stdcall", name)) + { + if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) + { + error ("stdcall and cdecl attributes are not compatible"); + } + if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) + { + error ("stdcall and fastcall attributes are not compatible"); + } + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) + { + error ("stdcall and thiscall attributes are not compatible"); + } + } + + /* Can combine cdecl with regparm and sseregparm. */ + else if (is_attribute_p ("cdecl", name)) + { + if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) + { + error ("stdcall and cdecl attributes are not compatible"); + } + if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and cdecl attributes are not compatible"); + } + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) + { + error ("cdecl and thiscall attributes are not compatible"); + } + } + else if (is_attribute_p ("thiscall", name)) + { + if (TREE_CODE (*node) != METHOD_TYPE && pedantic) + warning (OPT_Wattributes, "%qE attribute is used for non-class method", + name); + if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) + { + error ("stdcall and thiscall attributes are not compatible"); + } + if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and thiscall attributes are not compatible"); + } + if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) + { + error ("cdecl and thiscall attributes are not compatible"); + } + } + + /* Can combine sseregparm with all attributes. 
*/ + + return NULL_TREE; +} + +#ifndef CHECK_STACK_LIMIT +#define CHECK_STACK_LIMIT (-1) +#endif + +/* The transactional memory builtins are implicitly regparm or fastcall + depending on the ABI. Override the generic do-nothing attribute that + these builtins were declared with, and replace it with one of the two + attributes that we expect elsewhere. */ + +static tree +ix86_handle_tm_regparm_attribute (tree *node, tree, tree, + int flags, bool *no_add_attrs) +{ + tree alt; + + /* In no case do we want to add the placeholder attribute. */ + *no_add_attrs = true; + + /* The 64-bit ABI is unchanged for transactional memory. */ + if (TARGET_64BIT) + return NULL_TREE; + + /* ??? Is there a better way to validate 32-bit windows? We have + cfun->machine->call_abi, but that seems to be set only for 64-bit. */ + if (CHECK_STACK_LIMIT > 0) + alt = tree_cons (get_identifier ("fastcall"), NULL, NULL); + else + { + alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL); + alt = tree_cons (get_identifier ("regparm"), alt, NULL); + } + decl_attributes (node, alt, flags); + + return NULL_TREE; +} + +/* Handle a "force_align_arg_pointer" attribute. */ + +static tree +ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name, + tree, int, bool *no_add_attrs) +{ + if (TREE_CODE (*node) != FUNCTION_TYPE + && TREE_CODE (*node) != METHOD_TYPE + && TREE_CODE (*node) != FIELD_DECL + && TREE_CODE (*node) != TYPE_DECL) + { + warning (OPT_Wattributes, "%qE attribute only applies to functions", + name); + *no_add_attrs = true; + } + + return NULL_TREE; +} + +/* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in + struct attribute_spec.handler. */ + +static tree +ix86_handle_struct_attribute (tree *node, tree name, tree, int, + bool *no_add_attrs) +{ + tree *type = NULL; + if (DECL_P (*node)) + { + if (TREE_CODE (*node) == TYPE_DECL) + type = &TREE_TYPE (*node); + } + else + type = node; + + if (!(type && RECORD_OR_UNION_TYPE_P (*type))) + { + warning (OPT_Wattributes, "%qE attribute ignored", + name); + *no_add_attrs = true; + } + + else if ((is_attribute_p ("ms_struct", name) + && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type))) + || ((is_attribute_p ("gcc_struct", name) + && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type))))) + { + warning (OPT_Wattributes, "%qE incompatible attribute ignored", + name); + *no_add_attrs = true; + } + + return NULL_TREE; +} + +/* Handle a "callee_pop_aggregate_return" attribute; arguments as + in struct attribute_spec handler. 
*/ + +static tree +ix86_handle_callee_pop_aggregate_return (tree *node, tree name, tree args, int, + bool *no_add_attrs) +{ + if (TREE_CODE (*node) != FUNCTION_TYPE + && TREE_CODE (*node) != METHOD_TYPE + && TREE_CODE (*node) != FIELD_DECL + && TREE_CODE (*node) != TYPE_DECL) + { + warning (OPT_Wattributes, "%qE attribute only applies to functions", + name); + *no_add_attrs = true; + return NULL_TREE; + } + if (TARGET_64BIT) + { + warning (OPT_Wattributes, "%qE attribute only available for 32-bit", + name); + *no_add_attrs = true; + return NULL_TREE; + } + if (is_attribute_p ("callee_pop_aggregate_return", name)) + { + tree cst; + + cst = TREE_VALUE (args); + if (TREE_CODE (cst) != INTEGER_CST) + { + warning (OPT_Wattributes, + "%qE attribute requires an integer constant argument", + name); + *no_add_attrs = true; + } + else if (compare_tree_int (cst, 0) != 0 + && compare_tree_int (cst, 1) != 0) + { + warning (OPT_Wattributes, + "argument to %qE attribute is neither zero, nor one", + name); + *no_add_attrs = true; + } + + return NULL_TREE; + } + + return NULL_TREE; +} + +/* Handle a "ms_abi" or "sysv" attribute; arguments as in + struct attribute_spec.handler. */ + +static tree +ix86_handle_abi_attribute (tree *node, tree name, tree, int, + bool *no_add_attrs) +{ + if (TREE_CODE (*node) != FUNCTION_TYPE + && TREE_CODE (*node) != METHOD_TYPE + && TREE_CODE (*node) != FIELD_DECL + && TREE_CODE (*node) != TYPE_DECL) + { + warning (OPT_Wattributes, "%qE attribute only applies to functions", + name); + *no_add_attrs = true; + return NULL_TREE; + } + + /* Can combine regparm with all attributes but fastcall. */ + if (is_attribute_p ("ms_abi", name)) + { + if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node))) + { + error ("ms_abi and sysv_abi attributes are not compatible"); + } + + return NULL_TREE; + } + else if (is_attribute_p ("sysv_abi", name)) + { + if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node))) + { + error ("ms_abi and sysv_abi attributes are not compatible"); + } + + return NULL_TREE; + } + + return NULL_TREE; +} + +static tree +ix86_handle_fndecl_attribute (tree *node, tree name, tree args, int, + bool *no_add_attrs) +{ + if (TREE_CODE (*node) != FUNCTION_DECL) + { + warning (OPT_Wattributes, "%qE attribute only applies to functions", + name); + *no_add_attrs = true; + } + + if (is_attribute_p ("indirect_branch", name)) + { + tree cst = TREE_VALUE (args); + if (TREE_CODE (cst) != STRING_CST) + { + warning (OPT_Wattributes, + "%qE attribute requires a string constant argument", + name); + *no_add_attrs = true; + } + else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0 + && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0 + && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0 + && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0) + { + warning (OPT_Wattributes, + "argument to %qE attribute is not " + "(keep|thunk|thunk-inline|thunk-extern)", name); + *no_add_attrs = true; + } + } + + if (is_attribute_p ("function_return", name)) + { + tree cst = TREE_VALUE (args); + if (TREE_CODE (cst) != STRING_CST) + { + warning (OPT_Wattributes, + "%qE attribute requires a string constant argument", + name); + *no_add_attrs = true; + } + else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0 + && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0 + && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0 + && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0) + { + warning (OPT_Wattributes, + "argument to %qE attribute is not " + 
"(keep|thunk|thunk-inline|thunk-extern)", name); + *no_add_attrs = true; + } + } + + return NULL_TREE; +} + +static tree +ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree, + int, bool *) +{ + return NULL_TREE; +} + +static tree +ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *) +{ + /* DECL_RESULT and DECL_ARGUMENTS do not exist there yet, + but the function type contains args and return type data. */ + tree func_type = *node; + tree return_type = TREE_TYPE (func_type); + + int nargs = 0; + tree current_arg_type = TYPE_ARG_TYPES (func_type); + while (current_arg_type + && ! VOID_TYPE_P (TREE_VALUE (current_arg_type))) + { + if (nargs == 0) + { + if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type))) + error ("interrupt service routine should have a pointer " + "as the first argument"); + } + else if (nargs == 1) + { + if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE + || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode) + error ("interrupt service routine should have %qs " + "as the second argument", + TARGET_64BIT + ? (TARGET_X32 ? "unsigned long long int" + : "unsigned long int") + : "unsigned int"); + } + nargs++; + current_arg_type = TREE_CHAIN (current_arg_type); + } + if (!nargs || nargs > 2) + error ("interrupt service routine can only have a pointer argument " + "and an optional integer argument"); + if (! VOID_TYPE_P (return_type)) + error ("interrupt service routine can%'t have non-void return value"); + + return NULL_TREE; +} + +/* Handle fentry_name / fentry_section attribute. */ + +static tree +ix86_handle_fentry_name (tree *node, tree name, tree args, + int, bool *no_add_attrs) +{ + if (TREE_CODE (*node) == FUNCTION_DECL + && TREE_CODE (TREE_VALUE (args)) == STRING_CST) + /* Do nothing else, just set the attribute. We'll get at + it later with lookup_attribute. */ + ; + else + { + warning (OPT_Wattributes, "%qE attribute ignored", name); + *no_add_attrs = true; + } + + return NULL_TREE; +} + +/* Table of valid machine attributes. */ +const struct attribute_spec ix86_attribute_table[] = +{ + /* { name, min_len, max_len, decl_req, type_req, fn_type_req, + affects_type_identity, handler, exclude } */ + /* Stdcall attribute says callee is responsible for popping arguments + if they are not variable. */ + { "stdcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, + NULL }, + /* Fastcall attribute says callee is responsible for popping arguments + if they are not variable. */ + { "fastcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, + NULL }, + /* Thiscall attribute says callee is responsible for popping arguments + if they are not variable. */ + { "thiscall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, + NULL }, + /* Cdecl attribute says the callee is a normal C declaration */ + { "cdecl", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, + NULL }, + /* Regparm attribute specifies how many integer arguments are to be + passed in registers. */ + { "regparm", 1, 1, false, true, true, true, ix86_handle_cconv_attribute, + NULL }, + /* Sseregparm attribute says we are using x86_64 calling conventions + for FP arguments. */ + { "sseregparm", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, + NULL }, + /* The transactional memory builtins are implicitly regparm or fastcall + depending on the ABI. Override the generic do-nothing attribute that + these builtins were declared with. 
*/ + { "*tm regparm", 0, 0, false, true, true, true, + ix86_handle_tm_regparm_attribute, NULL }, + /* force_align_arg_pointer says this function realigns the stack at entry. */ + { "force_align_arg_pointer", 0, 0, + false, true, true, false, ix86_handle_force_align_arg_pointer_attribute, + NULL }, +#if TARGET_DLLIMPORT_DECL_ATTRIBUTES + { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute, + NULL }, + { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute, + NULL }, + { "shared", 0, 0, true, false, false, false, + ix86_handle_shared_attribute, NULL }, +#endif + { "ms_struct", 0, 0, false, false, false, false, + ix86_handle_struct_attribute, NULL }, + { "gcc_struct", 0, 0, false, false, false, false, + ix86_handle_struct_attribute, NULL }, +#ifdef SUBTARGET_ATTRIBUTE_TABLE + SUBTARGET_ATTRIBUTE_TABLE, +#endif + /* ms_abi and sysv_abi calling convention function attributes. */ + { "ms_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, NULL }, + { "sysv_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, + NULL }, + { "ms_abi va_list", 0, 0, false, false, false, false, NULL, NULL }, + { "sysv_abi va_list", 0, 0, false, false, false, false, NULL, NULL }, + { "ms_hook_prologue", 0, 0, true, false, false, false, + ix86_handle_fndecl_attribute, NULL }, + { "callee_pop_aggregate_return", 1, 1, false, true, true, true, + ix86_handle_callee_pop_aggregate_return, NULL }, + { "interrupt", 0, 0, false, true, true, false, + ix86_handle_interrupt_attribute, NULL }, + { "no_caller_saved_registers", 0, 0, false, true, true, false, + ix86_handle_no_caller_saved_registers_attribute, NULL }, + { "naked", 0, 0, true, false, false, false, + ix86_handle_fndecl_attribute, NULL }, + { "indirect_branch", 1, 1, true, false, false, false, + ix86_handle_fndecl_attribute, NULL }, + { "function_return", 1, 1, true, false, false, false, + ix86_handle_fndecl_attribute, NULL }, + { "indirect_return", 0, 0, false, true, true, false, + NULL, NULL }, + { "fentry_name", 1, 1, true, false, false, false, + ix86_handle_fentry_name, NULL }, + { "fentry_section", 1, 1, true, false, false, false, + ix86_handle_fentry_name, NULL }, + { "cf_check", 0, 0, true, false, false, false, + ix86_handle_fndecl_attribute, NULL }, + + /* End element. */ + { NULL, 0, 0, false, false, false, false, NULL, NULL } +}; + +#include "gt-i386-options.h" diff --git a/gcc/config/i386/i386-options.h b/gcc/config/i386/i386-options.h new file mode 100644 index 00000000000..817ddda5c22 --- /dev/null +++ b/gcc/config/i386/i386-options.h @@ -0,0 +1,95 @@ +/* Copyright (C) 1988-2019 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. 
*/ + +#ifndef GCC_I386_OPTIONS_H +#define GCC_I386_OPTIONS_H + +char *ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2, + int flags, int flags2, + const char *arch, const char *tune, + enum fpmath_unit fpmath, bool add_nl_p, + bool add_abi_p); + +extern enum attr_cpu ix86_schedule; + +extern enum processor_type ix86_tune; +extern enum processor_type ix86_arch; +extern unsigned char x86_prefetch_sse; +extern const struct processor_costs *ix86_tune_cost; + +extern int ix86_tune_defaulted; +extern int ix86_arch_specified; + +extern unsigned int ix86_default_incoming_stack_boundary; +extern HOST_WIDE_INT deferred_isa_values; +extern HOST_WIDE_INT deferred_isa_values2; + +extern unsigned int ix86_preferred_stack_boundary; +extern unsigned int ix86_user_incoming_stack_boundary; +extern unsigned int ix86_default_incoming_stack_boundary; +extern unsigned int ix86_incoming_stack_boundary; + +extern char *ix86_offload_options (void); +extern void ix86_option_override (void); +extern void ix86_override_options_after_change (void); +void ix86_set_current_function (tree fndecl); +bool ix86_function_naked (const_tree fn); +void ix86_simd_clone_adjust (struct cgraph_node *node); + +extern tree (*ix86_veclib_handler) (combined_fn, tree, tree); +extern tree ix86_veclibabi_svml (combined_fn, tree, tree); +extern tree ix86_veclibabi_acml (combined_fn, tree, tree); + +extern rtx (*ix86_gen_leave) (void); +extern rtx (*ix86_gen_add3) (rtx, rtx, rtx); +extern rtx (*ix86_gen_sub3) (rtx, rtx, rtx); +extern rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx); +extern rtx (*ix86_gen_one_cmpl2) (rtx, rtx); +extern rtx (*ix86_gen_monitor) (rtx, rtx, rtx); +extern rtx (*ix86_gen_monitorx) (rtx, rtx, rtx); +extern rtx (*ix86_gen_clzero) (rtx); +extern rtx (*ix86_gen_andsp) (rtx, rtx, rtx); +extern rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx); +extern rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx); +extern rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx); +extern rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx); +extern rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx); + +enum ix86_function_specific_strings +{ + IX86_FUNCTION_SPECIFIC_ARCH, + IX86_FUNCTION_SPECIFIC_TUNE, + IX86_FUNCTION_SPECIFIC_MAX +}; + +extern const char *stringop_alg_names[]; + +void ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2); +void ix86_function_specific_save (struct cl_target_option *, + struct gcc_options *opts); +void ix86_function_specific_restore (struct gcc_options *opts, + struct cl_target_option *); +void ix86_function_specific_post_stream_in (struct cl_target_option *); +void ix86_function_specific_print (FILE *, int, + struct cl_target_option *); +bool ix86_valid_target_attribute_p (tree, tree, tree, int); + +extern const struct attribute_spec ix86_attribute_table[]; + + +#endif /* GCC_I386_OPTIONS_H */ diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index bc2348c3bc7..2f23986a9dd 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -91,19 +91,17 @@ along with GCC; see the file COPYING3. If not see #include "tree-vector-builder.h" #include "debug.h" #include "dwarf2out.h" +#include "i386-options.h" +#include "i386-builtins.h" +#include "i386-expand.h" +#include "i386-features.h" /* This file should be included last. 
*/ #include "target-def.h" -#include "x86-tune-costs.h" - static rtx legitimize_dllimport_symbol (rtx, bool); static rtx legitimize_pe_coff_extern_decl (rtx, bool); -static rtx legitimize_pe_coff_symbol (rtx, bool); static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool); -static bool ix86_save_reg (unsigned int, bool, bool); -static bool ix86_function_naked (const_tree); -static bool ix86_notrack_prefixed_insn_p (rtx); static void ix86_emit_restore_reg_using_pop (rtx); @@ -126,102 +124,6 @@ const struct processor_costs *ix86_tune_cost = NULL; /* Set by -mtune or -Os. */ const struct processor_costs *ix86_cost = NULL; -/* Processor feature/optimization bitmasks. */ -#define m_386 (HOST_WIDE_INT_1U<machine->call_ms2sysv_extra_regs and - 3.) rather or not stack alignment is being performed. */ - static rtx get_stub_rtx (enum xlogue_stub stub); - - /* Returns the amount of stack space (including padding) that the stub - needs to store registers based upon data in the machine_function. */ - HOST_WIDE_INT get_stack_space_used () const - { - const struct machine_function *m = cfun->machine; - unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1; - - gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS); - return m_regs[last_reg].offset + STUB_INDEX_OFFSET; - } - - /* Returns the offset for the base pointer used by the stub. */ - HOST_WIDE_INT get_stub_ptr_offset () const - { - return STUB_INDEX_OFFSET + m_stack_align_off_in; - } - - static const struct xlogue_layout &get_instance (); - static unsigned count_stub_managed_regs (); - static bool is_stub_managed_reg (unsigned regno, unsigned count); - - static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70; - static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS; - static const unsigned MAX_REGS = 18; - static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS; - static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1; - static const unsigned STUB_NAME_MAX_LEN = 20; - static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT]; - static const unsigned REG_ORDER[MAX_REGS]; - static const unsigned REG_ORDER_REALIGN[MAX_REGS]; - -private: - xlogue_layout (); - xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp); - xlogue_layout (const xlogue_layout &); - - /* True if hard frame pointer is used. */ - bool m_hfp; - - /* Max number of register this layout manages. */ - unsigned m_nregs; - - /* Incoming offset from 16-byte alignment. */ - HOST_WIDE_INT m_stack_align_off_in; - - /* Register order and offsets. */ - struct reginfo m_regs[MAX_REGS]; - - /* Lazy-inited cache of symbol names for stubs. */ - static char s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT] - [STUB_NAME_MAX_LEN]; - - static const xlogue_layout s_instances[XLOGUE_SET_COUNT]; -}; - -const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = { - "savms64", - "resms64", - "resms64x", - "savms64f", - "resms64f", - "resms64fx" -}; - -const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = { -/* The below offset values are where each register is stored for the layout - relative to incoming stack pointer. The value of each m_regs[].offset will - be relative to the incoming base pointer (rax or rsi) used by the stub. 
- - s_instances: 0 1 2 3 - Offset: realigned or aligned + 8 - Register aligned aligned + 8 aligned w/HFP w/HFP */ - XMM15_REG, /* 0x10 0x18 0x10 0x18 */ - XMM14_REG, /* 0x20 0x28 0x20 0x28 */ - XMM13_REG, /* 0x30 0x38 0x30 0x38 */ - XMM12_REG, /* 0x40 0x48 0x40 0x48 */ - XMM11_REG, /* 0x50 0x58 0x50 0x58 */ - XMM10_REG, /* 0x60 0x68 0x60 0x68 */ - XMM9_REG, /* 0x70 0x78 0x70 0x78 */ - XMM8_REG, /* 0x80 0x88 0x80 0x88 */ - XMM7_REG, /* 0x90 0x98 0x90 0x98 */ - XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */ - SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */ - DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */ - BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */ - BP_REG, /* 0xc0 0xc8 N/A N/A */ - R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */ - R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */ - R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */ - R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */ -}; - -/* Instantiate static const values. */ -const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET; -const unsigned xlogue_layout::MIN_REGS; -const unsigned xlogue_layout::MAX_REGS; -const unsigned xlogue_layout::MAX_EXTRA_REGS; -const unsigned xlogue_layout::VARIANT_COUNT; -const unsigned xlogue_layout::STUB_NAME_MAX_LEN; - -/* Initialize xlogue_layout::s_stub_names to zero. */ -char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT] - [STUB_NAME_MAX_LEN]; - -/* Instantiates all xlogue_layout instances. */ -const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = { - xlogue_layout (0, false), - xlogue_layout (8, false), - xlogue_layout (0, true), - xlogue_layout (8, true) -}; - -/* Return an appropriate const instance of xlogue_layout based upon values - in cfun->machine and crtl. */ -const struct xlogue_layout & -xlogue_layout::get_instance () -{ - enum xlogue_stub_sets stub_set; - bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in; - - if (stack_realign_fp) - stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN; - else if (frame_pointer_needed) - stub_set = aligned_plus_8 - ? XLOGUE_SET_HFP_ALIGNED_PLUS_8 - : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN; - else - stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED; - - return s_instances[stub_set]; -} - -/* Determine how many clobbered registers can be saved by the stub. - Returns the count of registers the stub will save and restore. */ -unsigned -xlogue_layout::count_stub_managed_regs () -{ - bool hfp = frame_pointer_needed || stack_realign_fp; - unsigned i, count; - unsigned regno; - - for (count = i = MIN_REGS; i < MAX_REGS; ++i) - { - regno = REG_ORDER[i]; - if (regno == BP_REG && hfp) - continue; - if (!ix86_save_reg (regno, false, false)) - break; - ++count; - } - return count; -} - -/* Determine if register REGNO is a stub managed register given the - total COUNT of stub managed registers. */ -bool -xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count) -{ - bool hfp = frame_pointer_needed || stack_realign_fp; - unsigned i; - - for (i = 0; i < count; ++i) - { - gcc_assert (i < MAX_REGS); - if (REG_ORDER[i] == BP_REG && hfp) - ++count; - else if (REG_ORDER[i] == regno) - return true; - } - return false; -} - -/* Constructor for xlogue_layout. */ -xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp) - : m_hfp (hfp) , m_nregs (hfp ? 17 : 18), - m_stack_align_off_in (stack_align_off_in) -{ - HOST_WIDE_INT offset = stack_align_off_in; - unsigned i, j; - - for (i = j = 0; i < MAX_REGS; ++i) - { - unsigned regno = REG_ORDER[i]; - - if (regno == BP_REG && hfp) - continue; - if (SSE_REGNO_P (regno)) - { - offset += 16; - /* Verify that SSE regs are always aligned. 
*/ - gcc_assert (!((stack_align_off_in + offset) & 15)); - } - else - offset += 8; - - m_regs[j].regno = regno; - m_regs[j++].offset = offset - STUB_INDEX_OFFSET; - } - gcc_assert (j == m_nregs); -} - -const char * -xlogue_layout::get_stub_name (enum xlogue_stub stub, - unsigned n_extra_regs) -{ - const int have_avx = TARGET_AVX; - char *name = s_stub_names[!!have_avx][stub][n_extra_regs]; - - /* Lazy init */ - if (!*name) - { - int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u", - (have_avx ? "avx" : "sse"), - STUB_BASE_NAMES[stub], - MIN_REGS + n_extra_regs); - gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN); - } - - return name; -} - -/* Return rtx of a symbol ref for the entry point (based upon - cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */ -rtx -xlogue_layout::get_stub_rtx (enum xlogue_stub stub) -{ - const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs; - gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS); - gcc_assert (stub < XLOGUE_STUB_COUNT); - gcc_assert (crtl->stack_realign_finalized); - - return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs)); -} - /* Define the structure for the machine field in struct function. */ struct GTY(()) stack_local_entry { @@ -741,41 +349,37 @@ enum processor_type ix86_arch; /* True if processor has SSE prefetch instruction. */ unsigned char x86_prefetch_sse; -/* -mstackrealign option */ -static const char ix86_force_align_arg_pointer_string[] - = "force_align_arg_pointer"; - -static rtx (*ix86_gen_leave) (void); -static rtx (*ix86_gen_add3) (rtx, rtx, rtx); -static rtx (*ix86_gen_sub3) (rtx, rtx, rtx); -static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx); -static rtx (*ix86_gen_one_cmpl2) (rtx, rtx); -static rtx (*ix86_gen_monitor) (rtx, rtx, rtx); -static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx); -static rtx (*ix86_gen_clzero) (rtx); -static rtx (*ix86_gen_andsp) (rtx, rtx, rtx); -static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx); -static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx); -static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx); -static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx); -static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx); +rtx (*ix86_gen_leave) (void); +rtx (*ix86_gen_add3) (rtx, rtx, rtx); +rtx (*ix86_gen_sub3) (rtx, rtx, rtx); +rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx); +rtx (*ix86_gen_one_cmpl2) (rtx, rtx); +rtx (*ix86_gen_monitor) (rtx, rtx, rtx); +rtx (*ix86_gen_monitorx) (rtx, rtx, rtx); +rtx (*ix86_gen_clzero) (rtx); +rtx (*ix86_gen_andsp) (rtx, rtx, rtx); +rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx); +rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx); +rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx); +rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx); +rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx); /* Preferred alignment for stack boundary in bits. */ unsigned int ix86_preferred_stack_boundary; /* Alignment for incoming stack boundary in bits specified at command line. */ -static unsigned int ix86_user_incoming_stack_boundary; +unsigned int ix86_user_incoming_stack_boundary; /* Default alignment for incoming stack boundary in bits. */ -static unsigned int ix86_default_incoming_stack_boundary; +unsigned int ix86_default_incoming_stack_boundary; /* Alignment for incoming stack boundary in bits. */ unsigned int ix86_incoming_stack_boundary; /* Calling abi specific va_list type nodes. 
*/ -static GTY(()) tree sysv_va_list_type_node; -static GTY(()) tree ms_va_list_type_node; +tree sysv_va_list_type_node; +tree ms_va_list_type_node; /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */ char internal_label_prefix[16]; @@ -813,7 +417,6 @@ static REAL_VALUE_TYPE ext_80387_constants_table [5]; static bool ext_80387_constants_init; -static struct machine_function * ix86_init_machine_status (void); static rtx ix86_function_value (const_tree, const_tree, bool); static bool ix86_function_value_regno_p (const unsigned int); static unsigned int ix86_function_arg_boundary (machine_mode, @@ -821,49115 +424,20697 @@ static unsigned int ix86_function_arg_boundary (machine_mode, static rtx ix86_static_chain (const_tree, bool); static int ix86_function_regparm (const_tree, const_tree); static void ix86_compute_frame_layout (void); -static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode, - rtx, rtx, int); -static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT); static tree ix86_canonical_va_list_type (tree); -static void predict_jump (int); static unsigned int split_stack_prologue_scratch_regno (void); static bool i386_asm_output_addr_const_extra (FILE *, rtx); -enum ix86_function_specific_strings -{ - IX86_FUNCTION_SPECIFIC_ARCH, - IX86_FUNCTION_SPECIFIC_TUNE, - IX86_FUNCTION_SPECIFIC_MAX -}; - -static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int, - const char *, const char *, enum fpmath_unit, - bool, bool); -static void ix86_function_specific_save (struct cl_target_option *, - struct gcc_options *opts); -static void ix86_function_specific_restore (struct gcc_options *opts, - struct cl_target_option *); -static void ix86_function_specific_post_stream_in (struct cl_target_option *); -static void ix86_function_specific_print (FILE *, int, - struct cl_target_option *); -static bool ix86_valid_target_attribute_p (tree, tree, tree, int); -static bool ix86_valid_target_attribute_inner_p (tree, tree, char *[], - struct gcc_options *, - struct gcc_options *, - struct gcc_options *, - bool); static bool ix86_can_inline_p (tree, tree); -static void ix86_set_current_function (tree); static unsigned int ix86_minimum_incoming_stack_boundary (bool); -static enum calling_abi ix86_function_abi (const_tree); - -#ifndef SUBTARGET32_DEFAULT_CPU -#define SUBTARGET32_DEFAULT_CPU "i386" -#endif - /* Whether -mtune= or -march= were specified */ -static int ix86_tune_defaulted; -static int ix86_arch_specified; - -/* Vectorization library interface and handlers. */ -static tree (*ix86_veclib_handler) (combined_fn, tree, tree); - -static tree ix86_veclibabi_svml (combined_fn, tree, tree); -static tree ix86_veclibabi_acml (combined_fn, tree, tree); - -/* This table must be in sync with enum processor_type in i386.h. */ -static const struct processor_costs *processor_cost_table[] = -{ - &generic_cost, - &i386_cost, - &i486_cost, - &pentium_cost, - &lakemont_cost, - &pentiumpro_cost, - &pentium4_cost, - &nocona_cost, - &core_cost, - &core_cost, - &core_cost, - &core_cost, - &atom_cost, - &slm_cost, - &slm_cost, - &slm_cost, - &slm_cost, - &slm_cost, - &slm_cost, - &skylake_cost, - &skylake_cost, - &skylake_cost, - &skylake_cost, - &skylake_cost, - &skylake_cost, - &intel_cost, - &geode_cost, - &k6_cost, - &athlon_cost, - &k8_cost, - &amdfam10_cost, - &bdver_cost, - &bdver_cost, - &bdver_cost, - &bdver_cost, - &btver1_cost, - &btver2_cost, - &znver1_cost, - &znver2_cost -}; - -/* Guarantee that the array is aligned with enum processor_type. 
*/ -STATIC_ASSERT (ARRAY_SIZE (processor_cost_table) == PROCESSOR_max); +int ix86_tune_defaulted; +int ix86_arch_specified; -static unsigned int -rest_of_handle_insert_vzeroupper (void) -{ - int i; - - /* vzeroupper instructions are inserted immediately after reload to - account for possible spills from 256bit or 512bit registers. The pass - reuses mode switching infrastructure by re-running mode insertion - pass, so disable entities that have already been processed. */ - for (i = 0; i < MAX_386_ENTITIES; i++) - ix86_optimize_mode_switching[i] = 0; +/* Return true if a red-zone is in use. We can't use red-zone when + there are local indirect jumps, like "indirect_jump" or "tablejump", + which jumps to another place in the function, since "call" in the + indirect thunk pushes the return address onto stack, destroying + red-zone. - ix86_optimize_mode_switching[AVX_U128] = 1; + TODO: If we can reserve the first 2 WORDs, for PUSH and, another + for CALL, in red-zone, we can allow local indirect jumps with + indirect thunk. */ - /* Call optimize_mode_switching. */ - g->get_passes ()->execute_pass_mode_switching (); - return 0; +bool +ix86_using_red_zone (void) +{ + return (TARGET_RED_ZONE + && !TARGET_64BIT_MS_ABI + && (!cfun->machine->has_local_indirect_jump + || cfun->machine->indirect_branch_type == indirect_branch_keep)); } - -/* Return 1 if INSN uses or defines a hard register. - Hard register uses in a memory address are ignored. - Clobbers and flags definitions are ignored. */ - + +/* Return true, if profiling code should be emitted before + prologue. Otherwise it returns false. + Note: For x86 with "hotfix" it is sorried. */ static bool -has_non_address_hard_reg (rtx_insn *insn) +ix86_profile_before_prologue (void) { - df_ref ref; - FOR_EACH_INSN_DEF (ref, insn) - if (HARD_REGISTER_P (DF_REF_REAL_REG (ref)) - && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER) - && DF_REF_REGNO (ref) != FLAGS_REG) - return true; - - FOR_EACH_INSN_USE (ref, insn) - if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref))) - return true; - - return false; + return flag_fentry != 0; } -/* Check if comparison INSN may be transformed - into vector comparison. Currently we transform - zero checks only which look like: - - (set (reg:CCZ 17 flags) - (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4) - (subreg:SI (reg:DI x) 0)) - (const_int 0 [0]))) */ +/* Update register usage after having seen the compiler flags. */ -static bool -convertible_comparison_p (rtx_insn *insn) +static void +ix86_conditional_register_usage (void) { - if (!TARGET_SSE4_1) - return false; - - rtx def_set = single_set (insn); - - gcc_assert (def_set); + int i, c_mask; - rtx src = SET_SRC (def_set); - rtx dst = SET_DEST (def_set); + /* If there are no caller-saved registers, preserve all registers. + except fixed_regs and registers used for function return value + since aggregate_value_p checks call_used_regs[regno] on return + value. */ + if (cfun && cfun->machine->no_caller_saved_registers) + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (!fixed_regs[i] && !ix86_function_value_regno_p (i)) + call_used_regs[i] = 0; - gcc_assert (GET_CODE (src) == COMPARE); + /* For 32-bit targets, squash the REX registers. */ + if (! 
TARGET_64BIT) + { + for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++) + fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; + for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++) + fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; + for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++) + fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; + } - if (GET_CODE (dst) != REG - || REGNO (dst) != FLAGS_REG - || GET_MODE (dst) != CCZmode) - return false; + /* See the definition of CALL_USED_REGISTERS in i386.h. */ + c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI); + + CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]); - rtx op1 = XEXP (src, 0); - rtx op2 = XEXP (src, 1); + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + { + /* Set/reset conditionally defined registers from + CALL_USED_REGISTERS initializer. */ + if (call_used_regs[i] > 1) + call_used_regs[i] = !!(call_used_regs[i] & c_mask); - if (op2 != CONST0_RTX (GET_MODE (op2))) - return false; + /* Calculate registers of CLOBBERED_REGS register set + as call used registers from GENERAL_REGS register set. */ + if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i) + && call_used_regs[i]) + SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i); + } - if (GET_CODE (op1) != IOR) - return false; + /* If MMX is disabled, squash the registers. */ + if (! TARGET_MMX) + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i)) + fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; - op2 = XEXP (op1, 1); - op1 = XEXP (op1, 0); - - if (!SUBREG_P (op1) - || !SUBREG_P (op2) - || GET_MODE (op1) != SImode - || GET_MODE (op2) != SImode - || ((SUBREG_BYTE (op1) != 0 - || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode)) - && (SUBREG_BYTE (op2) != 0 - || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode)))) - return false; + /* If SSE is disabled, squash the registers. */ + if (! TARGET_SSE) + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i)) + fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; - op1 = SUBREG_REG (op1); - op2 = SUBREG_REG (op2); + /* If the FPU is disabled, squash the registers. */ + if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387)) + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i)) + fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; - if (op1 != op2 - || !REG_P (op1) - || GET_MODE (op1) != DImode) - return false; + /* If AVX512F is disabled, squash the registers. */ + if (! TARGET_AVX512F) + { + for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++) + fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; - return true; + for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++) + fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; + } } -/* The DImode version of scalar_to_vector_candidate_p. */ +/* Canonicalize a comparison from one we don't have to one we do have. */ -static bool -dimode_scalar_to_vector_candidate_p (rtx_insn *insn) +static void +ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1, + bool op0_preserve_value) { - rtx def_set = single_set (insn); - - if (!def_set) - return false; - - if (has_non_address_hard_reg (insn)) - return false; - - rtx src = SET_SRC (def_set); - rtx dst = SET_DEST (def_set); + /* The order of operands in x87 ficom compare is forced by combine in + simplify_comparison () function. 
Float operator is treated as RTX_OBJ + with a precedence over other operators and is always put in the first + place. Swap condition and operands to match ficom instruction. */ + if (!op0_preserve_value + && GET_CODE (*op0) == FLOAT && MEM_P (XEXP (*op0, 0)) && REG_P (*op1)) + { + enum rtx_code scode = swap_condition ((enum rtx_code) *code); - if (GET_CODE (src) == COMPARE) - return convertible_comparison_p (insn); + /* We are called only for compares that are split to SAHF instruction. + Ensure that we have setcc/jcc insn for the swapped condition. */ + if (ix86_fp_compare_code_to_integer (scode) != UNKNOWN) + { + std::swap (*op0, *op1); + *code = (int) scode; + } + } +} + + +/* Hook to determine if one function can safely inline another. */ - /* We are interested in DImode promotion only. */ - if ((GET_MODE (src) != DImode - && !CONST_INT_P (src)) - || GET_MODE (dst) != DImode) - return false; +static bool +ix86_can_inline_p (tree caller, tree callee) +{ + tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller); + tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee); - if (!REG_P (dst) && !MEM_P (dst)) - return false; + /* Changes of those flags can be tolerated for always inlines. Lets hope + user knows what he is doing. */ + const unsigned HOST_WIDE_INT always_inline_safe_mask + = (MASK_USE_8BIT_IDIV | MASK_ACCUMULATE_OUTGOING_ARGS + | MASK_NO_ALIGN_STRINGOPS | MASK_AVX256_SPLIT_UNALIGNED_LOAD + | MASK_AVX256_SPLIT_UNALIGNED_STORE | MASK_CLD + | MASK_NO_FANCY_MATH_387 | MASK_IEEE_FP | MASK_INLINE_ALL_STRINGOPS + | MASK_INLINE_STRINGOPS_DYNAMICALLY | MASK_RECIP | MASK_STACK_PROBE + | MASK_STV | MASK_TLS_DIRECT_SEG_REFS | MASK_VZEROUPPER + | MASK_NO_PUSH_ARGS | MASK_OMIT_LEAF_FRAME_POINTER); - switch (GET_CODE (src)) - { - case ASHIFTRT: - if (!TARGET_AVX512VL) - return false; - /* FALLTHRU */ - case ASHIFT: - case LSHIFTRT: - if (!CONST_INT_P (XEXP (src, 1)) - || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)) - return false; - break; + if (!callee_tree) + callee_tree = target_option_default_node; + if (!caller_tree) + caller_tree = target_option_default_node; + if (callee_tree == caller_tree) + return true; - case PLUS: - case MINUS: - case IOR: - case XOR: - case AND: - if (!REG_P (XEXP (src, 1)) - && !MEM_P (XEXP (src, 1)) - && !CONST_INT_P (XEXP (src, 1))) - return false; + struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree); + struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree); + bool ret = false; + bool always_inline + = (DECL_DISREGARD_INLINE_LIMITS (callee) + && lookup_attribute ("always_inline", + DECL_ATTRIBUTES (callee))); - if (GET_MODE (XEXP (src, 1)) != DImode - && !CONST_INT_P (XEXP (src, 1))) - return false; - break; + cgraph_node *callee_node = cgraph_node::get (callee); + /* Callee's isa options should be a subset of the caller's, i.e. a SSE4 + function can inline a SSE2 function but a SSE2 function can't inline + a SSE4 function. */ + if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags) + != callee_opts->x_ix86_isa_flags) + || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2) + != callee_opts->x_ix86_isa_flags2)) + ret = false; - case NEG: - case NOT: - break; + /* See if we have the same non-isa options. */ + else if ((!always_inline + && caller_opts->x_target_flags != callee_opts->x_target_flags) + || (caller_opts->x_target_flags & ~always_inline_safe_mask) + != (callee_opts->x_target_flags & ~always_inline_safe_mask)) + ret = false; - case REG: - return true; + /* See if arch, tune, etc. 
are the same. */ + else if (caller_opts->arch != callee_opts->arch) + ret = false; - case MEM: - case CONST_INT: - return REG_P (dst); + else if (!always_inline && caller_opts->tune != callee_opts->tune) + ret = false; - default: - return false; - } + else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath + /* If the calle doesn't use FP expressions differences in + ix86_fpmath can be ignored. We are called from FEs + for multi-versioning call optimization, so beware of + ipa_fn_summaries not available. */ + && (! ipa_fn_summaries + || ipa_fn_summaries->get (callee_node) == NULL + || ipa_fn_summaries->get (callee_node)->fp_expressions)) + ret = false; - if (!REG_P (XEXP (src, 0)) - && !MEM_P (XEXP (src, 0)) - && !CONST_INT_P (XEXP (src, 0)) - /* Check for andnot case. */ - && (GET_CODE (src) != AND - || GET_CODE (XEXP (src, 0)) != NOT - || !REG_P (XEXP (XEXP (src, 0), 0)))) - return false; + else if (!always_inline + && caller_opts->branch_cost != callee_opts->branch_cost) + ret = false; - if (GET_MODE (XEXP (src, 0)) != DImode - && !CONST_INT_P (XEXP (src, 0))) - return false; + else + ret = true; - return true; + return ret; } - -/* The TImode version of scalar_to_vector_candidate_p. */ + +/* Return true if this goes in large data/bss. */ static bool -timode_scalar_to_vector_candidate_p (rtx_insn *insn) +ix86_in_large_data_p (tree exp) { - rtx def_set = single_set (insn); - - if (!def_set) + if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC) return false; - if (has_non_address_hard_reg (insn)) + if (exp == NULL_TREE) return false; - rtx src = SET_SRC (def_set); - rtx dst = SET_DEST (def_set); - - /* Only TImode load and store are allowed. */ - if (GET_MODE (dst) != TImode) + /* Functions are never large data. */ + if (TREE_CODE (exp) == FUNCTION_DECL) return false; - if (MEM_P (dst)) - { - /* Check for store. Memory must be aligned or unaligned store - is optimal. Only support store from register, standard SSE - constant or CONST_WIDE_INT generated from piecewise store. - - ??? Verify performance impact before enabling CONST_INT for - __int128 store. */ - if (misaligned_operand (dst, TImode) - && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL) - return false; - - switch (GET_CODE (src)) - { - default: - return false; - - case REG: - case CONST_WIDE_INT: - return true; + /* Automatic variables are never large data. */ + if (VAR_P (exp) && !is_global_var (exp)) + return false; - case CONST_INT: - return standard_sse_constant_p (src, TImode); - } - } - else if (MEM_P (src)) + if (VAR_P (exp) && DECL_SECTION_NAME (exp)) { - /* Check for load. Memory must be aligned or unaligned load is - optimal. */ - return (REG_P (dst) - && (!misaligned_operand (src, TImode) - || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)); + const char *section = DECL_SECTION_NAME (exp); + if (strcmp (section, ".ldata") == 0 + || strcmp (section, ".lbss") == 0) + return true; + return false; } - - return false; -} - -/* Return 1 if INSN may be converted into vector - instruction. */ - -static bool -scalar_to_vector_candidate_p (rtx_insn *insn) -{ - if (TARGET_64BIT) - return timode_scalar_to_vector_candidate_p (insn); else - return dimode_scalar_to_vector_candidate_p (insn); -} - -/* The DImode version of remove_non_convertible_regs. 
*/ - -static void -dimode_remove_non_convertible_regs (bitmap candidates) -{ - bitmap_iterator bi; - unsigned id; - bitmap regs = BITMAP_ALLOC (NULL); - - EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi) - { - rtx def_set = single_set (DF_INSN_UID_GET (id)->insn); - rtx reg = SET_DEST (def_set); - - if (!REG_P (reg) - || bitmap_bit_p (regs, REGNO (reg)) - || HARD_REGISTER_P (reg)) - continue; - - for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg)); - def; - def = DF_REF_NEXT_REG (def)) - { - if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) - { - if (dump_file) - fprintf (dump_file, - "r%d has non convertible definition in insn %d\n", - REGNO (reg), DF_REF_INSN_UID (def)); - - bitmap_set_bit (regs, REGNO (reg)); - break; - } - } - } - - EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi) { - for (df_ref def = DF_REG_DEF_CHAIN (id); - def; - def = DF_REF_NEXT_REG (def)) - if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) - { - if (dump_file) - fprintf (dump_file, "Removing insn %d from candidates list\n", - DF_REF_INSN_UID (def)); + HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp)); - bitmap_clear_bit (candidates, DF_REF_INSN_UID (def)); - } + /* If this is an incomplete type with size 0, then we can't put it + in data because it might be too big when completed. Also, + int_size_in_bytes returns -1 if size can vary or is larger than + an integer in which case also it is safer to assume that it goes in + large data. */ + if (size <= 0 || size > ix86_section_threshold) + return true; } - BITMAP_FREE (regs); + return false; } -/* For a register REGNO, scan instructions for its defs and uses. - Put REGNO in REGS if a def or use isn't in CANDIDATES. */ +/* i386-specific section flag to mark large sections. */ +#define SECTION_LARGE SECTION_MACH_DEP -static void -timode_check_non_convertible_regs (bitmap candidates, bitmap regs, - unsigned int regno) +/* Switch to the appropriate section for output of DECL. + DECL is either a `VAR_DECL' node or a constant of some sort. + RELOC indicates whether forming the initial value of DECL requires + link-time relocations. */ + +ATTRIBUTE_UNUSED static section * +x86_64_elf_select_section (tree decl, int reloc, + unsigned HOST_WIDE_INT align) { - for (df_ref def = DF_REG_DEF_CHAIN (regno); - def; - def = DF_REF_NEXT_REG (def)) + if (ix86_in_large_data_p (decl)) { - if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) + const char *sname = NULL; + unsigned int flags = SECTION_WRITE | SECTION_LARGE; + switch (categorize_decl_for_section (decl, reloc)) { - if (dump_file) - fprintf (dump_file, - "r%d has non convertible def in insn %d\n", - regno, DF_REF_INSN_UID (def)); - - bitmap_set_bit (regs, regno); + case SECCAT_DATA: + sname = ".ldata"; + break; + case SECCAT_DATA_REL: + sname = ".ldata.rel"; + break; + case SECCAT_DATA_REL_LOCAL: + sname = ".ldata.rel.local"; + break; + case SECCAT_DATA_REL_RO: + sname = ".ldata.rel.ro"; + break; + case SECCAT_DATA_REL_RO_LOCAL: + sname = ".ldata.rel.ro.local"; + break; + case SECCAT_BSS: + sname = ".lbss"; + flags |= SECTION_BSS; + break; + case SECCAT_RODATA: + case SECCAT_RODATA_MERGE_STR: + case SECCAT_RODATA_MERGE_STR_INIT: + case SECCAT_RODATA_MERGE_CONST: + sname = ".lrodata"; + flags &= ~SECTION_WRITE; + break; + case SECCAT_SRODATA: + case SECCAT_SDATA: + case SECCAT_SBSS: + gcc_unreachable (); + case SECCAT_TEXT: + case SECCAT_TDATA: + case SECCAT_TBSS: + /* We don't split these for medium model. Place them into + default sections and hope for best. 
*/ break; } - } - - for (df_ref ref = DF_REG_USE_CHAIN (regno); - ref; - ref = DF_REF_NEXT_REG (ref)) - { - /* Debug instructions are skipped. */ - if (NONDEBUG_INSN_P (DF_REF_INSN (ref)) - && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref))) + if (sname) { - if (dump_file) - fprintf (dump_file, - "r%d has non convertible use in insn %d\n", - regno, DF_REF_INSN_UID (ref)); - - bitmap_set_bit (regs, regno); - break; + /* We might get called with string constants, but get_named_section + doesn't like them as they are not DECLs. Also, we need to set + flags in that case. */ + if (!DECL_P (decl)) + return get_section (sname, flags, NULL); + return get_named_section (decl, sname, reloc); } } + return default_elf_select_section (decl, reloc, align); } -/* The TImode version of remove_non_convertible_regs. */ +/* Select a set of attributes for section NAME based on the properties + of DECL and whether or not RELOC indicates that DECL's initializer + might contain runtime relocations. */ -static void -timode_remove_non_convertible_regs (bitmap candidates) +static unsigned int ATTRIBUTE_UNUSED +x86_64_elf_section_type_flags (tree decl, const char *name, int reloc) { - bitmap_iterator bi; - unsigned id; - bitmap regs = BITMAP_ALLOC (NULL); - - EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi) - { - rtx def_set = single_set (DF_INSN_UID_GET (id)->insn); - rtx dest = SET_DEST (def_set); - rtx src = SET_SRC (def_set); - - if ((!REG_P (dest) - || bitmap_bit_p (regs, REGNO (dest)) - || HARD_REGISTER_P (dest)) - && (!REG_P (src) - || bitmap_bit_p (regs, REGNO (src)) - || HARD_REGISTER_P (src))) - continue; - - if (REG_P (dest)) - timode_check_non_convertible_regs (candidates, regs, - REGNO (dest)); - - if (REG_P (src)) - timode_check_non_convertible_regs (candidates, regs, - REGNO (src)); - } - - EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi) - { - for (df_ref def = DF_REG_DEF_CHAIN (id); - def; - def = DF_REF_NEXT_REG (def)) - if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def))) - { - if (dump_file) - fprintf (dump_file, "Removing insn %d from candidates list\n", - DF_REF_INSN_UID (def)); + unsigned int flags = default_section_type_flags (decl, name, reloc); - bitmap_clear_bit (candidates, DF_REF_INSN_UID (def)); - } + if (ix86_in_large_data_p (decl)) + flags |= SECTION_LARGE; - for (df_ref ref = DF_REG_USE_CHAIN (id); - ref; - ref = DF_REF_NEXT_REG (ref)) - if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref))) - { - if (dump_file) - fprintf (dump_file, "Removing insn %d from candidates list\n", - DF_REF_INSN_UID (ref)); + if (decl == NULL_TREE + && (strcmp (name, ".ldata.rel.ro") == 0 + || strcmp (name, ".ldata.rel.ro.local") == 0)) + flags |= SECTION_RELRO; - bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref)); - } - } + if (strcmp (name, ".lbss") == 0 + || strncmp (name, ".lbss.", 5) == 0 + || strncmp (name, ".gnu.linkonce.lb.", 16) == 0) + flags |= SECTION_BSS; - BITMAP_FREE (regs); + return flags; } -/* For a given bitmap of insn UIDs scans all instruction and - remove insn from CANDIDATES in case it has both convertible - and not convertible definitions. +/* Build up a unique section name, expressed as a + STRING_CST node, and assign it to DECL_SECTION_NAME (decl). + RELOC indicates whether the initial value of EXP requires + link-time relocations. */ - All insns in a bitmap are conversion candidates according to - scalar_to_vector_candidate_p. Currently it implies all insns - are single_set. 
*/ - -static void -remove_non_convertible_regs (bitmap candidates) -{ - if (TARGET_64BIT) - timode_remove_non_convertible_regs (candidates); - else - dimode_remove_non_convertible_regs (candidates); -} - -class scalar_chain -{ - public: - scalar_chain (); - virtual ~scalar_chain (); - - static unsigned max_id; - - /* ID of a chain. */ - unsigned int chain_id; - /* A queue of instructions to be included into a chain. */ - bitmap queue; - /* Instructions included into a chain. */ - bitmap insns; - /* All registers defined by a chain. */ - bitmap defs; - /* Registers used in both vector and sclar modes. */ - bitmap defs_conv; - - void build (bitmap candidates, unsigned insn_uid); - virtual int compute_convert_gain () = 0; - int convert (); - - protected: - void add_to_queue (unsigned insn_uid); - void emit_conversion_insns (rtx insns, rtx_insn *pos); - - private: - void add_insn (bitmap candidates, unsigned insn_uid); - void analyze_register_chain (bitmap candidates, df_ref ref); - virtual void mark_dual_mode_def (df_ref def) = 0; - virtual void convert_insn (rtx_insn *insn) = 0; - virtual void convert_registers () = 0; -}; - -class dimode_scalar_chain : public scalar_chain -{ - public: - int compute_convert_gain (); - private: - void mark_dual_mode_def (df_ref def); - rtx replace_with_subreg (rtx x, rtx reg, rtx subreg); - void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg); - void convert_insn (rtx_insn *insn); - void convert_op (rtx *op, rtx_insn *insn); - void convert_reg (unsigned regno); - void make_vector_copies (unsigned regno); - void convert_registers (); - int vector_const_cost (rtx exp); -}; - -class timode_scalar_chain : public scalar_chain +static void ATTRIBUTE_UNUSED +x86_64_elf_unique_section (tree decl, int reloc) { - public: - /* Convert from TImode to V1TImode is always faster. */ - int compute_convert_gain () { return 1; } - - private: - void mark_dual_mode_def (df_ref def); - void fix_debug_reg_uses (rtx reg); - void convert_insn (rtx_insn *insn); - /* We don't convert registers to difference size. */ - void convert_registers () {} -}; - -unsigned scalar_chain::max_id = 0; - -/* Initialize new chain. */ + if (ix86_in_large_data_p (decl)) + { + const char *prefix = NULL; + /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */ + bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP; -scalar_chain::scalar_chain () -{ - chain_id = ++max_id; + switch (categorize_decl_for_section (decl, reloc)) + { + case SECCAT_DATA: + case SECCAT_DATA_REL: + case SECCAT_DATA_REL_LOCAL: + case SECCAT_DATA_REL_RO: + case SECCAT_DATA_REL_RO_LOCAL: + prefix = one_only ? ".ld" : ".ldata"; + break; + case SECCAT_BSS: + prefix = one_only ? ".lb" : ".lbss"; + break; + case SECCAT_RODATA: + case SECCAT_RODATA_MERGE_STR: + case SECCAT_RODATA_MERGE_STR_INIT: + case SECCAT_RODATA_MERGE_CONST: + prefix = one_only ? ".lr" : ".lrodata"; + break; + case SECCAT_SRODATA: + case SECCAT_SDATA: + case SECCAT_SBSS: + gcc_unreachable (); + case SECCAT_TEXT: + case SECCAT_TDATA: + case SECCAT_TBSS: + /* We don't split these for medium model. Place them into + default sections and hope for best. 
*/ + break; + } + if (prefix) + { + const char *name, *linkonce; + char *string; - if (dump_file) - fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id); + name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)); + name = targetm.strip_name_encoding (name); - bitmap_obstack_initialize (NULL); - insns = BITMAP_ALLOC (NULL); - defs = BITMAP_ALLOC (NULL); - defs_conv = BITMAP_ALLOC (NULL); - queue = NULL; -} + /* If we're using one_only, then there needs to be a .gnu.linkonce + prefix to the section name. */ + linkonce = one_only ? ".gnu.linkonce" : ""; -/* Free chain's data. */ + string = ACONCAT ((linkonce, prefix, ".", name, NULL)); -scalar_chain::~scalar_chain () -{ - BITMAP_FREE (insns); - BITMAP_FREE (defs); - BITMAP_FREE (defs_conv); - bitmap_obstack_release (NULL); + set_decl_section_name (decl, string); + return; + } + } + default_unique_section (decl, reloc); } -/* Add instruction into chains' queue. */ - -void -scalar_chain::add_to_queue (unsigned insn_uid) -{ - if (bitmap_bit_p (insns, insn_uid) - || bitmap_bit_p (queue, insn_uid)) - return; +#ifdef COMMON_ASM_OP - if (dump_file) - fprintf (dump_file, " Adding insn %d into chain's #%d queue\n", - insn_uid, chain_id); - bitmap_set_bit (queue, insn_uid); -} +#ifndef LARGECOMM_SECTION_ASM_OP +#define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t" +#endif -/* For DImode conversion, mark register defined by DEF as requiring - conversion. */ +/* This says how to output assembler code to declare an + uninitialized external linkage data object. + For medium model x86-64 we need to use LARGECOMM_SECTION_ASM_OP opcode for + large objects. */ void -dimode_scalar_chain::mark_dual_mode_def (df_ref def) +x86_elf_aligned_decl_common (FILE *file, tree decl, + const char *name, unsigned HOST_WIDE_INT size, + int align) { - gcc_assert (DF_REF_REG_DEF_P (def)); - - if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def))) - return; - - if (dump_file) - fprintf (dump_file, - " Mark r%d def in insn %d as requiring both modes in chain #%d\n", - DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id); - - bitmap_set_bit (defs_conv, DF_REF_REGNO (def)); + if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC) + && size > (unsigned int)ix86_section_threshold) + { + switch_to_section (get_named_section (decl, ".lbss", 0)); + fputs (LARGECOMM_SECTION_ASM_OP, file); + } + else + fputs (COMMON_ASM_OP, file); + assemble_name (file, name); + fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n", + size, align / BITS_PER_UNIT); } +#endif -/* For TImode conversion, it is unused. */ +/* Utility function for targets to use in implementing + ASM_OUTPUT_ALIGNED_BSS. */ void -timode_scalar_chain::mark_dual_mode_def (df_ref) +x86_output_aligned_bss (FILE *file, tree decl, const char *name, + unsigned HOST_WIDE_INT size, int align) { - gcc_unreachable (); + if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC) + && size > (unsigned int)ix86_section_threshold) + switch_to_section (get_named_section (decl, ".lbss", 0)); + else + switch_to_section (bss_section); + ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT)); +#ifdef ASM_DECLARE_OBJECT_NAME + last_assemble_variable_decl = decl; + ASM_DECLARE_OBJECT_NAME (file, name, decl); +#else + /* Standard thing is just output label for the object. */ + ASM_OUTPUT_LABEL (file, name); +#endif /* ASM_DECLARE_OBJECT_NAME */ + ASM_OUTPUT_SKIP (file, size ? size : 1); } + +/* Decide whether we must probe the stack before any space allocation + on this target. 
It's essentially TARGET_STACK_PROBE except when + -fstack-check causes the stack to be already probed differently. */ -/* Check REF's chain to add new insns into a queue - and find registers requiring conversion. */ - -void -scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref) +bool +ix86_target_stack_probe (void) { - df_link *chain; - - gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)) - || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref))); - add_to_queue (DF_REF_INSN_UID (ref)); - - for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next) - { - unsigned uid = DF_REF_INSN_UID (chain->ref); - - if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref))) - continue; - - if (!DF_REF_REG_MEM_P (chain->ref)) - { - if (bitmap_bit_p (insns, uid)) - continue; - - if (bitmap_bit_p (candidates, uid)) - { - add_to_queue (uid); - continue; - } - } + /* Do not probe the stack twice if static stack checking is enabled. */ + if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK) + return false; - if (DF_REF_REG_DEF_P (chain->ref)) - { - if (dump_file) - fprintf (dump_file, " r%d def in insn %d isn't convertible\n", - DF_REF_REGNO (chain->ref), uid); - mark_dual_mode_def (chain->ref); - } - else - { - if (dump_file) - fprintf (dump_file, " r%d use in insn %d isn't convertible\n", - DF_REF_REGNO (chain->ref), uid); - mark_dual_mode_def (ref); - } - } + return TARGET_STACK_PROBE; } + +/* Decide whether we can make a sibling call to a function. DECL is the + declaration of the function being targeted by the call and EXP is the + CALL_EXPR representing the call. */ -/* Add instruction into a chain. */ - -void -scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid) +static bool +ix86_function_ok_for_sibcall (tree decl, tree exp) { - if (bitmap_bit_p (insns, insn_uid)) - return; - - if (dump_file) - fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id); - - bitmap_set_bit (insns, insn_uid); + tree type, decl_or_type; + rtx a, b; + bool bind_global = decl && !targetm.binds_local_p (decl); - rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn; - rtx def_set = single_set (insn); - if (def_set && REG_P (SET_DEST (def_set)) - && !HARD_REGISTER_P (SET_DEST (def_set))) - bitmap_set_bit (defs, REGNO (SET_DEST (def_set))); + if (ix86_function_naked (current_function_decl)) + return false; - df_ref ref; - df_ref def; - for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref)) - if (!HARD_REGISTER_P (DF_REF_REG (ref))) - for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref)); - def; - def = DF_REF_NEXT_REG (def)) - analyze_register_chain (candidates, def); - for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref)) - if (!DF_REF_REG_MEM_P (ref)) - analyze_register_chain (candidates, ref); -} + /* Sibling call isn't OK if there are no caller-saved registers + since all registers must be preserved before return. */ + if (cfun->machine->no_caller_saved_registers) + return false; -/* Build new chain starting from insn INSN_UID recursively - adding all dependent uses and definitions. */ + /* If we are generating position-independent code, we cannot sibcall + optimize direct calls to global functions, as the PLT requires + %ebx be live. (Darwin does not have a PLT.) 
*/ + if (!TARGET_MACHO + && !TARGET_64BIT + && flag_pic + && flag_plt + && bind_global) + return false; -void -scalar_chain::build (bitmap candidates, unsigned insn_uid) -{ - queue = BITMAP_ALLOC (NULL); - bitmap_set_bit (queue, insn_uid); + /* If we need to align the outgoing stack, then sibcalling would + unalign the stack, which may break the called function. */ + if (ix86_minimum_incoming_stack_boundary (true) + < PREFERRED_STACK_BOUNDARY) + return false; - if (dump_file) - fprintf (dump_file, "Building chain #%d...\n", chain_id); + if (decl) + { + decl_or_type = decl; + type = TREE_TYPE (decl); + } + else + { + /* We're looking at the CALL_EXPR, we need the type of the function. */ + type = CALL_EXPR_FN (exp); /* pointer expression */ + type = TREE_TYPE (type); /* pointer type */ + type = TREE_TYPE (type); /* function type */ + decl_or_type = type; + } - while (!bitmap_empty_p (queue)) + /* Check that the return value locations are the same. Like + if we are returning floats on the 80387 register stack, we cannot + make a sibcall from a function that doesn't return a float to a + function that does or, conversely, from a function that does return + a float to a function that doesn't; the necessary stack adjustment + would not be executed. This is also the place we notice + differences in the return value ABI. Note that it is ok for one + of the functions to have void return type as long as the return + value of the other is passed in a register. */ + a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false); + b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)), + cfun->decl, false); + if (STACK_REG_P (a) || STACK_REG_P (b)) { - insn_uid = bitmap_first_set_bit (queue); - bitmap_clear_bit (queue, insn_uid); - bitmap_clear_bit (candidates, insn_uid); - add_insn (candidates, insn_uid); + if (!rtx_equal_p (a, b)) + return false; } + else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl)))) + ; + else if (!rtx_equal_p (a, b)) + return false; - if (dump_file) + if (TARGET_64BIT) + { + /* The SYSV ABI has more call-clobbered registers; + disallow sibcalls from MS to SYSV. */ + if (cfun->machine->call_abi == MS_ABI + && ix86_function_type_abi (type) == SYSV_ABI) + return false; + } + else { - fprintf (dump_file, "Collected chain #%d...\n", chain_id); - fprintf (dump_file, " insns: "); - dump_bitmap (dump_file, insns); - if (!bitmap_empty_p (defs_conv)) + /* If this call is indirect, we'll need to be able to use a + call-clobbered register for the address of the target function. + Make sure that all such registers are not used for passing + parameters. Note that DLLIMPORT functions and call to global + function via GOT slot are indirect. */ + if (!decl + || (bind_global && flag_pic && !flag_plt) + || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)) + || flag_force_indirect_call) { - bitmap_iterator bi; - unsigned id; - const char *comma = ""; - fprintf (dump_file, " defs to convert: "); - EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi) - { - fprintf (dump_file, "%sr%d", comma, id); - comma = ", "; - } - fprintf (dump_file, "\n"); + /* Check if regparm >= 3 since arg_reg_available is set to + false if regparm == 0. If regparm is 1 or 2, there is + always a call-clobbered register available. + + ??? The symbol indirect call doesn't need a call-clobbered + register. But we don't know if this is a symbol indirect + call or not here. 
*/ + if (ix86_function_regparm (type, decl) >= 3 + && !cfun->machine->arg_reg_available) + return false; } } - BITMAP_FREE (queue); + /* Otherwise okay. That also includes certain types of indirect calls. */ + return true; } -/* Return a cost of building a vector costant - instead of using a scalar one. */ +/* This function determines from TYPE the calling-convention. */ -int -dimode_scalar_chain::vector_const_cost (rtx exp) +unsigned int +ix86_get_callcvt (const_tree type) { - gcc_assert (CONST_INT_P (exp)); + unsigned int ret = 0; + bool is_stdarg; + tree attrs; - if (standard_sse_constant_p (exp, V2DImode)) - return COSTS_N_INSNS (1); - return ix86_cost->sse_load[1]; -} + if (TARGET_64BIT) + return IX86_CALLCVT_CDECL; -/* Compute a gain for chain conversion. */ + attrs = TYPE_ATTRIBUTES (type); + if (attrs != NULL_TREE) + { + if (lookup_attribute ("cdecl", attrs)) + ret |= IX86_CALLCVT_CDECL; + else if (lookup_attribute ("stdcall", attrs)) + ret |= IX86_CALLCVT_STDCALL; + else if (lookup_attribute ("fastcall", attrs)) + ret |= IX86_CALLCVT_FASTCALL; + else if (lookup_attribute ("thiscall", attrs)) + ret |= IX86_CALLCVT_THISCALL; -int -dimode_scalar_chain::compute_convert_gain () -{ - bitmap_iterator bi; - unsigned insn_uid; - int gain = 0; - int cost = 0; - - if (dump_file) - fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id); - - EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi) - { - rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn; - rtx def_set = single_set (insn); - rtx src = SET_SRC (def_set); - rtx dst = SET_DEST (def_set); - - if (REG_P (src) && REG_P (dst)) - gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move; - else if (REG_P (src) && MEM_P (dst)) - gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1]; - else if (MEM_P (src) && REG_P (dst)) - gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1]; - else if (GET_CODE (src) == ASHIFT - || GET_CODE (src) == ASHIFTRT - || GET_CODE (src) == LSHIFTRT) - { - if (CONST_INT_P (XEXP (src, 0))) - gain -= vector_const_cost (XEXP (src, 0)); - - gain += ix86_cost->shift_const; - if (INTVAL (XEXP (src, 1)) >= 32) - gain -= COSTS_N_INSNS (1); - } - else if (GET_CODE (src) == PLUS - || GET_CODE (src) == MINUS - || GET_CODE (src) == IOR - || GET_CODE (src) == XOR - || GET_CODE (src) == AND) - { - gain += ix86_cost->add; - /* Additional gain for andnot for targets without BMI. */ - if (GET_CODE (XEXP (src, 0)) == NOT - && !TARGET_BMI) - gain += 2 * ix86_cost->add; - - if (CONST_INT_P (XEXP (src, 0))) - gain -= vector_const_cost (XEXP (src, 0)); - if (CONST_INT_P (XEXP (src, 1))) - gain -= vector_const_cost (XEXP (src, 1)); - } - else if (GET_CODE (src) == NEG - || GET_CODE (src) == NOT) - gain += ix86_cost->add - COSTS_N_INSNS (1); - else if (GET_CODE (src) == COMPARE) - { - /* Assume comparison cost is the same. */ - } - else if (CONST_INT_P (src)) - { - if (REG_P (dst)) - gain += COSTS_N_INSNS (2); - else if (MEM_P (dst)) - gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1]; - gain -= vector_const_cost (src); + /* Regparam isn't allowed for thiscall and fastcall. 
*/ + if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0) + { + if (lookup_attribute ("regparm", attrs)) + ret |= IX86_CALLCVT_REGPARM; + if (lookup_attribute ("sseregparm", attrs)) + ret |= IX86_CALLCVT_SSEREGPARM; } - else - gcc_unreachable (); - } - if (dump_file) - fprintf (dump_file, " Instruction conversion gain: %d\n", gain); - - EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi) - cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer; - - if (dump_file) - fprintf (dump_file, " Registers conversion cost: %d\n", cost); + if (IX86_BASE_CALLCVT(ret) != 0) + return ret; + } - gain -= cost; + is_stdarg = stdarg_p (type); + if (TARGET_RTD && !is_stdarg) + return IX86_CALLCVT_STDCALL | ret; - if (dump_file) - fprintf (dump_file, " Total gain: %d\n", gain); + if (ret != 0 + || is_stdarg + || TREE_CODE (type) != METHOD_TYPE + || ix86_function_type_abi (type) != MS_ABI) + return IX86_CALLCVT_CDECL | ret; - return gain; + return IX86_CALLCVT_THISCALL; } -/* Replace REG in X with a V2DI subreg of NEW_REG. */ +/* Return 0 if the attributes for two types are incompatible, 1 if they + are compatible, and 2 if they are nearly compatible (which causes a + warning to be generated). */ -rtx -dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg) +static int +ix86_comp_type_attributes (const_tree type1, const_tree type2) { - if (x == reg) - return gen_rtx_SUBREG (V2DImode, new_reg, 0); + unsigned int ccvt1, ccvt2; - const char *fmt = GET_RTX_FORMAT (GET_CODE (x)); - int i, j; - for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--) - { - if (fmt[i] == 'e') - XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg); - else if (fmt[i] == 'E') - for (j = XVECLEN (x, i) - 1; j >= 0; j--) - XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j), - reg, new_reg); - } + if (TREE_CODE (type1) != FUNCTION_TYPE + && TREE_CODE (type1) != METHOD_TYPE) + return 1; - return x; -} + ccvt1 = ix86_get_callcvt (type1); + ccvt2 = ix86_get_callcvt (type2); + if (ccvt1 != ccvt2) + return 0; + if (ix86_function_regparm (type1, NULL) + != ix86_function_regparm (type2, NULL)) + return 0; -/* Replace REG in INSN with a V2DI subreg of NEW_REG. */ + return 1; +} + +/* Return the regparm value for a function with the indicated TYPE and DECL. + DECL may be NULL when calling function indirectly + or considering a libcall. */ -void -dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn, - rtx reg, rtx new_reg) +static int +ix86_function_regparm (const_tree type, const_tree decl) { - replace_with_subreg (single_set (insn), reg, new_reg); -} + tree attr; + int regparm; + unsigned int ccvt; -/* Insert generated conversion instruction sequence INSNS - after instruction AFTER. New BB may be required in case - instruction has EH region attached. */ + if (TARGET_64BIT) + return (ix86_function_type_abi (type) == SYSV_ABI + ? 
X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX); + ccvt = ix86_get_callcvt (type); + regparm = ix86_regparm; -void -scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after) -{ - if (!control_flow_insn_p (after)) + if ((ccvt & IX86_CALLCVT_REGPARM) != 0) { - emit_insn_after (insns, after); - return; + attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type)); + if (attr) + { + regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))); + return regparm; + } } + else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) + return 2; + else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) + return 1; - basic_block bb = BLOCK_FOR_INSN (after); - edge e = find_fallthru_edge (bb->succs); - gcc_assert (e); + /* Use register calling convention for local functions when possible. */ + if (decl + && TREE_CODE (decl) == FUNCTION_DECL) + { + cgraph_node *target = cgraph_node::get (decl); + if (target) + target = target->function_symbol (); - basic_block new_bb = split_edge (e); - emit_insn_after (insns, BB_HEAD (new_bb)); -} + /* Caller and callee must agree on the calling convention, so + checking here just optimize means that with + __attribute__((optimize (...))) caller could use regparm convention + and callee not, or vice versa. Instead look at whether the callee + is optimized or not. */ + if (target && opt_for_fn (target->decl, optimize) + && !(profile_flag && !flag_fentry)) + { + cgraph_local_info *i = &target->local; + if (i && i->local && i->can_change_signature) + { + int local_regparm, globals = 0, regno; -/* Make vector copies for all register REGNO definitions - and replace its uses in a chain. */ + /* Make sure no regparm register is taken by a + fixed register variable. */ + for (local_regparm = 0; local_regparm < REGPARM_MAX; + local_regparm++) + if (fixed_regs[local_regparm]) + break; -void -dimode_scalar_chain::make_vector_copies (unsigned regno) -{ - rtx reg = regno_reg_rtx[regno]; - rtx vreg = gen_reg_rtx (DImode); - df_ref ref; + /* We don't want to use regparm(3) for nested functions as + these use a static chain pointer in the third argument. */ + if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl)) + local_regparm = 2; - for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) - if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) - { - start_sequence (); + /* Save a register for the split stack. 
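+	       Leave one argument register free so that the -fsplit-stack
+	       prologue has a scratch register to work with.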
*/ + if (flag_split_stack) + { + if (local_regparm == 3) + local_regparm = 2; + else if (local_regparm == 2 + && DECL_STATIC_CHAIN (target->decl)) + local_regparm = 1; + } - if (!TARGET_INTER_UNIT_MOVES_TO_VEC) - { - rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP); - emit_move_insn (adjust_address (tmp, SImode, 0), - gen_rtx_SUBREG (SImode, reg, 0)); - emit_move_insn (adjust_address (tmp, SImode, 4), - gen_rtx_SUBREG (SImode, reg, 4)); - emit_move_insn (vreg, tmp); - } - else if (TARGET_SSE4_1) - { - emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0), - CONST0_RTX (V4SImode), - gen_rtx_SUBREG (SImode, reg, 0))); - emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0), - gen_rtx_SUBREG (V4SImode, vreg, 0), - gen_rtx_SUBREG (SImode, reg, 4), - GEN_INT (2))); - } - else - { - rtx tmp = gen_reg_rtx (DImode); - emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0), - CONST0_RTX (V4SImode), - gen_rtx_SUBREG (SImode, reg, 0))); - emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0), - CONST0_RTX (V4SImode), - gen_rtx_SUBREG (SImode, reg, 4))); - emit_insn (gen_vec_interleave_lowv4si - (gen_rtx_SUBREG (V4SImode, vreg, 0), - gen_rtx_SUBREG (V4SImode, vreg, 0), - gen_rtx_SUBREG (V4SImode, tmp, 0))); - } - rtx_insn *seq = get_insns (); - end_sequence (); - rtx_insn *insn = DF_REF_INSN (ref); - emit_conversion_insns (seq, insn); - - if (dump_file) - fprintf (dump_file, - " Copied r%d to a vector register r%d for insn %d\n", - regno, REGNO (vreg), INSN_UID (insn)); - } + /* Each fixed register usage increases register pressure, + so less registers should be used for argument passing. + This functionality can be overriden by an explicit + regparm value. */ + for (regno = AX_REG; regno <= DI_REG; regno++) + if (fixed_regs[regno]) + globals++; - for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) - if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) - { - rtx_insn *insn = DF_REF_INSN (ref); + local_regparm + = globals < local_regparm ? local_regparm - globals : 0; - replace_with_subreg_in_insn (insn, reg, vreg); + if (local_regparm > regparm) + regparm = local_regparm; + } + } + } - if (dump_file) - fprintf (dump_file, " Replaced r%d with r%d in insn %d\n", - regno, REGNO (vreg), INSN_UID (insn)); - } + return regparm; } -/* Convert all definitions of register REGNO - and fix its uses. Scalar copies may be created - in case register is used in not convertible insn. */ +/* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and + DFmode (2) arguments in SSE registers for a function with the + indicated TYPE and DECL. DECL may be NULL when calling function + indirectly or considering a libcall. Return -1 if any FP parameter + should be rejected by error. This is used in siutation we imply SSE + calling convetion but the function is called from another function with + SSE disabled. Otherwise return 0. */ -void -dimode_scalar_chain::convert_reg (unsigned regno) +static int +ix86_function_sseregparm (const_tree type, const_tree decl, bool warn) { - bool scalar_copy = bitmap_bit_p (defs_conv, regno); - rtx reg = regno_reg_rtx[regno]; - rtx scopy = NULL_RTX; - df_ref ref; - bitmap conv; - - conv = BITMAP_ALLOC (NULL); - bitmap_copy (conv, insns); - - if (scalar_copy) - scopy = gen_reg_rtx (DImode); + gcc_assert (!TARGET_64BIT); - for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) + /* Use SSE registers to pass SFmode and DFmode arguments if requested + by the sseregparm attribute. 
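+     (TARGET_SSEREGPARM is the global form of the same request, set by
+     the -msseregparm option.)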
*/ + if (TARGET_SSEREGPARM + || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type)))) { - rtx_insn *insn = DF_REF_INSN (ref); - rtx def_set = single_set (insn); - rtx src = SET_SRC (def_set); - rtx reg = DF_REF_REG (ref); - - if (!MEM_P (src)) - { - replace_with_subreg_in_insn (insn, reg, reg); - bitmap_clear_bit (conv, INSN_UID (insn)); - } - - if (scalar_copy) + if (!TARGET_SSE) { - start_sequence (); - if (!TARGET_INTER_UNIT_MOVES_FROM_VEC) - { - rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP); - emit_move_insn (tmp, reg); - emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0), - adjust_address (tmp, SImode, 0)); - emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4), - adjust_address (tmp, SImode, 4)); - } - else if (TARGET_SSE4_1) - { - rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); - emit_insn - (gen_rtx_SET - (gen_rtx_SUBREG (SImode, scopy, 0), - gen_rtx_VEC_SELECT (SImode, - gen_rtx_SUBREG (V4SImode, reg, 0), tmp))); - - tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx)); - emit_insn - (gen_rtx_SET - (gen_rtx_SUBREG (SImode, scopy, 4), - gen_rtx_VEC_SELECT (SImode, - gen_rtx_SUBREG (V4SImode, reg, 0), tmp))); - } - else + if (warn) { - rtx vcopy = gen_reg_rtx (V2DImode); - emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0)); - emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0), - gen_rtx_SUBREG (SImode, vcopy, 0)); - emit_move_insn (vcopy, - gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32))); - emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4), - gen_rtx_SUBREG (SImode, vcopy, 0)); + if (decl) + error ("calling %qD with attribute sseregparm without " + "SSE/SSE2 enabled", decl); + else + error ("calling %qT with attribute sseregparm without " + "SSE/SSE2 enabled", type); } - rtx_insn *seq = get_insns (); - end_sequence (); - emit_conversion_insns (seq, insn); - - if (dump_file) - fprintf (dump_file, - " Copied r%d to a scalar register r%d for insn %d\n", - regno, REGNO (scopy), INSN_UID (insn)); + return 0; } - } - - for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) - if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) - { - if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref))) - { - rtx_insn *insn = DF_REF_INSN (ref); - rtx def_set = single_set (insn); - gcc_assert (def_set); + return 2; + } - rtx src = SET_SRC (def_set); - rtx dst = SET_DEST (def_set); + if (!decl) + return 0; - if (!MEM_P (dst) || !REG_P (src)) - replace_with_subreg_in_insn (insn, reg, reg); + cgraph_node *target = cgraph_node::get (decl); + if (target) + target = target->function_symbol (); - bitmap_clear_bit (conv, INSN_UID (insn)); - } - } - /* Skip debug insns and uninitialized uses. */ - else if (DF_REF_CHAIN (ref) - && NONDEBUG_INSN_P (DF_REF_INSN (ref))) - { - gcc_assert (scopy); - replace_rtx (DF_REF_INSN (ref), reg, scopy); - df_insn_rescan (DF_REF_INSN (ref)); - } + /* For local functions, pass up to SSE_REGPARM_MAX SFmode + (and DFmode for SSE2) arguments in SSE registers. */ + if (target + /* TARGET_SSE_MATH */ + && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE) + && opt_for_fn (target->decl, optimize) + && !(profile_flag && !flag_fentry)) + { + cgraph_local_info *i = &target->local; + if (i && i->local && i->can_change_signature) + { + /* Refuse to produce wrong code when local function with SSE enabled + is called from SSE disabled function. + FIXME: We need a way to detect these cases cross-ltrans partition + and avoid using SSE calling conventions on local functions called + from function with SSE disabled. 
For now at least delay the + warning until we know we are going to produce wrong code. + See PR66047 */ + if (!TARGET_SSE && warn) + return -1; + return TARGET_SSE2_P (target_opts_for_fn (target->decl) + ->x_ix86_isa_flags) ? 2 : 1; + } + } - BITMAP_FREE (conv); + return 0; } -/* Convert operand OP in INSN. We should handle - memory operands and uninitialized registers. - All other register uses are converted during - registers conversion. */ +/* Return true if EAX is live at the start of the function. Used by + ix86_expand_prologue to determine if we need special help before + calling allocate_stack_worker. */ -void -dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn) +static bool +ix86_eax_live_at_start_p (void) { - *op = copy_rtx_if_shared (*op); + /* Cheat. Don't bother working forward from ix86_function_regparm + to the function type to whether an actual argument is located in + eax. Instead just look at cfg info, which is still close enough + to correct at this point. This gives false positives for broken + functions that might use uninitialized data that happens to be + allocated in eax, but who cares? */ + return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0); +} - if (GET_CODE (*op) == NOT) - { - convert_op (&XEXP (*op, 0), insn); - PUT_MODE (*op, V2DImode); - } - else if (MEM_P (*op)) - { - rtx tmp = gen_reg_rtx (DImode); - - emit_insn_before (gen_move_insn (tmp, *op), insn); - *op = gen_rtx_SUBREG (V2DImode, tmp, 0); +static bool +ix86_keep_aggregate_return_pointer (tree fntype) +{ + tree attr; - if (dump_file) - fprintf (dump_file, " Preloading operand for insn %d into r%d\n", - INSN_UID (insn), REGNO (tmp)); - } - else if (REG_P (*op)) - { - /* We may have not converted register usage in case - this register has no definition. Otherwise it - should be converted in convert_reg. */ - df_ref ref; - FOR_EACH_INSN_USE (ref, insn) - if (DF_REF_REGNO (ref) == REGNO (*op)) - { - gcc_assert (!DF_REF_CHAIN (ref)); - break; - } - *op = gen_rtx_SUBREG (V2DImode, *op, 0); - } - else if (CONST_INT_P (*op)) + if (!TARGET_64BIT) { - rtx vec_cst; - rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0); - - /* Prefer all ones vector in case of -1. */ - if (constm1_operand (*op, GET_MODE (*op))) - vec_cst = CONSTM1_RTX (V2DImode); - else - vec_cst = gen_rtx_CONST_VECTOR (V2DImode, - gen_rtvec (2, *op, const0_rtx)); - - if (!standard_sse_constant_p (vec_cst, V2DImode)) - { - start_sequence (); - vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst)); - rtx_insn *seq = get_insns (); - end_sequence (); - emit_insn_before (seq, insn); - } + attr = lookup_attribute ("callee_pop_aggregate_return", + TYPE_ATTRIBUTES (fntype)); + if (attr) + return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0); - emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn); - *op = tmp; - } - else - { - gcc_assert (SUBREG_P (*op)); - gcc_assert (GET_MODE (*op) == V2DImode); + /* For 32-bit MS-ABI the default is to keep aggregate + return pointer. */ + if (ix86_function_type_abi (fntype) == MS_ABI) + return true; } + return KEEP_AGGREGATE_RETURN_POINTER != 0; } -/* Convert INSN to vector mode. */ - -void -dimode_scalar_chain::convert_insn (rtx_insn *insn) -{ - rtx def_set = single_set (insn); - rtx src = SET_SRC (def_set); - rtx dst = SET_DEST (def_set); - rtx subreg; +/* Value is the number of bytes of arguments automatically + popped when returning from a subroutine call. 
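+   For example, a non-variadic stdcall function taking two ints returns
+   with 'ret $8' and pops its own 8 bytes of arguments, whereas for a
+   plain cdecl function this hook returns 0 and the caller pops them.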
+ FUNDECL is the declaration node of the function (as a tree), + FUNTYPE is the data type of the function (as a tree), + or for a library call it is an identifier node for the subroutine name. + SIZE is the number of bytes of arguments passed on the stack. - if (MEM_P (dst) && !REG_P (src)) - { - /* There are no scalar integer instructions and therefore - temporary register usage is required. */ - rtx tmp = gen_reg_rtx (DImode); - emit_conversion_insns (gen_move_insn (dst, tmp), insn); - dst = gen_rtx_SUBREG (V2DImode, tmp, 0); - } + On the 80386, the RTD insn may be used to pop them if the number + of args is fixed, but if the number is variable then the caller + must pop them all. RTD can't be used for library calls now + because the library is compiled with the Unix compiler. + Use of RTD is a selectable option, since it is incompatible with + standard Unix calling sequences. If the option is not selected, + the caller must always pop the args. - switch (GET_CODE (src)) - { - case ASHIFT: - case ASHIFTRT: - case LSHIFTRT: - convert_op (&XEXP (src, 0), insn); - PUT_MODE (src, V2DImode); - break; + The attribute stdcall is equivalent to RTD on a per module basis. */ - case PLUS: - case MINUS: - case IOR: - case XOR: - case AND: - convert_op (&XEXP (src, 0), insn); - convert_op (&XEXP (src, 1), insn); - PUT_MODE (src, V2DImode); - break; +static poly_int64 +ix86_return_pops_args (tree fundecl, tree funtype, poly_int64 size) +{ + unsigned int ccvt; - case NEG: - src = XEXP (src, 0); - convert_op (&src, insn); - subreg = gen_reg_rtx (V2DImode); - emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn); - src = gen_rtx_MINUS (V2DImode, subreg, src); - break; + /* None of the 64-bit ABIs pop arguments. */ + if (TARGET_64BIT) + return 0; - case NOT: - src = XEXP (src, 0); - convert_op (&src, insn); - subreg = gen_reg_rtx (V2DImode); - emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn); - src = gen_rtx_XOR (V2DImode, src, subreg); - break; + ccvt = ix86_get_callcvt (funtype); - case MEM: - if (!REG_P (dst)) - convert_op (&src, insn); - break; + if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL + | IX86_CALLCVT_THISCALL)) != 0 + && ! stdarg_p (funtype)) + return size; - case REG: - if (!MEM_P (dst)) - convert_op (&src, insn); - break; + /* Lose any fake structure return argument if it is passed on the stack. */ + if (aggregate_value_p (TREE_TYPE (funtype), fundecl) + && !ix86_keep_aggregate_return_pointer (funtype)) + { + int nregs = ix86_function_regparm (funtype, fundecl); + if (nregs == 0) + return GET_MODE_SIZE (Pmode); + } - case SUBREG: - gcc_assert (GET_MODE (src) == V2DImode); - break; + return 0; +} - case COMPARE: - src = SUBREG_REG (XEXP (XEXP (src, 0), 0)); +/* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */ - gcc_assert ((REG_P (src) && GET_MODE (src) == DImode) - || (SUBREG_P (src) && GET_MODE (src) == V2DImode)); +static bool +ix86_legitimate_combined_insn (rtx_insn *insn) +{ + int i; - if (REG_P (src)) - subreg = gen_rtx_SUBREG (V2DImode, src, 0); - else - subreg = copy_rtx_if_shared (src); - emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg), - copy_rtx_if_shared (subreg), - copy_rtx_if_shared (subreg)), - insn); - dst = gen_rtx_REG (CCmode, FLAGS_REG); - src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src), - copy_rtx_if_shared (src)), - UNSPEC_PTEST); - break; + /* Check operand constraints in case hard registers were propagated + into insn pattern. 
This check prevents combine pass from + generating insn patterns with invalid hard register operands. + These invalid insns can eventually confuse reload to error out + with a spill failure. See also PRs 46829 and 46843. */ - case CONST_INT: - convert_op (&src, insn); - break; + gcc_assert (INSN_CODE (insn) >= 0); - default: - gcc_unreachable (); - } + extract_insn (insn); + preprocess_constraints (insn); - SET_SRC (def_set) = src; - SET_DEST (def_set) = dst; + int n_operands = recog_data.n_operands; + int n_alternatives = recog_data.n_alternatives; + for (i = 0; i < n_operands; i++) + { + rtx op = recog_data.operand[i]; + machine_mode mode = GET_MODE (op); + const operand_alternative *op_alt; + int offset = 0; + bool win; + int j; - /* Drop possible dead definitions. */ - PATTERN (insn) = def_set; + /* A unary operator may be accepted by the predicate, but it + is irrelevant for matching constraints. */ + if (UNARY_P (op)) + op = XEXP (op, 0); - INSN_CODE (insn) = -1; - recog_memoized (insn); - df_insn_rescan (insn); -} + if (SUBREG_P (op)) + { + if (REG_P (SUBREG_REG (op)) + && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER) + offset = subreg_regno_offset (REGNO (SUBREG_REG (op)), + GET_MODE (SUBREG_REG (op)), + SUBREG_BYTE (op), + GET_MODE (op)); + op = SUBREG_REG (op); + } -/* Fix uses of converted REG in debug insns. */ + if (!(REG_P (op) && HARD_REGISTER_P (op))) + continue; -void -timode_scalar_chain::fix_debug_reg_uses (rtx reg) -{ - if (!flag_var_tracking) - return; + op_alt = recog_op_alt; - df_ref ref, next; - for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next) - { - rtx_insn *insn = DF_REF_INSN (ref); - /* Make sure the next ref is for a different instruction, - so that we're not affected by the rescan. */ - next = DF_REF_NEXT_REG (ref); - while (next && DF_REF_INSN (next) == insn) - next = DF_REF_NEXT_REG (next); + /* Operand has no constraints, anything is OK. */ + win = !n_alternatives; - if (DEBUG_INSN_P (insn)) + alternative_mask preferred = get_preferred_alternatives (insn); + for (j = 0; j < n_alternatives; j++, op_alt += n_operands) { - /* It may be a debug insn with a TImode variable in - register. */ - bool changed = false; - for (; ref != next; ref = DF_REF_NEXT_REG (ref)) + if (!TEST_BIT (preferred, j)) + continue; + if (op_alt[i].anything_ok + || (op_alt[i].matches != -1 + && operands_match_p + (recog_data.operand[i], + recog_data.operand[op_alt[i].matches])) + || reg_fits_class_p (op, op_alt[i].cl, offset, mode)) { - rtx *loc = DF_REF_LOC (ref); - if (REG_P (*loc) && GET_MODE (*loc) == V1TImode) - { - *loc = gen_rtx_SUBREG (TImode, *loc, 0); - changed = true; - } + win = true; + break; } - if (changed) - df_insn_rescan (insn); } + + if (!win) + return false; } + + return true; } + +/* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */ -/* Convert INSN from TImode to V1T1mode. */ +static unsigned HOST_WIDE_INT +ix86_asan_shadow_offset (void) +{ + return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44) + : HOST_WIDE_INT_C (0x7fff8000)) + : (HOST_WIDE_INT_1 << 29); +} + +/* Argument support functions. */ -void -timode_scalar_chain::convert_insn (rtx_insn *insn) +/* Return true when register may be used to pass function parameters. 
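+   REGNO is a hard register number; the answer depends on the ABI in
+   use: the 32-bit regparm/MMX/SSE argument registers, or the SysV vs.
+   MS-ABI integer and SSE argument registers in 64-bit mode.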
*/ +bool +ix86_function_arg_regno_p (int regno) { - rtx def_set = single_set (insn); - rtx src = SET_SRC (def_set); - rtx dst = SET_DEST (def_set); + int i; + enum calling_abi call_abi; + const int *parm_regs; - switch (GET_CODE (dst)) + if (!TARGET_64BIT) { - case REG: - { - rtx tmp = find_reg_equal_equiv_note (insn); - if (tmp) - PUT_MODE (XEXP (tmp, 0), V1TImode); - PUT_MODE (dst, V1TImode); - fix_debug_reg_uses (dst); - } - break; - case MEM: - PUT_MODE (dst, V1TImode); - break; - - default: - gcc_unreachable (); + if (TARGET_MACHO) + return (regno < REGPARM_MAX + || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno])); + else + return (regno < REGPARM_MAX + || (TARGET_MMX && MMX_REGNO_P (regno) + && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX)) + || (TARGET_SSE && SSE_REGNO_P (regno) + && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))); } - switch (GET_CODE (src)) - { - case REG: - PUT_MODE (src, V1TImode); - /* Call fix_debug_reg_uses only if SRC is never defined. */ - if (!DF_REG_DEF_CHAIN (REGNO (src))) - fix_debug_reg_uses (src); - break; - - case MEM: - PUT_MODE (src, V1TImode); - break; - - case CONST_WIDE_INT: - if (NONDEBUG_INSN_P (insn)) - { - /* Since there are no instructions to store 128-bit constant, - temporary register usage is required. */ - rtx tmp = gen_reg_rtx (V1TImode); - start_sequence (); - src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src)); - src = validize_mem (force_const_mem (V1TImode, src)); - rtx_insn *seq = get_insns (); - end_sequence (); - if (seq) - emit_insn_before (seq, insn); - emit_conversion_insns (gen_rtx_SET (dst, tmp), insn); - dst = tmp; - } - break; - - case CONST_INT: - switch (standard_sse_constant_p (src, TImode)) - { - case 1: - src = CONST0_RTX (GET_MODE (dst)); - break; - case 2: - src = CONSTM1_RTX (GET_MODE (dst)); - break; - default: - gcc_unreachable (); - } - if (NONDEBUG_INSN_P (insn)) - { - rtx tmp = gen_reg_rtx (V1TImode); - /* Since there are no instructions to store standard SSE - constant, temporary register usage is required. */ - emit_conversion_insns (gen_rtx_SET (dst, tmp), insn); - dst = tmp; - } - break; + if (TARGET_SSE && SSE_REGNO_P (regno) + && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)) + return true; - default: - gcc_unreachable (); - } + /* TODO: The function should depend on current function ABI but + builtins.c would need updating then. Therefore we use the + default ABI. */ + call_abi = ix86_cfun_abi (); - SET_SRC (def_set) = src; - SET_DEST (def_set) = dst; + /* RAX is used as hidden argument to va_arg functions. */ + if (call_abi == SYSV_ABI && regno == AX_REG) + return true; - /* Drop possible dead definitions. */ - PATTERN (insn) = def_set; + if (call_abi == MS_ABI) + parm_regs = x86_64_ms_abi_int_parameter_registers; + else + parm_regs = x86_64_int_parameter_registers; - INSN_CODE (insn) = -1; - recog_memoized (insn); - df_insn_rescan (insn); + for (i = 0; i < (call_abi == MS_ABI + ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++) + if (regno == parm_regs[i]) + return true; + return false; } -void -dimode_scalar_chain::convert_registers () +/* Return if we do not know how to pass TYPE solely in registers. 
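+   That is, return true when TYPE must be passed on the stack rather
+   than in registers.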
*/ + +static bool +ix86_must_pass_in_stack (machine_mode mode, const_tree type) { - bitmap_iterator bi; - unsigned id; + if (must_pass_in_stack_var_size_or_pad (mode, type)) + return true; - EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi) - convert_reg (id); - - EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi) - make_vector_copies (id); + /* For 32-bit, we want TImode aggregates to go on the stack. But watch out! + The layout_type routine is crafty and tries to trick us into passing + currently unsupported vector types on the stack by using TImode. */ + return (!TARGET_64BIT && mode == TImode + && type && TREE_CODE (type) != VECTOR_TYPE); } -/* Convert whole chain creating required register - conversions and copies. */ - +/* It returns the size, in bytes, of the area reserved for arguments passed + in registers for the function represented by fndecl dependent to the used + abi format. */ int -scalar_chain::convert () +ix86_reg_parm_stack_space (const_tree fndecl) { - bitmap_iterator bi; - unsigned id; - int converted_insns = 0; + enum calling_abi call_abi = SYSV_ABI; + if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL) + call_abi = ix86_function_abi (fndecl); + else + call_abi = ix86_function_type_abi (fndecl); + if (TARGET_64BIT && call_abi == MS_ABI) + return 32; + return 0; +} - if (!dbg_cnt (stv_conversion)) - return 0; +/* We add this as a workaround in order to use libc_has_function + hook in i386.md. */ +bool +ix86_libc_has_function (enum function_class fn_class) +{ + return targetm.libc_has_function (fn_class); +} - if (dump_file) - fprintf (dump_file, "Converting chain #%d...\n", chain_id); +/* Returns value SYSV_ABI, MS_ABI dependent on fntype, + specifying the call abi used. */ +enum calling_abi +ix86_function_type_abi (const_tree fntype) +{ + enum calling_abi abi = ix86_abi; - convert_registers (); + if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE) + return abi; - EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi) + if (abi == SYSV_ABI + && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype))) { - convert_insn (DF_INSN_UID_GET (id)->insn); - converted_insns++; + static int warned; + if (TARGET_X32 && !warned) + { + error ("X32 does not support ms_abi attribute"); + warned = 1; + } + + abi = MS_ABI; } + else if (abi == MS_ABI + && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype))) + abi = SYSV_ABI; - return converted_insns; + return abi; } -/* Main STV pass function. Find and convert scalar - instructions into vector mode when profitable. */ - -static unsigned int -convert_scalars_to_vector () +enum calling_abi +ix86_function_abi (const_tree fndecl) { - basic_block bb; - bitmap candidates; - int converted_insns = 0; - - bitmap_obstack_initialize (NULL); - candidates = BITMAP_ALLOC (NULL); - - calculate_dominance_info (CDI_DOMINATORS); - df_set_flags (DF_DEFER_INSN_RESCAN); - df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN); - df_md_add_problem (); - df_analyze (); + return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi; +} - /* Find all instructions we want to convert into vector mode. */ - if (dump_file) - fprintf (dump_file, "Searching for mode conversion candidates...\n"); +/* Returns value SYSV_ABI, MS_ABI dependent on cfun, + specifying the call abi used. */ +enum calling_abi +ix86_cfun_abi (void) +{ + return cfun ? 
cfun->machine->call_abi : ix86_abi; +} - FOR_EACH_BB_FN (bb, cfun) +bool +ix86_function_ms_hook_prologue (const_tree fn) +{ + if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn))) { - rtx_insn *insn; - FOR_BB_INSNS (bb, insn) - if (scalar_to_vector_candidate_p (insn)) - { - if (dump_file) - fprintf (dump_file, " insn %d is marked as a candidate\n", - INSN_UID (insn)); - - bitmap_set_bit (candidates, INSN_UID (insn)); - } + if (decl_function_context (fn) != NULL_TREE) + error_at (DECL_SOURCE_LOCATION (fn), + "ms_hook_prologue is not compatible with nested function"); + else + return true; } + return false; +} - remove_non_convertible_regs (candidates); - - if (bitmap_empty_p (candidates)) - if (dump_file) - fprintf (dump_file, "There are no candidates for optimization.\n"); +bool +ix86_function_naked (const_tree fn) +{ + if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn))) + return true; - while (!bitmap_empty_p (candidates)) - { - unsigned uid = bitmap_first_set_bit (candidates); - scalar_chain *chain; + return false; +} - if (TARGET_64BIT) - chain = new timode_scalar_chain; - else - chain = new dimode_scalar_chain; +/* Write the extra assembler code needed to declare a function properly. */ - /* Find instructions chain we want to convert to vector mode. - Check all uses and definitions to estimate all required - conversions. */ - chain->build (candidates, uid); +void +ix86_asm_output_function_label (FILE *asm_out_file, const char *fname, + tree decl) +{ + bool is_ms_hook = ix86_function_ms_hook_prologue (decl); - if (chain->compute_convert_gain () > 0) - converted_insns += chain->convert (); - else - if (dump_file) - fprintf (dump_file, "Chain #%d conversion is not profitable\n", - chain->chain_id); + if (is_ms_hook) + { + int i, filler_count = (TARGET_64BIT ? 32 : 16); + unsigned int filler_cc = 0xcccccccc; - delete chain; + for (i = 0; i < filler_count; i += 4) + fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc); } - if (dump_file) - fprintf (dump_file, "Total insns converted: %d\n", converted_insns); +#ifdef SUBTARGET_ASM_UNWIND_INIT + SUBTARGET_ASM_UNWIND_INIT (asm_out_file); +#endif - BITMAP_FREE (candidates); - bitmap_obstack_release (NULL); - df_process_deferred_rescans (); + ASM_OUTPUT_LABEL (asm_out_file, fname); - /* Conversion means we may have 128bit register spills/fills - which require aligned stack. */ - if (converted_insns) + /* Output magic byte marker, if hot-patch attribute is set. */ + if (is_ms_hook) { - if (crtl->stack_alignment_needed < 128) - crtl->stack_alignment_needed = 128; - if (crtl->stack_alignment_estimated < 128) - crtl->stack_alignment_estimated = 128; - /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. 
*/ if (TARGET_64BIT) - for (tree parm = DECL_ARGUMENTS (current_function_decl); - parm; parm = DECL_CHAIN (parm)) - { - if (TYPE_MODE (TREE_TYPE (parm)) != TImode) - continue; - if (DECL_RTL_SET_P (parm) - && GET_MODE (DECL_RTL (parm)) == V1TImode) - { - rtx r = DECL_RTL (parm); - if (REG_P (r)) - SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0)); - } - if (DECL_INCOMING_RTL (parm) - && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode) - { - rtx r = DECL_INCOMING_RTL (parm); - if (REG_P (r)) - DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0); - } - } + { + /* leaq [%rsp + 0], %rsp */ + fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n", + asm_out_file); + } + else + { + /* movl.s %edi, %edi + push %ebp + movl.s %esp, %ebp */ + fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file); + } } - - return 0; } -namespace { - -const pass_data pass_data_insert_vzeroupper = +/* Implementation of call abi switching target hook. Specific to FNDECL + the specific call register sets are set. See also + ix86_conditional_register_usage for more details. */ +void +ix86_call_abi_override (const_tree fndecl) { - RTL_PASS, /* type */ - "vzeroupper", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ - TV_MACH_DEP, /* tv_id */ - 0, /* properties_required */ - 0, /* properties_provided */ - 0, /* properties_destroyed */ - 0, /* todo_flags_start */ - TODO_df_finish, /* todo_flags_finish */ -}; + cfun->machine->call_abi = ix86_function_abi (fndecl); +} -class pass_insert_vzeroupper : public rtl_opt_pass +/* Return 1 if pseudo register should be created and used to hold + GOT address for PIC code. */ +bool +ix86_use_pseudo_pic_reg (void) { -public: - pass_insert_vzeroupper(gcc::context *ctxt) - : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt) - {} - - /* opt_pass methods: */ - virtual bool gate (function *) - { - return TARGET_AVX - && TARGET_VZEROUPPER && flag_expensive_optimizations - && !optimize_size; - } - - virtual unsigned int execute (function *) - { - return rest_of_handle_insert_vzeroupper (); - } + if ((TARGET_64BIT + && (ix86_cmodel == CM_SMALL_PIC + || TARGET_PECOFF)) + || !flag_pic) + return false; + return true; +} -}; // class pass_insert_vzeroupper +/* Initialize large model PIC register. */ -const pass_data pass_data_stv = +static void +ix86_init_large_pic_reg (unsigned int tmp_regno) { - RTL_PASS, /* type */ - "stv", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ - TV_MACH_DEP, /* tv_id */ - 0, /* properties_required */ - 0, /* properties_provided */ - 0, /* properties_destroyed */ - 0, /* todo_flags_start */ - TODO_df_finish, /* todo_flags_finish */ -}; + rtx_code_label *label; + rtx tmp_reg; + + gcc_assert (Pmode == DImode); + label = gen_label_rtx (); + emit_label (label); + LABEL_PRESERVE_P (label) = 1; + tmp_reg = gen_rtx_REG (Pmode, tmp_regno); + gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno); + emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx, + label)); + emit_insn (gen_set_got_offset_rex64 (tmp_reg, label)); + emit_insn (ix86_gen_add3 (pic_offset_table_rtx, + pic_offset_table_rtx, tmp_reg)); + const char *name = LABEL_NAME (label); + PUT_CODE (label, NOTE); + NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL; + NOTE_DELETED_LABEL_NAME (label) = name; +} -class pass_stv : public rtl_opt_pass +/* Create and initialize PIC register if required. 
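+   This is a no-op unless ix86_use_pseudo_pic_reg says a pseudo GOT
+   pointer is wanted; the set-up sequence is inserted on the edge out
+   of the entry block.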
*/ +static void +ix86_init_pic_reg (void) { -public: - pass_stv (gcc::context *ctxt) - : rtl_opt_pass (pass_data_stv, ctxt), - timode_p (false) - {} + edge entry_edge; + rtx_insn *seq; - /* opt_pass methods: */ - virtual bool gate (function *) - { - return (timode_p == !!TARGET_64BIT - && TARGET_STV && TARGET_SSE2 && optimize > 1); - } + if (!ix86_use_pseudo_pic_reg ()) + return; - virtual unsigned int execute (function *) - { - return convert_scalars_to_vector (); - } + start_sequence (); - opt_pass *clone () + if (TARGET_64BIT) { - return new pass_stv (m_ctxt); + if (ix86_cmodel == CM_LARGE_PIC) + ix86_init_large_pic_reg (R11_REG); + else + emit_insn (gen_set_got_rex64 (pic_offset_table_rtx)); } - - void set_pass_param (unsigned int n, bool param) + else { - gcc_assert (n == 0); - timode_p = param; + /* If there is future mcount call in the function it is more profitable + to emit SET_GOT into ABI defined REAL_PIC_OFFSET_TABLE_REGNUM. */ + rtx reg = crtl->profile + ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM) + : pic_offset_table_rtx; + rtx_insn *insn = emit_insn (gen_set_got (reg)); + RTX_FRAME_RELATED_P (insn) = 1; + if (crtl->profile) + emit_move_insn (pic_offset_table_rtx, reg); + add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX); } -private: - bool timode_p; -}; // class pass_stv - -} // anon namespace - -rtl_opt_pass * -make_pass_insert_vzeroupper (gcc::context *ctxt) -{ - return new pass_insert_vzeroupper (ctxt); -} + seq = get_insns (); + end_sequence (); -rtl_opt_pass * -make_pass_stv (gcc::context *ctxt) -{ - return new pass_stv (ctxt); + entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun)); + insert_insn_on_edge (seq, entry_edge); + commit_one_edge_insertion (entry_edge); } -/* Inserting ENDBRANCH instructions. */ +/* Initialize a variable CUM of type CUMULATIVE_ARGS + for a call to a function whose data type is FNTYPE. + For a library call, FNTYPE is 0. */ -static unsigned int -rest_of_insert_endbranch (void) +void +init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */ + tree fntype, /* tree ptr for function decl */ + rtx libname, /* SYMBOL_REF of library name or 0 */ + tree fndecl, + int caller) { - timevar_push (TV_MACH_DEP); + struct cgraph_local_info *i = NULL; + struct cgraph_node *target = NULL; - rtx cet_eb; - rtx_insn *insn; - basic_block bb; + memset (cum, 0, sizeof (*cum)); - /* Currently emit EB if it's a tracking function, i.e. 'nocf_check' is - absent among function attributes. Later an optimization will be - introduced to make analysis if an address of a static function is - taken. A static function whose address is not taken will get a - nocf_check attribute. This will allow to reduce the number of EB. */ - - if (!lookup_attribute ("nocf_check", - TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))) - && (!flag_manual_endbr - || lookup_attribute ("cf_check", - DECL_ATTRIBUTES (cfun->decl))) - && !cgraph_node::get (cfun->decl)->only_called_directly_p ()) - { - /* Queue ENDBR insertion to x86_function_profiler. 
*/ - if (crtl->profile && flag_fentry) - cfun->machine->endbr_queued_at_entrance = true; - else + if (fndecl) + { + target = cgraph_node::get (fndecl); + if (target) { - cet_eb = gen_nop_endbr (); - - bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; - insn = BB_HEAD (bb); - emit_insn_before (cet_eb, insn); + target = target->function_symbol (); + i = cgraph_node::local_info (target->decl); + cum->call_abi = ix86_function_abi (target->decl); } + else + cum->call_abi = ix86_function_abi (fndecl); } + else + cum->call_abi = ix86_function_type_abi (fntype); - bb = 0; - FOR_EACH_BB_FN (bb, cfun) + cum->caller = caller; + + /* Set up the number of registers to use for passing arguments. */ + cum->nregs = ix86_regparm; + if (TARGET_64BIT) { - for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); - insn = NEXT_INSN (insn)) - { - if (CALL_P (insn)) - { - bool need_endbr; - need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL; - if (!need_endbr && !SIBLING_CALL_P (insn)) - { - rtx call = get_call_rtx_from (insn); - rtx fnaddr = XEXP (call, 0); - tree fndecl = NULL_TREE; - - /* Also generate ENDBRANCH for non-tail call which - may return via indirect branch. */ - if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) - fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0)); - if (fndecl == NULL_TREE) - fndecl = MEM_EXPR (fnaddr); - if (fndecl - && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE - && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE) - fndecl = NULL_TREE; - if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl))) - { - tree fntype = TREE_TYPE (fndecl); - if (lookup_attribute ("indirect_return", - TYPE_ATTRIBUTES (fntype))) - need_endbr = true; - } - } - if (!need_endbr) - continue; - /* Generate ENDBRANCH after CALL, which can return more than - twice, setjmp-like functions. */ - - cet_eb = gen_nop_endbr (); - emit_insn_after_setloc (cet_eb, insn, INSN_LOCATION (insn)); - continue; - } - - if (JUMP_P (insn) && flag_cet_switch) - { - rtx target = JUMP_LABEL (insn); - if (target == NULL_RTX || ANY_RETURN_P (target)) - continue; - - /* Check the jump is a switch table. */ - rtx_insn *label = as_a (target); - rtx_insn *table = next_insn (label); - if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table)) - continue; - - /* For the indirect jump find out all places it jumps and insert - ENDBRANCH there. It should be done under a special flag to - control ENDBRANCH generation for switch stmts. */ - edge_iterator ei; - edge e; - basic_block dest_blk; - - FOR_EACH_EDGE (e, ei, bb->succs) - { - rtx_insn *insn; - - dest_blk = e->dest; - insn = BB_HEAD (dest_blk); - gcc_assert (LABEL_P (insn)); - cet_eb = gen_nop_endbr (); - emit_insn_after (cet_eb, insn); - } - continue; - } - - if ((LABEL_P (insn) && LABEL_PRESERVE_P (insn)) - || (NOTE_P (insn) - && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)) - /* TODO. Check /s bit also. */ - { - cet_eb = gen_nop_endbr (); - emit_insn_after (cet_eb, insn); - continue; - } - } - } - - timevar_pop (TV_MACH_DEP); - return 0; -} - -namespace { - -const pass_data pass_data_insert_endbranch = -{ - RTL_PASS, /* type. */ - "cet", /* name. */ - OPTGROUP_NONE, /* optinfo_flags. */ - TV_MACH_DEP, /* tv_id. */ - 0, /* properties_required. */ - 0, /* properties_provided. */ - 0, /* properties_destroyed. */ - 0, /* todo_flags_start. */ - 0, /* todo_flags_finish. 
*/ -}; - -class pass_insert_endbranch : public rtl_opt_pass -{ -public: - pass_insert_endbranch (gcc::context *ctxt) - : rtl_opt_pass (pass_data_insert_endbranch, ctxt) - {} - - /* opt_pass methods: */ - virtual bool gate (function *) - { - return ((flag_cf_protection & CF_BRANCH)); + cum->nregs = (cum->call_abi == SYSV_ABI + ? X86_64_REGPARM_MAX + : X86_64_MS_REGPARM_MAX); } - - virtual unsigned int execute (function *) + if (TARGET_SSE) { - return rest_of_insert_endbranch (); + cum->sse_nregs = SSE_REGPARM_MAX; + if (TARGET_64BIT) + { + cum->sse_nregs = (cum->call_abi == SYSV_ABI + ? X86_64_SSE_REGPARM_MAX + : X86_64_MS_SSE_REGPARM_MAX); + } } + if (TARGET_MMX) + cum->mmx_nregs = MMX_REGPARM_MAX; + cum->warn_avx512f = true; + cum->warn_avx = true; + cum->warn_sse = true; + cum->warn_mmx = true; -}; // class pass_insert_endbranch - -} // anon namespace - -rtl_opt_pass * -make_pass_insert_endbranch (gcc::context *ctxt) -{ - return new pass_insert_endbranch (ctxt); -} - -/* At entry of the nearest common dominator for basic blocks with - conversions, generate a single - vxorps %xmmN, %xmmN, %xmmN - for all - vcvtss2sd op, %xmmN, %xmmX - vcvtsd2ss op, %xmmN, %xmmX - vcvtsi2ss op, %xmmN, %xmmX - vcvtsi2sd op, %xmmN, %xmmX - - NB: We want to generate only a single vxorps to cover the whole - function. The LCM algorithm isn't appropriate here since it may - place a vxorps inside the loop. */ - -static unsigned int -remove_partial_avx_dependency (void) -{ - timevar_push (TV_MACH_DEP); - - bitmap_obstack_initialize (NULL); - bitmap convert_bbs = BITMAP_ALLOC (NULL); - - basic_block bb; - rtx_insn *insn, *set_insn; - rtx set; - rtx v4sf_const0 = NULL_RTX; + /* Because type might mismatch in between caller and callee, we need to + use actual type of function for local calls. + FIXME: cgraph_analyze can be told to actually record if function uses + va_start so for local functions maybe_vaarg can be made aggressive + helping K&R code. + FIXME: once typesytem is fixed, we won't need this code anymore. */ + if (i && i->local && i->can_change_signature) + fntype = TREE_TYPE (target->decl); + cum->stdarg = stdarg_p (fntype); + cum->maybe_vaarg = (fntype + ? (!prototype_p (fntype) || stdarg_p (fntype)) + : !libname); - auto_vec control_flow_insns; + cum->decl = fndecl; - FOR_EACH_BB_FN (bb, cfun) + cum->warn_empty = !warn_abi || cum->stdarg; + if (!cum->warn_empty && fntype) { - FOR_BB_INSNS (bb, insn) + function_args_iterator iter; + tree argtype; + bool seen_empty_type = false; + FOREACH_FUNCTION_ARGS (fntype, argtype, iter) { - if (!NONDEBUG_INSN_P (insn)) - continue; - - set = single_set (insn); - if (!set) - continue; - - if (get_attr_avx_partial_xmm_update (insn) - != AVX_PARTIAL_XMM_UPDATE_TRUE) - continue; - - if (!v4sf_const0) - { - calculate_dominance_info (CDI_DOMINATORS); - df_set_flags (DF_DEFER_INSN_RESCAN); - df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN); - df_md_add_problem (); - df_analyze (); - v4sf_const0 = gen_reg_rtx (V4SFmode); - } - - /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF, - SI -> SF, SI -> DF, DI -> SF, DI -> DF, to vec_dup and - vec_merge with subreg. */ - rtx src = SET_SRC (set); - rtx dest = SET_DEST (set); - machine_mode dest_mode = GET_MODE (dest); - - rtx zero; - machine_mode dest_vecmode; - if (dest_mode == E_SFmode) - { - dest_vecmode = V4SFmode; - zero = v4sf_const0; - } - else - { - dest_vecmode = V2DFmode; - zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0); - } - - /* Change source to vector mode. 
*/ - src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src); - src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero, - GEN_INT (HOST_WIDE_INT_1U)); - /* Change destination to vector mode. */ - rtx vec = gen_reg_rtx (dest_vecmode); - /* Generate an XMM vector SET. */ - set = gen_rtx_SET (vec, src); - set_insn = emit_insn_before (set, insn); - df_insn_rescan (set_insn); - - if (cfun->can_throw_non_call_exceptions) + if (argtype == error_mark_node || VOID_TYPE_P (argtype)) + break; + if (TYPE_EMPTY_P (argtype)) + seen_empty_type = true; + else if (seen_empty_type) { - /* Handle REG_EH_REGION note. */ - rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX); - if (note) - { - control_flow_insns.safe_push (set_insn); - add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0)); - } + cum->warn_empty = true; + break; } - - src = gen_rtx_SUBREG (dest_mode, vec, 0); - set = gen_rtx_SET (dest, src); - - /* Drop possible dead definitions. */ - PATTERN (insn) = set; - - INSN_CODE (insn) = -1; - recog_memoized (insn); - df_insn_rescan (insn); - bitmap_set_bit (convert_bbs, bb->index); } } - if (v4sf_const0) + if (!TARGET_64BIT) { - /* (Re-)discover loops so that bb->loop_father can be used in the - analysis below. */ - loop_optimizer_init (AVOID_CFG_MODIFICATIONS); - - /* Generate a vxorps at entry of the nearest dominator for basic - blocks with conversions, which is in the the fake loop that - contains the whole function, so that there is only a single - vxorps in the whole function. */ - bb = nearest_common_dominator_for_set (CDI_DOMINATORS, - convert_bbs); - while (bb->loop_father->latch - != EXIT_BLOCK_PTR_FOR_FN (cfun)) - bb = get_immediate_dominator (CDI_DOMINATORS, - bb->loop_father->header); - - set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode)); + /* If there are variable arguments, then we won't pass anything + in registers in 32-bit mode. */ + if (stdarg_p (fntype)) + { + cum->nregs = 0; + /* Since in 32-bit, variable arguments are always passed on + stack, there is scratch register available for indirect + sibcall. */ + cfun->machine->arg_reg_available = true; + cum->sse_nregs = 0; + cum->mmx_nregs = 0; + cum->warn_avx512f = false; + cum->warn_avx = false; + cum->warn_sse = false; + cum->warn_mmx = false; + return; + } - insn = BB_HEAD (bb); - while (insn && !NONDEBUG_INSN_P (insn)) + /* Use ecx and edx registers if function has fastcall attribute, + else look for regparm information. */ + if (fntype) { - if (insn == BB_END (bb)) + unsigned int ccvt = ix86_get_callcvt (fntype); + if ((ccvt & IX86_CALLCVT_THISCALL) != 0) { - insn = NULL; - break; + cum->nregs = 1; + cum->fastcall = 1; /* Same first register as in fastcall. */ + } + else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) + { + cum->nregs = 2; + cum->fastcall = 1; } - insn = NEXT_INSN (insn); + else + cum->nregs = ix86_function_regparm (fntype, fndecl); } - if (insn == BB_HEAD (bb)) - set_insn = emit_insn_before (set, insn); - else - set_insn = emit_insn_after (set, - insn ? PREV_INSN (insn) : BB_END (bb)); - df_insn_rescan (set_insn); - df_process_deferred_rescans (); - loop_optimizer_finalize (); - if (!control_flow_insns.is_empty ()) - { - free_dominance_info (CDI_DOMINATORS); - - unsigned int i; - FOR_EACH_VEC_ELT (control_flow_insns, i, insn) - if (control_flow_insn_p (insn)) - { - /* Split the block after insn. There will be a fallthru - edge, which is OK so we keep it. We have to create - the exception edges ourselves. 
*/ - bb = BLOCK_FOR_INSN (insn); - split_block (bb, insn); - rtl_make_eh_edge (NULL, bb, BB_END (bb)); - } - } + /* Set up the number of SSE registers used for passing SFmode + and DFmode arguments. Warn for mismatching ABI. */ + cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true); } - bitmap_obstack_release (NULL); - BITMAP_FREE (convert_bbs); - - timevar_pop (TV_MACH_DEP); - return 0; + cfun->machine->arg_reg_available = (cum->nregs > 0); } -namespace { - -const pass_data pass_data_remove_partial_avx_dependency = -{ - RTL_PASS, /* type */ - "rpad", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ - TV_MACH_DEP, /* tv_id */ - 0, /* properties_required */ - 0, /* properties_provided */ - 0, /* properties_destroyed */ - 0, /* todo_flags_start */ - TODO_df_finish, /* todo_flags_finish */ -}; - -class pass_remove_partial_avx_dependency : public rtl_opt_pass -{ -public: - pass_remove_partial_avx_dependency (gcc::context *ctxt) - : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt) - {} - - /* opt_pass methods: */ - virtual bool gate (function *) - { - return (TARGET_AVX - && TARGET_SSE_PARTIAL_REG_DEPENDENCY - && TARGET_SSE_MATH - && optimize - && optimize_function_for_speed_p (cfun)); - } - - virtual unsigned int execute (function *) - { - return remove_partial_avx_dependency (); - } -}; // class pass_rpad - -} // anon namespace - -rtl_opt_pass * -make_pass_remove_partial_avx_dependency (gcc::context *ctxt) -{ - return new pass_remove_partial_avx_dependency (ctxt); -} +/* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE. + But in the case of vector types, it is some vector mode. -/* Return true if a red-zone is in use. We can't use red-zone when - there are local indirect jumps, like "indirect_jump" or "tablejump", - which jumps to another place in the function, since "call" in the - indirect thunk pushes the return address onto stack, destroying - red-zone. + When we have only some of our vector isa extensions enabled, then there + are some modes for which vector_mode_supported_p is false. For these + modes, the generic vector support in gcc will choose some non-vector mode + in order to implement the type. By computing the natural mode, we'll + select the proper ABI location for the operand and not depend on whatever + the middle-end decides to do with these vector types. - TODO: If we can reserve the first 2 WORDs, for PUSH and, another - for CALL, in red-zone, we can allow local indirect jumps with - indirect thunk. */ + The midde-end can't deal with the vector types > 16 bytes. In this + case, we return the original mode and warn ABI change if CUM isn't + NULL. -bool -ix86_using_red_zone (void) -{ - return (TARGET_RED_ZONE - && !TARGET_64BIT_MS_ABI - && (!cfun->machine->has_local_indirect_jump - || cfun->machine->indirect_branch_type == indirect_branch_keep)); -} - -/* Return a string that documents the current -m options. The caller is - responsible for freeing the string. */ + If INT_RETURN is true, warn ABI change if the vector mode isn't + available for function return value. 
*/ -static char * -ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2, - int flags, int flags2, - const char *arch, const char *tune, - enum fpmath_unit fpmath, bool add_nl_p, bool add_abi_p) +static machine_mode +type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum, + bool in_return) { - struct ix86_target_opts - { - const char *option; /* option string */ - HOST_WIDE_INT mask; /* isa mask options */ - }; - - /* This table is ordered so that options like -msse4.2 that imply other - ISAs come first. Target string will be displayed in the same order. */ - static struct ix86_target_opts isa2_opts[] = - { - { "-mcx16", OPTION_MASK_ISA_CX16 }, - { "-mvaes", OPTION_MASK_ISA_VAES }, - { "-mrdpid", OPTION_MASK_ISA_RDPID }, - { "-mpconfig", OPTION_MASK_ISA_PCONFIG }, - { "-mwbnoinvd", OPTION_MASK_ISA_WBNOINVD }, - { "-msgx", OPTION_MASK_ISA_SGX }, - { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW }, - { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS }, - { "-mhle", OPTION_MASK_ISA_HLE }, - { "-mmovbe", OPTION_MASK_ISA_MOVBE }, - { "-mclzero", OPTION_MASK_ISA_CLZERO }, - { "-mmwaitx", OPTION_MASK_ISA_MWAITX }, - { "-mmovdir64b", OPTION_MASK_ISA_MOVDIR64B }, - { "-mwaitpkg", OPTION_MASK_ISA_WAITPKG }, - { "-mcldemote", OPTION_MASK_ISA_CLDEMOTE }, - { "-mptwrite", OPTION_MASK_ISA_PTWRITE } - }; - static struct ix86_target_opts isa_opts[] = - { - { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ }, - { "-mavx512bitalg", OPTION_MASK_ISA_AVX512BITALG }, - { "-mvpclmulqdq", OPTION_MASK_ISA_VPCLMULQDQ }, - { "-mgfni", OPTION_MASK_ISA_GFNI }, - { "-mavx512vnni", OPTION_MASK_ISA_AVX512VNNI }, - { "-mavx512vbmi2", OPTION_MASK_ISA_AVX512VBMI2 }, - { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI }, - { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA }, - { "-mavx512vl", OPTION_MASK_ISA_AVX512VL }, - { "-mavx512bw", OPTION_MASK_ISA_AVX512BW }, - { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ }, - { "-mavx512er", OPTION_MASK_ISA_AVX512ER }, - { "-mavx512pf", OPTION_MASK_ISA_AVX512PF }, - { "-mavx512cd", OPTION_MASK_ISA_AVX512CD }, - { "-mavx512f", OPTION_MASK_ISA_AVX512F }, - { "-mavx2", OPTION_MASK_ISA_AVX2 }, - { "-mfma", OPTION_MASK_ISA_FMA }, - { "-mxop", OPTION_MASK_ISA_XOP }, - { "-mfma4", OPTION_MASK_ISA_FMA4 }, - { "-mf16c", OPTION_MASK_ISA_F16C }, - { "-mavx", OPTION_MASK_ISA_AVX }, -/* { "-msse4" OPTION_MASK_ISA_SSE4 }, */ - { "-msse4.2", OPTION_MASK_ISA_SSE4_2 }, - { "-msse4.1", OPTION_MASK_ISA_SSE4_1 }, - { "-msse4a", OPTION_MASK_ISA_SSE4A }, - { "-mssse3", OPTION_MASK_ISA_SSSE3 }, - { "-msse3", OPTION_MASK_ISA_SSE3 }, - { "-maes", OPTION_MASK_ISA_AES }, - { "-msha", OPTION_MASK_ISA_SHA }, - { "-mpclmul", OPTION_MASK_ISA_PCLMUL }, - { "-msse2", OPTION_MASK_ISA_SSE2 }, - { "-msse", OPTION_MASK_ISA_SSE }, - { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A }, - { "-m3dnow", OPTION_MASK_ISA_3DNOW }, - { "-mmmx", OPTION_MASK_ISA_MMX }, - { "-mrtm", OPTION_MASK_ISA_RTM }, - { "-mprfchw", OPTION_MASK_ISA_PRFCHW }, - { "-mrdseed", OPTION_MASK_ISA_RDSEED }, - { "-madx", OPTION_MASK_ISA_ADX }, - { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 }, - { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT }, - { "-mxsaves", OPTION_MASK_ISA_XSAVES }, - { "-mxsavec", OPTION_MASK_ISA_XSAVEC }, - { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT }, - { "-mxsave", OPTION_MASK_ISA_XSAVE }, - { "-mabm", OPTION_MASK_ISA_ABM }, - { "-mbmi", OPTION_MASK_ISA_BMI }, - { "-mbmi2", OPTION_MASK_ISA_BMI2 }, - { "-mlzcnt", OPTION_MASK_ISA_LZCNT }, - { "-mtbm", OPTION_MASK_ISA_TBM }, - { "-mpopcnt", OPTION_MASK_ISA_POPCNT }, - { "-msahf", 
OPTION_MASK_ISA_SAHF }, - { "-mcrc32", OPTION_MASK_ISA_CRC32 }, - { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE }, - { "-mrdrnd", OPTION_MASK_ISA_RDRND }, - { "-mpku", OPTION_MASK_ISA_PKU }, - { "-mlwp", OPTION_MASK_ISA_LWP }, - { "-mfxsr", OPTION_MASK_ISA_FXSR }, - { "-mclwb", OPTION_MASK_ISA_CLWB }, - { "-mshstk", OPTION_MASK_ISA_SHSTK }, - { "-mmovdiri", OPTION_MASK_ISA_MOVDIRI } - }; - - /* Flag options. */ - static struct ix86_target_opts flag_opts[] = - { - { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE }, - { "-mlong-double-128", MASK_LONG_DOUBLE_128 }, - { "-mlong-double-64", MASK_LONG_DOUBLE_64 }, - { "-m80387", MASK_80387 }, - { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS }, - { "-malign-double", MASK_ALIGN_DOUBLE }, - { "-mcld", MASK_CLD }, - { "-mfp-ret-in-387", MASK_FLOAT_RETURNS }, - { "-mieee-fp", MASK_IEEE_FP }, - { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS }, - { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY }, - { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT }, - { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS }, - { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 }, - { "-mno-push-args", MASK_NO_PUSH_ARGS }, - { "-mno-red-zone", MASK_NO_RED_ZONE }, - { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER }, - { "-mrecip", MASK_RECIP }, - { "-mrtd", MASK_RTD }, - { "-msseregparm", MASK_SSEREGPARM }, - { "-mstack-arg-probe", MASK_STACK_PROBE }, - { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS }, - { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS }, - { "-m8bit-idiv", MASK_USE_8BIT_IDIV }, - { "-mvzeroupper", MASK_VZEROUPPER }, - { "-mstv", MASK_STV }, - { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD }, - { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE }, - { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES } - }; - - /* Additional flag options. */ - static struct ix86_target_opts flag2_opts[] = - { - { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY } - }; - - const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts) - + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2]; - - char isa_other[40]; - char isa2_other[40]; - char flags_other[40]; - char flags2_other[40]; - unsigned num = 0; - unsigned i, j; - char *ret; - char *ptr; - size_t len; - size_t line_len; - size_t sep_len; - const char *abi; - - memset (opts, '\0', sizeof (opts)); + machine_mode mode = TYPE_MODE (type); - /* Add -march= option. */ - if (arch) + if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode)) { - opts[num][0] = "-march="; - opts[num++][1] = arch; - } + HOST_WIDE_INT size = int_size_in_bytes (type); + if ((size == 8 || size == 16 || size == 32 || size == 64) + /* ??? Generic code allows us to create width 1 vectors. Ignore. */ + && TYPE_VECTOR_SUBPARTS (type) > 1) + { + machine_mode innermode = TYPE_MODE (TREE_TYPE (type)); - /* Add -mtune= option. */ - if (tune) - { - opts[num][0] = "-mtune="; - opts[num++][1] = tune; - } + /* There are no XFmode vector modes. */ + if (innermode == XFmode) + return mode; - /* Add -m32/-m64/-mx32. */ - if (add_abi_p) - { - if ((isa & OPTION_MASK_ISA_64BIT) != 0) - { - if ((isa & OPTION_MASK_ABI_64) != 0) - abi = "-m64"; + if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE) + mode = MIN_MODE_VECTOR_FLOAT; else - abi = "-mx32"; - } - else - abi = "-m32"; - opts[num++][0] = abi; - } - isa &= ~(OPTION_MASK_ISA_64BIT | OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32); + mode = MIN_MODE_VECTOR_INT; - /* Pick out the options in isa2 options. 
*/ - for (i = 0; i < ARRAY_SIZE (isa2_opts); i++) - { - if ((isa2 & isa2_opts[i].mask) != 0) - { - opts[num++][0] = isa2_opts[i].option; - isa2 &= ~ isa2_opts[i].mask; - } - } - - if (isa2 && add_nl_p) - { - opts[num++][0] = isa2_other; - sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2); - } - - /* Pick out the options in isa options. */ - for (i = 0; i < ARRAY_SIZE (isa_opts); i++) - { - if ((isa & isa_opts[i].mask) != 0) - { - opts[num++][0] = isa_opts[i].option; - isa &= ~ isa_opts[i].mask; - } - } - - if (isa && add_nl_p) - { - opts[num++][0] = isa_other; - sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa); - } - - /* Add flag options. */ - for (i = 0; i < ARRAY_SIZE (flag_opts); i++) - { - if ((flags & flag_opts[i].mask) != 0) - { - opts[num++][0] = flag_opts[i].option; - flags &= ~ flag_opts[i].mask; - } - } + /* Get the mode which has this inner mode and number of units. */ + FOR_EACH_MODE_FROM (mode, mode) + if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type) + && GET_MODE_INNER (mode) == innermode) + { + if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU) + { + static bool warnedavx512f; + static bool warnedavx512f_ret; - if (flags && add_nl_p) - { - opts[num++][0] = flags_other; - sprintf (flags_other, "(other flags: %#x)", flags); - } + if (cum && cum->warn_avx512f && !warnedavx512f) + { + if (warning (OPT_Wpsabi, "AVX512F vector argument " + "without AVX512F enabled changes the ABI")) + warnedavx512f = true; + } + else if (in_return && !warnedavx512f_ret) + { + if (warning (OPT_Wpsabi, "AVX512F vector return " + "without AVX512F enabled changes the ABI")) + warnedavx512f_ret = true; + } - /* Add additional flag options. */ - for (i = 0; i < ARRAY_SIZE (flag2_opts); i++) - { - if ((flags2 & flag2_opts[i].mask) != 0) - { - opts[num++][0] = flag2_opts[i].option; - flags2 &= ~ flag2_opts[i].mask; - } - } + return TYPE_MODE (type); + } + else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU) + { + static bool warnedavx; + static bool warnedavx_ret; - if (flags2 && add_nl_p) - { - opts[num++][0] = flags2_other; - sprintf (flags2_other, "(other flags2: %#x)", flags2); - } + if (cum && cum->warn_avx && !warnedavx) + { + if (warning (OPT_Wpsabi, "AVX vector argument " + "without AVX enabled changes the ABI")) + warnedavx = true; + } + else if (in_return && !warnedavx_ret) + { + if (warning (OPT_Wpsabi, "AVX vector return " + "without AVX enabled changes the ABI")) + warnedavx_ret = true; + } - /* Add -fpmath= option. 
*/ - if (fpmath) - { - opts[num][0] = "-mfpmath="; - switch ((int) fpmath) - { - case FPMATH_387: - opts[num++][1] = "387"; - break; + return TYPE_MODE (type); + } + else if (((size == 8 && TARGET_64BIT) || size == 16) + && !TARGET_SSE + && !TARGET_IAMCU) + { + static bool warnedsse; + static bool warnedsse_ret; - case FPMATH_SSE: - opts[num++][1] = "sse"; - break; + if (cum && cum->warn_sse && !warnedsse) + { + if (warning (OPT_Wpsabi, "SSE vector argument " + "without SSE enabled changes the ABI")) + warnedsse = true; + } + else if (!TARGET_64BIT && in_return && !warnedsse_ret) + { + if (warning (OPT_Wpsabi, "SSE vector return " + "without SSE enabled changes the ABI")) + warnedsse_ret = true; + } + } + else if ((size == 8 && !TARGET_64BIT) + && (!cfun + || cfun->machine->func_type == TYPE_NORMAL) + && !TARGET_MMX + && !TARGET_IAMCU) + { + static bool warnedmmx; + static bool warnedmmx_ret; - case FPMATH_387 | FPMATH_SSE: - opts[num++][1] = "sse+387"; - break; + if (cum && cum->warn_mmx && !warnedmmx) + { + if (warning (OPT_Wpsabi, "MMX vector argument " + "without MMX enabled changes the ABI")) + warnedmmx = true; + } + else if (in_return && !warnedmmx_ret) + { + if (warning (OPT_Wpsabi, "MMX vector return " + "without MMX enabled changes the ABI")) + warnedmmx_ret = true; + } + } + return mode; + } - default: gcc_unreachable (); } } - /* Any options? */ - if (num == 0) - return NULL; - - gcc_assert (num < ARRAY_SIZE (opts)); - - /* Size the string. */ - len = 0; - sep_len = (add_nl_p) ? 3 : 1; - for (i = 0; i < num; i++) - { - len += sep_len; - for (j = 0; j < 2; j++) - if (opts[i][j]) - len += strlen (opts[i][j]); - } - - /* Build the string. */ - ret = ptr = (char *) xmalloc (len); - line_len = 0; - - for (i = 0; i < num; i++) - { - size_t len2[2]; - - for (j = 0; j < 2; j++) - len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0; - - if (i != 0) - { - *ptr++ = ' '; - line_len++; - - if (add_nl_p && line_len + len2[0] + len2[1] > 70) - { - *ptr++ = '\\'; - *ptr++ = '\n'; - line_len = 0; - } - } - - for (j = 0; j < 2; j++) - if (opts[i][j]) - { - memcpy (ptr, opts[i][j], len2[j]); - ptr += len2[j]; - line_len += len2[j]; - } - } - - *ptr = '\0'; - gcc_assert (ret + len >= ptr); - - return ret; + return mode; } -/* Return true, if profiling code should be emitted before - prologue. Otherwise it returns false. - Note: For x86 with "hotfix" it is sorried. */ -static bool -ix86_profile_before_prologue (void) -{ - return flag_fentry != 0; -} +/* We want to pass a value in REGNO whose "natural" mode is MODE. However, + this may not agree with the mode that the type system has chosen for the + register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can + go ahead and use it. Otherwise we have to build a PARALLEL instead. */ -/* Function that is callable from the debugger to print the current - options. 
*/ -void ATTRIBUTE_UNUSED -ix86_debug_options (void) +static rtx +gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode, + unsigned int regno) { - char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2, - target_flags, ix86_target_flags, - ix86_arch_string,ix86_tune_string, - ix86_fpmath, true, true); + rtx tmp; - if (opts) + if (orig_mode != BLKmode) + tmp = gen_rtx_REG (orig_mode, regno); + else { - fprintf (stderr, "%s\n\n", opts); - free (opts); + tmp = gen_rtx_REG (mode, regno); + tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx); + tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp)); } - else - fputs ("\n\n", stderr); - return; + return tmp; } -static const char *stringop_alg_names[] = { -#define DEF_ENUM -#define DEF_ALG(alg, name) #name, -#include "stringop.def" -#undef DEF_ENUM -#undef DEF_ALG -}; +/* x86-64 register passing implementation. See x86-64 ABI for details. Goal + of this code is to classify each 8bytes of incoming argument by the register + class and assign registers accordingly. */ -/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=. - The string is of the following form (or comma separated list of it): +/* Return the union class of CLASS1 and CLASS2. + See the x86-64 PS ABI for details. */ - strategy_alg:max_size:[align|noalign] +static enum x86_64_reg_class +merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2) +{ + /* Rule #1: If both classes are equal, this is the resulting class. */ + if (class1 == class2) + return class1; - where the full size range for the strategy is either [0, max_size] or - [min_size, max_size], in which min_size is the max_size + 1 of the - preceding range. The last size range must have max_size == -1. + /* Rule #2: If one of the classes is NO_CLASS, the resulting class is + the other class. */ + if (class1 == X86_64_NO_CLASS) + return class2; + if (class2 == X86_64_NO_CLASS) + return class1; - Examples: + /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */ + if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS) + return X86_64_MEMORY_CLASS; - 1. - -mmemcpy-strategy=libcall:-1:noalign + /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */ + if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS) + || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS)) + return X86_64_INTEGERSI_CLASS; + if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS + || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS) + return X86_64_INTEGER_CLASS; - this is equivalent to (for known size memcpy) -mstringop-strategy=libcall + /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class, + MEMORY is used. */ + if (class1 == X86_64_X87_CLASS + || class1 == X86_64_X87UP_CLASS + || class1 == X86_64_COMPLEX_X87_CLASS + || class2 == X86_64_X87_CLASS + || class2 == X86_64_X87UP_CLASS + || class2 == X86_64_COMPLEX_X87_CLASS) + return X86_64_MEMORY_CLASS; + /* Rule #6: Otherwise class SSE is used. */ + return X86_64_SSE_CLASS; +} - 2. - -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign +/* Classify the argument of type TYPE and mode MODE. + CLASSES will be filled by the register class used to pass each word + of the operand. The number of words is returned. In case the parameter + should be passed in memory, 0 is returned. As a special case for zero + sized containers, classes[0] will be NO_CLASS and 1 is returned. 
- This is to tell the compiler to use the following strategy for memset - 1) when the expected size is between [1, 16], use rep_8byte strategy; - 2) when the size is between [17, 2048], use vector_loop; - 3) when the size is > 2048, use libcall. */ + BIT_OFFSET is used internally for handling records and specifies offset + of the offset in bits modulo 512 to avoid overflow cases. -struct stringop_size_range -{ - int max; - stringop_alg alg; - bool noalign; -}; + See the x86-64 PS ABI for details. +*/ -static void -ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) +static int +classify_argument (machine_mode mode, const_tree type, + enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset) { - const struct stringop_algs *default_algs; - stringop_size_range input_ranges[MAX_STRINGOP_ALGS]; - char *curr_range_str, *next_range_str; - const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="; - int i = 0, n = 0; + HOST_WIDE_INT bytes + = mode == BLKmode ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode); + int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD); - if (is_memset) - default_algs = &ix86_cost->memset[TARGET_64BIT != 0]; - else - default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; + /* Variable sized entities are always passed/returned in memory. */ + if (bytes < 0) + return 0; - curr_range_str = strategy_str; + if (mode != VOIDmode + && targetm.calls.must_pass_in_stack (mode, type)) + return 0; - do + if (type && AGGREGATE_TYPE_P (type)) { - int maxs; - char alg_name[128]; - char align[16]; - next_range_str = strchr (curr_range_str, ','); - if (next_range_str) - *next_range_str++ = '\0'; - - if (sscanf (curr_range_str, "%20[^:]:%d:%10s", alg_name, &maxs, - align) != 3) - { - error ("wrong argument %qs to option %qs", curr_range_str, opt); - return; - } - - if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1)) - { - error ("size ranges of option %qs should be increasing", opt); - return; - } + int i; + tree field; + enum x86_64_reg_class subclasses[MAX_CLASSES]; - for (i = 0; i < last_alg; i++) - if (!strcmp (alg_name, stringop_alg_names[i])) - break; + /* On x86-64 we pass structures larger than 64 bytes on the stack. */ + if (bytes > 64) + return 0; - if (i == last_alg) - { - error ("wrong strategy name %qs specified for option %qs", - alg_name, opt); - - auto_vec candidates; - for (i = 0; i < last_alg; i++) - if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT) - candidates.safe_push (stringop_alg_names[i]); - - char *s; - const char *hint - = candidates_list_and_hint (alg_name, s, candidates); - if (hint) - inform (input_location, - "valid arguments to %qs are: %s; did you mean %qs?", - opt, s, hint); - else - inform (input_location, "valid arguments to %qs are: %s", - opt, s); - XDELETEVEC (s); - return; - } + for (i = 0; i < words; i++) + classes[i] = X86_64_NO_CLASS; - if ((stringop_alg) i == rep_prefix_8_byte - && !TARGET_64BIT) + /* Zero sized arrays or structures are NO_CLASS. We return 0 to + signalize memory class, so handle it as special case. */ + if (!words) { - /* rep; movq isn't available in 32-bit code. 
*/ - error ("strategy name %qs specified for option %qs " - "not supported for 32-bit code", alg_name, opt); - return; + classes[0] = X86_64_NO_CLASS; + return 1; } - input_ranges[n].max = maxs; - input_ranges[n].alg = (stringop_alg) i; - if (!strcmp (align, "align")) - input_ranges[n].noalign = false; - else if (!strcmp (align, "noalign")) - input_ranges[n].noalign = true; - else - { - error ("unknown alignment %qs specified for option %qs", align, opt); - return; - } - n++; - curr_range_str = next_range_str; - } - while (curr_range_str); - - if (input_ranges[n - 1].max != -1) - { - error ("the max value for the last size range should be -1" - " for option %qs", opt); - return; - } - - if (n > MAX_STRINGOP_ALGS) - { - error ("too many size ranges specified in option %qs", opt); - return; - } - - /* Now override the default algs array. */ - for (i = 0; i < n; i++) - { - *const_cast(&default_algs->size[i].max) = input_ranges[i].max; - *const_cast(&default_algs->size[i].alg) - = input_ranges[i].alg; - *const_cast(&default_algs->size[i].noalign) - = input_ranges[i].noalign; - } -} - - -/* parse -mtune-ctrl= option. When DUMP is true, - print the features that are explicitly set. */ + /* Classify each field of record and merge classes. */ + switch (TREE_CODE (type)) + { + case RECORD_TYPE: + /* And now merge the fields of structure. */ + for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) + { + if (TREE_CODE (field) == FIELD_DECL) + { + int num; -static void -parse_mtune_ctrl_str (bool dump) -{ - if (!ix86_tune_ctrl_string) - return; + if (TREE_TYPE (field) == error_mark_node) + continue; - char *next_feature_string = NULL; - char *curr_feature_string = xstrdup (ix86_tune_ctrl_string); - char *orig = curr_feature_string; - int i; - do - { - bool clear = false; + /* Bitfields are always classified as integer. Handle them + early, since later code would consider them to be + misaligned integers. */ + if (DECL_BIT_FIELD (field)) + { + for (i = (int_bit_position (field) + + (bit_offset % 64)) / 8 / 8; + i < ((int_bit_position (field) + (bit_offset % 64)) + + tree_to_shwi (DECL_SIZE (field)) + + 63) / 8 / 8; i++) + classes[i] + = merge_classes (X86_64_INTEGER_CLASS, classes[i]); + } + else + { + int pos; - next_feature_string = strchr (curr_feature_string, ','); - if (next_feature_string) - *next_feature_string++ = '\0'; - if (*curr_feature_string == '^') - { - curr_feature_string++; - clear = true; - } - for (i = 0; i < X86_TUNE_LAST; i++) - { - if (!strcmp (curr_feature_string, ix86_tune_feature_names[i])) - { - ix86_tune_features[i] = !clear; - if (dump) - fprintf (stderr, "Explicitly %s feature %s\n", - clear ? "clear" : "set", ix86_tune_feature_names[i]); - break; - } - } - if (i == X86_TUNE_LAST) - error ("unknown parameter to option %<-mtune-ctrl%>: %s", - clear ? curr_feature_string - 1 : curr_feature_string); - curr_feature_string = next_feature_string; - } - while (curr_feature_string); - free (orig); -} + type = TREE_TYPE (field); -/* Helper function to set ix86_tune_features. IX86_TUNE is the - processor type. */ + /* Flexible array member is ignored. 
*/ + if (TYPE_MODE (type) == BLKmode + && TREE_CODE (type) == ARRAY_TYPE + && TYPE_SIZE (type) == NULL_TREE + && TYPE_DOMAIN (type) != NULL_TREE + && (TYPE_MAX_VALUE (TYPE_DOMAIN (type)) + == NULL_TREE)) + { + static bool warned; -static void -set_ix86_tune_features (enum processor_type ix86_tune, bool dump) -{ - unsigned HOST_WIDE_INT ix86_tune_mask = HOST_WIDE_INT_1U << ix86_tune; - int i; + if (!warned && warn_psabi) + { + warned = true; + inform (input_location, + "the ABI of passing struct with" + " a flexible array member has" + " changed in GCC 4.4"); + } + continue; + } + num = classify_argument (TYPE_MODE (type), type, + subclasses, + (int_bit_position (field) + + bit_offset) % 512); + if (!num) + return 0; + pos = (int_bit_position (field) + + (bit_offset % 64)) / 8 / 8; + for (i = 0; i < num && (i + pos) < words; i++) + classes[i + pos] + = merge_classes (subclasses[i], classes[i + pos]); + } + } + } + break; - for (i = 0; i < X86_TUNE_LAST; ++i) - { - if (ix86_tune_no_default) - ix86_tune_features[i] = 0; - else - ix86_tune_features[i] - = !!(initial_ix86_tune_features[i] & ix86_tune_mask); - } + case ARRAY_TYPE: + /* Arrays are handled as small records. */ + { + int num; + num = classify_argument (TYPE_MODE (TREE_TYPE (type)), + TREE_TYPE (type), subclasses, bit_offset); + if (!num) + return 0; - if (dump) - { - fprintf (stderr, "List of x86 specific tuning parameter names:\n"); - for (i = 0; i < X86_TUNE_LAST; i++) - fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i], - ix86_tune_features[i] ? "on" : "off"); - } + /* The partial classes are now full classes. */ + if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4) + subclasses[0] = X86_64_SSE_CLASS; + if (subclasses[0] == X86_64_INTEGERSI_CLASS + && !((bit_offset % 64) == 0 && bytes == 4)) + subclasses[0] = X86_64_INTEGER_CLASS; - parse_mtune_ctrl_str (dump); -} + for (i = 0; i < words; i++) + classes[i] = subclasses[i % num]; + break; + } + case UNION_TYPE: + case QUAL_UNION_TYPE: + /* Unions are similar to RECORD_TYPE but offset is always 0. + */ + for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) + { + if (TREE_CODE (field) == FIELD_DECL) + { + int num; -/* Default align_* from the processor table. */ + if (TREE_TYPE (field) == error_mark_node) + continue; -static void -ix86_default_align (struct gcc_options *opts) -{ - /* -falign-foo without argument: supply one. */ - if (opts->x_flag_align_loops && !opts->x_str_align_loops) - opts->x_str_align_loops = processor_cost_table[ix86_tune]->align_loop; - if (opts->x_flag_align_jumps && !opts->x_str_align_jumps) - opts->x_str_align_jumps = processor_cost_table[ix86_tune]->align_jump; - if (opts->x_flag_align_labels && !opts->x_str_align_labels) - opts->x_str_align_labels = processor_cost_table[ix86_tune]->align_label; - if (opts->x_flag_align_functions && !opts->x_str_align_functions) - opts->x_str_align_functions = processor_cost_table[ix86_tune]->align_func; -} + num = classify_argument (TYPE_MODE (TREE_TYPE (field)), + TREE_TYPE (field), subclasses, + bit_offset); + if (!num) + return 0; + for (i = 0; i < num && i < words; i++) + classes[i] = merge_classes (subclasses[i], classes[i]); + } + } + break; -/* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. 
*/ + default: + gcc_unreachable (); + } -static void -ix86_override_options_after_change (void) -{ - ix86_default_align (&global_options); -} + if (words > 2) + { + /* When size > 16 bytes, if the first one isn't + X86_64_SSE_CLASS or any other ones aren't + X86_64_SSEUP_CLASS, everything should be passed in + memory. */ + if (classes[0] != X86_64_SSE_CLASS) + return 0; + for (i = 1; i < words; i++) + if (classes[i] != X86_64_SSEUP_CLASS) + return 0; + } + /* Final merger cleanup. */ + for (i = 0; i < words; i++) + { + /* If one class is MEMORY, everything should be passed in + memory. */ + if (classes[i] == X86_64_MEMORY_CLASS) + return 0; -/* Override various settings based on options. If MAIN_ARGS_P, the - options are from the command line, otherwise they are from - attributes. Return true if there's an error related to march - option. */ + /* The X86_64_SSEUP_CLASS should be always preceded by + X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */ + if (classes[i] == X86_64_SSEUP_CLASS + && classes[i - 1] != X86_64_SSE_CLASS + && classes[i - 1] != X86_64_SSEUP_CLASS) + { + /* The first one should never be X86_64_SSEUP_CLASS. */ + gcc_assert (i != 0); + classes[i] = X86_64_SSE_CLASS; + } -static bool -ix86_option_override_internal (bool main_args_p, - struct gcc_options *opts, - struct gcc_options *opts_set) -{ - int i; - unsigned HOST_WIDE_INT ix86_arch_mask; - const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL); + /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS, + everything should be passed in memory. */ + if (classes[i] == X86_64_X87UP_CLASS + && (classes[i - 1] != X86_64_X87_CLASS)) + { + static bool warned; - /* -mrecip options. */ - static struct - { - const char *string; /* option name */ - unsigned int mask; /* mask bits to set */ + /* The first one should never be X86_64_X87UP_CLASS. */ + gcc_assert (i != 0); + if (!warned && warn_psabi) + { + warned = true; + inform (input_location, + "the ABI of passing union with long double" + " has changed in GCC 4.4"); + } + return 0; + } + } + return words; } - const recip_options[] = - { - { "all", RECIP_MASK_ALL }, - { "none", RECIP_MASK_NONE }, - { "div", RECIP_MASK_DIV }, - { "sqrt", RECIP_MASK_SQRT }, - { "vec-div", RECIP_MASK_VEC_DIV }, - { "vec-sqrt", RECIP_MASK_VEC_SQRT }, - }; - - /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if - TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */ - if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32); -#ifdef TARGET_BI_ARCH - else + /* Compute alignment needed. We align all types to natural boundaries with + exception of XFmode that is aligned to 64bits. */ + if (mode != VOIDmode && mode != BLKmode) { -#if TARGET_BI_ARCH == 1 - /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64 - is on and OPTION_MASK_ABI_X32 is off. We turn off - OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by - -mx32. */ - if (TARGET_X32_P (opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64; -#else - /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is - on and OPTION_MASK_ABI_64 is off. We turn off - OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by - -m64 or OPTION_MASK_CODE16 is turned on by -m16. 
*/ - if (TARGET_LP64_P (opts->x_ix86_isa_flags) - || TARGET_16BIT_P (opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32; -#endif - if (TARGET_64BIT_P (opts->x_ix86_isa_flags) - && TARGET_IAMCU_P (opts->x_target_flags)) - sorry ("Intel MCU psABI isn%'t supported in %s mode", - TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit"); - } -#endif + int mode_alignment = GET_MODE_BITSIZE (mode); - if (TARGET_X32_P (opts->x_ix86_isa_flags)) - { - /* Always turn on OPTION_MASK_ISA_64BIT and turn off - OPTION_MASK_ABI_64 for TARGET_X32. */ - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT; - opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64; - } - else if (TARGET_16BIT_P (opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT - | OPTION_MASK_ABI_X32 - | OPTION_MASK_ABI_64); - else if (TARGET_LP64_P (opts->x_ix86_isa_flags)) - { - /* Always turn on OPTION_MASK_ISA_64BIT and turn off - OPTION_MASK_ABI_X32 for TARGET_LP64. */ - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT; - opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32; + if (mode == XFmode) + mode_alignment = 128; + else if (mode == XCmode) + mode_alignment = 256; + if (COMPLEX_MODE_P (mode)) + mode_alignment /= 2; + /* Misaligned fields are always returned in memory. */ + if (bit_offset % mode_alignment) + return 0; } -#ifdef SUBTARGET_OVERRIDE_OPTIONS - SUBTARGET_OVERRIDE_OPTIONS; -#endif - -#ifdef SUBSUBTARGET_OVERRIDE_OPTIONS - SUBSUBTARGET_OVERRIDE_OPTIONS; -#endif - - /* -fPIC is the default for x86_64. */ - if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags)) - opts->x_flag_pic = 2; + /* for V1xx modes, just use the base mode */ + if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode + && GET_MODE_UNIT_SIZE (mode) == bytes) + mode = GET_MODE_INNER (mode); - /* Need to check -mtune=generic first. */ - if (opts->x_ix86_tune_string) - { - /* As special support for cross compilers we read -mtune=native - as -mtune=generic. With native compilers we won't see the - -mtune=native, as it was changed by the driver. */ - if (!strcmp (opts->x_ix86_tune_string, "native")) - { - opts->x_ix86_tune_string = "generic"; - } - else if (!strcmp (opts->x_ix86_tune_string, "x86-64")) - warning (OPT_Wdeprecated, - main_args_p - ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> " - "or %<-mtune=generic%> instead as appropriate") - : G_("% is deprecated; use " - "% or %" - " instead as appropriate")); - } - else + /* Classification of atomic types. */ + switch (mode) { - if (opts->x_ix86_arch_string) - opts->x_ix86_tune_string = opts->x_ix86_arch_string; - if (!opts->x_ix86_tune_string) - { - opts->x_ix86_tune_string = processor_names[TARGET_CPU_DEFAULT]; - ix86_tune_defaulted = 1; - } + case E_SDmode: + case E_DDmode: + classes[0] = X86_64_SSE_CLASS; + return 1; + case E_TDmode: + classes[0] = X86_64_SSE_CLASS; + classes[1] = X86_64_SSEUP_CLASS; + return 2; + case E_DImode: + case E_SImode: + case E_HImode: + case E_QImode: + case E_CSImode: + case E_CHImode: + case E_CQImode: + { + int size = bit_offset + (int) GET_MODE_BITSIZE (mode); - /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string - or defaulted. We need to use a sensible tune option. */ - if (!strcmp (opts->x_ix86_tune_string, "x86-64")) - { - opts->x_ix86_tune_string = "generic"; - } - } - - if (opts->x_ix86_stringop_alg == rep_prefix_8_byte - && !TARGET_64BIT_P (opts->x_ix86_isa_flags)) - { - /* rep; movq isn't available in 32-bit code. 
*/ - error ("%<-mstringop-strategy=rep_8byte%> not supported for 32-bit code"); - opts->x_ix86_stringop_alg = no_stringop; - } - - if (!opts->x_ix86_arch_string) - opts->x_ix86_arch_string - = TARGET_64BIT_P (opts->x_ix86_isa_flags) - ? "x86-64" : SUBTARGET32_DEFAULT_CPU; - else - ix86_arch_specified = 1; - - if (opts_set->x_ix86_pmode) - { - if ((TARGET_LP64_P (opts->x_ix86_isa_flags) - && opts->x_ix86_pmode == PMODE_SI) - || (!TARGET_64BIT_P (opts->x_ix86_isa_flags) - && opts->x_ix86_pmode == PMODE_DI)) - error ("address mode %qs not supported in the %s bit mode", - TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long", - TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32"); - } - else - opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags) - ? PMODE_DI : PMODE_SI; - - if (!opts_set->x_ix86_abi) - opts->x_ix86_abi = DEFAULT_ABI; - - if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags)) - error ("%<-mabi=ms%> not supported with X32 ABI"); - gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI); - - if ((opts->x_flag_sanitize & SANITIZE_USER_ADDRESS) && opts->x_ix86_abi == MS_ABI) - error ("%<-mabi=ms%> not supported with %<-fsanitize=address%>"); - if ((opts->x_flag_sanitize & SANITIZE_KERNEL_ADDRESS) && opts->x_ix86_abi == MS_ABI) - error ("%<-mabi=ms%> not supported with %<-fsanitize=kernel-address%>"); - if ((opts->x_flag_sanitize & SANITIZE_THREAD) && opts->x_ix86_abi == MS_ABI) - error ("%<-mabi=ms%> not supported with %<-fsanitize=thread%>"); - - /* For targets using ms ABI enable ms-extensions, if not - explicit turned off. For non-ms ABI we turn off this - option. */ - if (!opts_set->x_flag_ms_extensions) - opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI); - - if (opts_set->x_ix86_cmodel) - { - switch (opts->x_ix86_cmodel) - { - case CM_SMALL: - case CM_SMALL_PIC: - if (opts->x_flag_pic) - opts->x_ix86_cmodel = CM_SMALL_PIC; - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) - error ("code model %qs not supported in the %s bit mode", - "small", "32"); - break; - - case CM_MEDIUM: - case CM_MEDIUM_PIC: - if (opts->x_flag_pic) - opts->x_ix86_cmodel = CM_MEDIUM_PIC; - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) - error ("code model %qs not supported in the %s bit mode", - "medium", "32"); - else if (TARGET_X32_P (opts->x_ix86_isa_flags)) - error ("code model %qs not supported in x32 mode", - "medium"); - break; - - case CM_LARGE: - case CM_LARGE_PIC: - if (opts->x_flag_pic) - opts->x_ix86_cmodel = CM_LARGE_PIC; - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) - error ("code model %qs not supported in the %s bit mode", - "large", "32"); - else if (TARGET_X32_P (opts->x_ix86_isa_flags)) - error ("code model %qs not supported in x32 mode", - "large"); - break; - - case CM_32: - if (opts->x_flag_pic) - error ("code model %s does not support PIC mode", "32"); - if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) - error ("code model %qs not supported in the %s bit mode", - "32", "64"); - break; - - case CM_KERNEL: - if (opts->x_flag_pic) - { - error ("code model %s does not support PIC mode", "kernel"); - opts->x_ix86_cmodel = CM_32; - } - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) - error ("code model %qs not supported in the %s bit mode", - "kernel", "32"); - break; - - default: - gcc_unreachable (); - } - } - else - { - /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the - use of rip-relative addressing. 
This eliminates fixups that - would otherwise be needed if this object is to be placed in a - DLL, and is essentially just as efficient as direct addressing. */ - if (TARGET_64BIT_P (opts->x_ix86_isa_flags) - && (TARGET_RDOS || TARGET_PECOFF)) - opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1; - else if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) - opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL; - else - opts->x_ix86_cmodel = CM_32; - } - if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL) - { - error ("%<-masm=intel%> not supported in this configuration"); - opts->x_ix86_asm_dialect = ASM_ATT; - } - if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0) - != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0)) - sorry ("%i-bit mode not compiled in", - (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32); + /* Analyze last 128 bits only. */ + size = (size - 1) & 0x7f; - for (i = 0; i < pta_size; i++) - if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name)) - { - if (!strcmp (opts->x_ix86_arch_string, "generic")) + if (size < 32) { - error (main_args_p - ? G_("% CPU can be used only for %<-mtune=%> " - "switch") - : G_("% CPU can be used only for " - "% attribute")); - return false; + classes[0] = X86_64_INTEGERSI_CLASS; + return 1; } - else if (!strcmp (opts->x_ix86_arch_string, "intel")) + else if (size < 64) { - error (main_args_p - ? G_("% CPU can be used only for %<-mtune=%> " - "switch") - : G_("% CPU can be used only for " - "% attribute")); - return false; + classes[0] = X86_64_INTEGER_CLASS; + return 1; } - - if (TARGET_64BIT_P (opts->x_ix86_isa_flags) - && !((processor_alias_table[i].flags & PTA_64BIT) != 0)) + else if (size < 64+32) { - error ("CPU you selected does not support x86-64 " - "instruction set"); - return false; + classes[0] = X86_64_INTEGER_CLASS; + classes[1] = X86_64_INTEGERSI_CLASS; + return 2; } - - ix86_schedule = processor_alias_table[i].schedule; - ix86_arch = processor_alias_table[i].processor; - /* Default cpu tuning to the architecture. 
*/ - ix86_tune = ix86_arch; - - if (((processor_alias_table[i].flags & PTA_MMX) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX; - if (((processor_alias_table[i].flags & PTA_3DNOW) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW; - if (((processor_alias_table[i].flags & PTA_3DNOW_A) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A; - if (((processor_alias_table[i].flags & PTA_SSE) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE; - if (((processor_alias_table[i].flags & PTA_SSE2) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2; - if (((processor_alias_table[i].flags & PTA_SSE3) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3; - if (((processor_alias_table[i].flags & PTA_SSSE3) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3; - if (((processor_alias_table[i].flags & PTA_SSE4_1) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1; - if (((processor_alias_table[i].flags & PTA_SSE4_2) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2; - if (((processor_alias_table[i].flags & PTA_AVX) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX; - if (((processor_alias_table[i].flags & PTA_AVX2) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2; - if (((processor_alias_table[i].flags & PTA_FMA) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA; - if (((processor_alias_table[i].flags & PTA_SSE4A) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A; - if (((processor_alias_table[i].flags & PTA_FMA4) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4; - if (((processor_alias_table[i].flags & PTA_XOP) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP; - if (((processor_alias_table[i].flags & PTA_LWP) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP; - if (((processor_alias_table[i].flags & PTA_ABM) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM; - if (((processor_alias_table[i].flags & PTA_BMI) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI; - if (((processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT; - if (((processor_alias_table[i].flags & PTA_TBM) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM; - if (((processor_alias_table[i].flags & PTA_BMI2) != 0) - && 
!(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2; - if (((processor_alias_table[i].flags & PTA_CX16) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CX16)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CX16; - if (((processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT; - if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags) - && ((processor_alias_table[i].flags & PTA_NO_SAHF) != 0)) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF; - if (((processor_alias_table[i].flags & PTA_MOVBE) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MOVBE)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MOVBE; - if (((processor_alias_table[i].flags & PTA_AES) != 0) - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES)) - ix86_isa_flags |= OPTION_MASK_ISA_AES; - if (((processor_alias_table[i].flags & PTA_SHA) != 0) - && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA)) - ix86_isa_flags |= OPTION_MASK_ISA_SHA; - if (((processor_alias_table[i].flags & PTA_PCLMUL) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL; - if (((processor_alias_table[i].flags & PTA_FSGSBASE) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE; - if (((processor_alias_table[i].flags & PTA_RDRND) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND; - if (((processor_alias_table[i].flags & PTA_F16C) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C; - if (((processor_alias_table[i].flags & PTA_RTM) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM; - if (((processor_alias_table[i].flags & PTA_HLE) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_HLE)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_HLE; - if (((processor_alias_table[i].flags & PTA_PRFCHW) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW; - if (((processor_alias_table[i].flags & PTA_RDSEED) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED; - if (((processor_alias_table[i].flags & PTA_ADX) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX; - if (((processor_alias_table[i].flags & PTA_FXSR) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR; - if (((processor_alias_table[i].flags & PTA_XSAVE) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE; - if (((processor_alias_table[i].flags & PTA_XSAVEOPT) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT; - if (((processor_alias_table[i].flags & PTA_AVX512F) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F; - if (((processor_alias_table[i].flags & PTA_AVX512ER) != 0) - && !(opts->x_ix86_isa_flags_explicit & 
OPTION_MASK_ISA_AVX512ER)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER; - if (((processor_alias_table[i].flags & PTA_AVX512PF) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF; - if (((processor_alias_table[i].flags & PTA_AVX512CD) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD; - if (((processor_alias_table[i].flags & PTA_PREFETCHWT1) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1; - if (((processor_alias_table[i].flags & PTA_CLWB) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB; - if (((processor_alias_table[i].flags & PTA_CLFLUSHOPT) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT; - if (((processor_alias_table[i].flags & PTA_CLZERO) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_CLZERO)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_CLZERO; - if (((processor_alias_table[i].flags & PTA_XSAVEC) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC; - if (((processor_alias_table[i].flags & PTA_XSAVES) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES; - if (((processor_alias_table[i].flags & PTA_AVX512DQ) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ; - if (((processor_alias_table[i].flags & PTA_AVX512BW) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW; - if (((processor_alias_table[i].flags & PTA_AVX512VL) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL; - if (((processor_alias_table[i].flags & PTA_AVX512VBMI) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI; - if (((processor_alias_table[i].flags & PTA_AVX512IFMA) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA; - if (((processor_alias_table[i].flags & PTA_AVX512VNNI) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VNNI)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VNNI; - if (((processor_alias_table[i].flags & PTA_GFNI) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_GFNI)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_GFNI; - if (((processor_alias_table[i].flags & PTA_AVX512VBMI2) != 0) - && !(opts->x_ix86_isa_flags_explicit - & OPTION_MASK_ISA_AVX512VBMI2)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI2; - if (((processor_alias_table[i].flags & PTA_VPCLMULQDQ) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_VPCLMULQDQ)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_VPCLMULQDQ; - if (((processor_alias_table[i].flags & PTA_AVX512BITALG) != 0) - && !(opts->x_ix86_isa_flags_explicit - & OPTION_MASK_ISA_AVX512BITALG)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BITALG; - - if (((processor_alias_table[i].flags & PTA_AVX5124VNNIW) != 0) - && !(opts->x_ix86_isa_flags2_explicit - & OPTION_MASK_ISA_AVX5124VNNIW)) - 
opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW; - if (((processor_alias_table[i].flags & PTA_AVX5124FMAPS) != 0) - && !(opts->x_ix86_isa_flags2_explicit - & OPTION_MASK_ISA_AVX5124FMAPS)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS; - if (((processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ) != 0) - && !(opts->x_ix86_isa_flags_explicit - & OPTION_MASK_ISA_AVX512VPOPCNTDQ)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VPOPCNTDQ; - if (((processor_alias_table[i].flags & PTA_SGX) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX; - if (((processor_alias_table[i].flags & PTA_VAES) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_VAES)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_VAES; - if (((processor_alias_table[i].flags & PTA_RDPID) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_RDPID)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_RDPID; - if (((processor_alias_table[i].flags & PTA_PCONFIG) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_PCONFIG)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_PCONFIG; - if (((processor_alias_table[i].flags & PTA_WBNOINVD) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_WBNOINVD)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_WBNOINVD; - if (((processor_alias_table[i].flags & PTA_PTWRITE) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_PTWRITE)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_PTWRITE; - - if ((processor_alias_table[i].flags - & (PTA_PREFETCH_SSE | PTA_SSE)) != 0) - x86_prefetch_sse = true; - if (((processor_alias_table[i].flags & PTA_MWAITX) != 0) - && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MWAITX)) - opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MWAITX; - if (((processor_alias_table[i].flags & PTA_PKU) != 0) - && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU)) - opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU; - - /* Don't enable x87 instructions if only - general registers are allowed. */ - if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY) - && !(opts_set->x_target_flags & MASK_80387)) + else if (size < 64+64) { - if (((processor_alias_table[i].flags & PTA_NO_80387) != 0)) - opts->x_target_flags &= ~MASK_80387; - else - opts->x_target_flags |= MASK_80387; + classes[0] = classes[1] = X86_64_INTEGER_CLASS; + return 2; } - break; + else + gcc_unreachable (); } + case E_CDImode: + case E_TImode: + classes[0] = classes[1] = X86_64_INTEGER_CLASS; + return 2; + case E_COImode: + case E_OImode: + /* OImode shouldn't be used directly. */ + gcc_unreachable (); + case E_CTImode: + return 0; + case E_SFmode: + if (!(bit_offset % 64)) + classes[0] = X86_64_SSESF_CLASS; + else + classes[0] = X86_64_SSE_CLASS; + return 1; + case E_DFmode: + classes[0] = X86_64_SSEDF_CLASS; + return 1; + case E_XFmode: + classes[0] = X86_64_X87_CLASS; + classes[1] = X86_64_X87UP_CLASS; + return 2; + case E_TFmode: + classes[0] = X86_64_SSE_CLASS; + classes[1] = X86_64_SSEUP_CLASS; + return 2; + case E_SCmode: + classes[0] = X86_64_SSE_CLASS; + if (!(bit_offset % 64)) + return 1; + else + { + static bool warned; - if (i == pta_size) - { - error (main_args_p - ? 
G_("bad value (%qs) for %<-march=%> switch") - : G_("bad value (%qs) for % attribute"), - opts->x_ix86_arch_string); + if (!warned && warn_psabi) + { + warned = true; + inform (input_location, + "the ABI of passing structure with complex float" + " member has changed in GCC 4.4"); + } + classes[1] = X86_64_SSESF_CLASS; + return 2; + } + case E_DCmode: + classes[0] = X86_64_SSEDF_CLASS; + classes[1] = X86_64_SSEDF_CLASS; + return 2; + case E_XCmode: + classes[0] = X86_64_COMPLEX_X87_CLASS; + return 1; + case E_TCmode: + /* This modes is larger than 16 bytes. */ + return 0; + case E_V8SFmode: + case E_V8SImode: + case E_V32QImode: + case E_V16HImode: + case E_V4DFmode: + case E_V4DImode: + classes[0] = X86_64_SSE_CLASS; + classes[1] = X86_64_SSEUP_CLASS; + classes[2] = X86_64_SSEUP_CLASS; + classes[3] = X86_64_SSEUP_CLASS; + return 4; + case E_V8DFmode: + case E_V16SFmode: + case E_V8DImode: + case E_V16SImode: + case E_V32HImode: + case E_V64QImode: + classes[0] = X86_64_SSE_CLASS; + classes[1] = X86_64_SSEUP_CLASS; + classes[2] = X86_64_SSEUP_CLASS; + classes[3] = X86_64_SSEUP_CLASS; + classes[4] = X86_64_SSEUP_CLASS; + classes[5] = X86_64_SSEUP_CLASS; + classes[6] = X86_64_SSEUP_CLASS; + classes[7] = X86_64_SSEUP_CLASS; + return 8; + case E_V4SFmode: + case E_V4SImode: + case E_V16QImode: + case E_V8HImode: + case E_V2DFmode: + case E_V2DImode: + classes[0] = X86_64_SSE_CLASS; + classes[1] = X86_64_SSEUP_CLASS; + return 2; + case E_V1TImode: + case E_V1DImode: + case E_V2SFmode: + case E_V2SImode: + case E_V4HImode: + case E_V8QImode: + classes[0] = X86_64_SSE_CLASS; + return 1; + case E_BLKmode: + case E_VOIDmode: + return 0; + default: + gcc_assert (VECTOR_MODE_P (mode)); - auto_vec candidates; - for (i = 0; i < pta_size; i++) - if (strcmp (processor_alias_table[i].name, "generic") - && strcmp (processor_alias_table[i].name, "intel") - && (!TARGET_64BIT_P (opts->x_ix86_isa_flags) - || ((processor_alias_table[i].flags & PTA_64BIT) != 0))) - candidates.safe_push (processor_alias_table[i].name); + if (bytes > 16) + return 0; -#ifdef HAVE_LOCAL_CPU_DETECT - /* Add also "native" as possible value. */ - candidates.safe_push ("native"); -#endif + gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT); - char *s; - const char *hint - = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates); - if (hint) - inform (input_location, - main_args_p - ? G_("valid arguments to %<-march=%> switch are: " - "%s; did you mean %qs?") - : G_("valid arguments to % attribute are: " - "%s; did you mean %qs?"), s, hint); + if (bit_offset + GET_MODE_BITSIZE (mode) <= 32) + classes[0] = X86_64_INTEGERSI_CLASS; else - inform (input_location, - main_args_p - ? G_("valid arguments to %<-march=%> switch are: %s") - : G_("valid arguments to % attribute " - "are: %s"), s); - XDELETEVEC (s); + classes[0] = X86_64_INTEGER_CLASS; + classes[1] = X86_64_INTEGER_CLASS; + return 1 + (bytes > 8); } +} + +/* Examine the argument and return set number of register required in each + class. Return true iff parameter should be passed in memory. */ + +static bool +examine_argument (machine_mode mode, const_tree type, int in_return, + int *int_nregs, int *sse_nregs) +{ + enum x86_64_reg_class regclass[MAX_CLASSES]; + int n = classify_argument (mode, type, regclass, 0); - ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch; - for (i = 0; i < X86_ARCH_LAST; ++i) - ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask); + *int_nregs = 0; + *sse_nregs = 0; - for (i = 0; i < pta_size; i++) - if (! 
strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name)) + if (!n) + return true; + for (n--; n >= 0; n--) + switch (regclass[n]) { - ix86_schedule = processor_alias_table[i].schedule; - ix86_tune = processor_alias_table[i].processor; - if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) - { - if (!((processor_alias_table[i].flags & PTA_64BIT) != 0)) - { - if (ix86_tune_defaulted) - { - opts->x_ix86_tune_string = "x86-64"; - for (i = 0; i < pta_size; i++) - if (! strcmp (opts->x_ix86_tune_string, - processor_alias_table[i].name)) - break; - ix86_schedule = processor_alias_table[i].schedule; - ix86_tune = processor_alias_table[i].processor; - } - else - error ("CPU you selected does not support x86-64 " - "instruction set"); - } - } - /* Intel CPUs have always interpreted SSE prefetch instructions as - NOPs; so, we can enable SSE prefetch instructions even when - -mtune (rather than -march) points us to a processor that has them. - However, the VIA C3 gives a SIGILL, so we only do that for i686 and - higher processors. */ - if (TARGET_CMOV - && ((processor_alias_table[i].flags - & (PTA_PREFETCH_SSE | PTA_SSE)) != 0)) - x86_prefetch_sse = true; + case X86_64_INTEGER_CLASS: + case X86_64_INTEGERSI_CLASS: + (*int_nregs)++; + break; + case X86_64_SSE_CLASS: + case X86_64_SSESF_CLASS: + case X86_64_SSEDF_CLASS: + (*sse_nregs)++; + break; + case X86_64_NO_CLASS: + case X86_64_SSEUP_CLASS: + break; + case X86_64_X87_CLASS: + case X86_64_X87UP_CLASS: + case X86_64_COMPLEX_X87_CLASS: + if (!in_return) + return true; break; + case X86_64_MEMORY_CLASS: + gcc_unreachable (); } - if (ix86_tune_specified && i == pta_size) - { - error (main_args_p - ? G_("bad value (%qs) for %<-mtune=%> switch") - : G_("bad value (%qs) for % attribute"), - opts->x_ix86_tune_string); - - auto_vec candidates; - for (i = 0; i < pta_size; i++) - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) - || ((processor_alias_table[i].flags & PTA_64BIT) != 0)) - candidates.safe_push (processor_alias_table[i].name); - -#ifdef HAVE_LOCAL_CPU_DETECT - /* Add also "native" as possible value. */ - candidates.safe_push ("native"); -#endif + return false; +} - char *s; - const char *hint - = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates); - if (hint) - inform (input_location, - main_args_p - ? G_("valid arguments to %<-mtune=%> switch are: " - "%s; did you mean %qs?") - : G_("valid arguments to % attribute are: " - "%s; did you mean %qs?"), s, hint); - else - inform (input_location, - main_args_p - ? G_("valid arguments to %<-mtune=%> switch are: %s") - : G_("valid arguments to % attribute " - "are: %s"), s); - XDELETEVEC (s); - } +/* Construct container for the argument used by GCC interface. See + FUNCTION_ARG for the detailed description. */ - set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes); +static rtx +construct_container (machine_mode mode, machine_mode orig_mode, + const_tree type, int in_return, int nintregs, int nsseregs, + const int *intreg, int sse_regno) +{ + /* The following variables hold the static issued_error state. */ + static bool issued_sse_arg_error; + static bool issued_sse_ret_error; + static bool issued_x87_ret_error; -#ifndef USE_IX86_FRAME_POINTER -#define USE_IX86_FRAME_POINTER 0 -#endif + machine_mode tmpmode; + int bytes + = mode == BLKmode ? 
int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode); + enum x86_64_reg_class regclass[MAX_CLASSES]; + int n; + int i; + int nexps = 0; + int needed_sseregs, needed_intregs; + rtx exp[MAX_CLASSES]; + rtx ret; -#ifndef USE_X86_64_FRAME_POINTER -#define USE_X86_64_FRAME_POINTER 0 -#endif + n = classify_argument (mode, type, regclass, 0); + if (!n) + return NULL; + if (examine_argument (mode, type, in_return, &needed_intregs, + &needed_sseregs)) + return NULL; + if (needed_intregs > nintregs || needed_sseregs > nsseregs) + return NULL; - /* Set the default values for switches whose default depends on TARGET_64BIT - in case they weren't overwritten by command line options. */ - if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) - { - if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer) - opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER; - if (opts->x_flag_asynchronous_unwind_tables - && !opts_set->x_flag_unwind_tables - && TARGET_64BIT_MS_ABI) - opts->x_flag_unwind_tables = 1; - if (opts->x_flag_asynchronous_unwind_tables == 2) - opts->x_flag_unwind_tables - = opts->x_flag_asynchronous_unwind_tables = 1; - if (opts->x_flag_pcc_struct_return == 2) - opts->x_flag_pcc_struct_return = 0; - } - else - { - if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer) - opts->x_flag_omit_frame_pointer - = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size); - if (opts->x_flag_asynchronous_unwind_tables == 2) - opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER; - if (opts->x_flag_pcc_struct_return == 2) - { - /* Intel MCU psABI specifies that -freg-struct-return should - be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1, - we check -miamcu so that -freg-struct-return is always - turned on if -miamcu is used. */ - if (TARGET_IAMCU_P (opts->x_target_flags)) - opts->x_flag_pcc_struct_return = 0; - else - opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN; - } - } - - ix86_tune_cost = processor_cost_table[ix86_tune]; - /* TODO: ix86_cost should be chosen at instruction or function granuality - so for cold code we use size_cost even in !optimize_size compilation. */ - if (opts->x_optimize_size) - ix86_cost = &ix86_size_cost; - else - ix86_cost = ix86_tune_cost; - - /* Arrange to set up i386_stack_locals for all functions. */ - init_machine_status = ix86_init_machine_status; - - /* Validate -mregparm= value. */ - if (opts_set->x_ix86_regparm) + /* We allowed the user to turn off SSE for kernel mode. Don't crash if + some less clueful developer tries to use floating-point anyway. */ + if (needed_sseregs && !TARGET_SSE) { - if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) - warning (0, "%<-mregparm%> is ignored in 64-bit mode"); - else if (TARGET_IAMCU_P (opts->x_target_flags)) - warning (0, "%<-mregparm%> is ignored for Intel MCU psABI"); - if (opts->x_ix86_regparm > REGPARM_MAX) + if (in_return) { - error ("%<-mregparm=%d%> is not between 0 and %d", - opts->x_ix86_regparm, REGPARM_MAX); - opts->x_ix86_regparm = 0; - } - } - if (TARGET_IAMCU_P (opts->x_target_flags) - || TARGET_64BIT_P (opts->x_ix86_isa_flags)) - opts->x_ix86_regparm = REGPARM_MAX; - - /* Default align_* from the processor table. */ - ix86_default_align (opts); - - /* Provide default for -mbranch-cost= value. 
*/ - if (!opts_set->x_ix86_branch_cost) - opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost; - - if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) - { - opts->x_target_flags - |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags; - - if (!ix86_arch_specified) - opts->x_ix86_isa_flags - |= TARGET_SUBTARGET64_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit; - - if (TARGET_RTD_P (opts->x_target_flags)) - warning (0, - main_args_p - ? G_("%<-mrtd%> is ignored in 64bit mode") - : G_("% is ignored in 64bit mode")); - } - else - { - opts->x_target_flags - |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags; - - if (!ix86_arch_specified) - opts->x_ix86_isa_flags - |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit; - - /* i386 ABI does not specify red zone. It still makes sense to use it - when programmer takes care to stack from being destroyed. */ - if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE)) - opts->x_target_flags |= MASK_NO_RED_ZONE; - } - - /* Keep nonleaf frame pointers. */ - if (opts->x_flag_omit_frame_pointer) - opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER; - else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags)) - opts->x_flag_omit_frame_pointer = 1; - - /* If we're doing fast math, we don't care about comparison order - wrt NaNs. This lets us use a shorter comparison sequence. */ - if (opts->x_flag_finite_math_only) - opts->x_target_flags &= ~MASK_IEEE_FP; - - /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387, - since the insns won't need emulation. */ - if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387]) - opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387; - - /* Likewise, if the target doesn't have a 387, or we've specified - software floating point, don't use 387 inline intrinsics. */ - if (!TARGET_80387_P (opts->x_target_flags)) - opts->x_target_flags |= MASK_NO_FANCY_MATH_387; - - /* Turn on MMX builtins for -msse. */ - if (TARGET_SSE_P (opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags - |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit; - - /* Enable SSE prefetch. */ - if (TARGET_SSE_P (opts->x_ix86_isa_flags) - || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags) - && !TARGET_3DNOW_P (opts->x_ix86_isa_flags)) - || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags)) - x86_prefetch_sse = true; - - /* Enable popcnt instruction for -msse4.2 or -mabm. */ - if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags) - || TARGET_ABM_P (opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags - |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit; - - /* Enable lzcnt instruction for -mabm. */ - if (TARGET_ABM_P(opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags - |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit; - - /* Disable BMI, BMI2 and TBM instructions for -m16. */ - if (TARGET_16BIT_P(opts->x_ix86_isa_flags)) - opts->x_ix86_isa_flags - &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM) - & ~opts->x_ix86_isa_flags_explicit); - - /* Validate -mpreferred-stack-boundary= value or default it to - PREFERRED_STACK_BOUNDARY_DEFAULT. */ - ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT; - if (opts_set->x_ix86_preferred_stack_boundary_arg) - { - int min = TARGET_64BIT_P (opts->x_ix86_isa_flags)? 3 : 2; - int max = TARGET_SEH ? 
4 : 12; - - if (opts->x_ix86_preferred_stack_boundary_arg < min - || opts->x_ix86_preferred_stack_boundary_arg > max) - { - if (min == max) - error ("%<-mpreferred-stack-boundary%> is not supported " - "for this target"); - else - error ("%<-mpreferred-stack-boundary=%d%> is not between %d and %d", - opts->x_ix86_preferred_stack_boundary_arg, min, max); + if (!issued_sse_ret_error) + { + error ("SSE register return with SSE disabled"); + issued_sse_ret_error = true; + } } - else - ix86_preferred_stack_boundary - = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT; - } - - /* Set the default value for -mstackrealign. */ - if (!opts_set->x_ix86_force_align_arg_pointer) - opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT; - - ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY; - - /* Validate -mincoming-stack-boundary= value or default it to - MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */ - ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary; - if (opts_set->x_ix86_incoming_stack_boundary_arg) - { - int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2; - - if (opts->x_ix86_incoming_stack_boundary_arg < min - || opts->x_ix86_incoming_stack_boundary_arg > 12) - error ("%<-mincoming-stack-boundary=%d%> is not between %d and 12", - opts->x_ix86_incoming_stack_boundary_arg, min); - else + else if (!issued_sse_arg_error) { - ix86_user_incoming_stack_boundary - = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT; - ix86_incoming_stack_boundary - = ix86_user_incoming_stack_boundary; + error ("SSE register argument with SSE disabled"); + issued_sse_arg_error = true; } + return NULL; } -#ifndef NO_PROFILE_COUNTERS - if (flag_nop_mcount) - error ("%<-mnop-mcount%> is not compatible with this target"); -#endif - if (flag_nop_mcount && flag_pic) - error ("%<-mnop-mcount%> is not implemented for %<-fPIC%>"); - - /* Accept -msseregparm only if at least SSE support is enabled. */ - if (TARGET_SSEREGPARM_P (opts->x_target_flags) - && ! TARGET_SSE_P (opts->x_ix86_isa_flags)) - error (main_args_p - ? G_("%<-msseregparm%> used without SSE enabled") - : G_("% used without SSE enabled")); - - if (opts_set->x_ix86_fpmath) - { - if (opts->x_ix86_fpmath & FPMATH_SSE) + /* Likewise, error if the ABI requires us to return values in the + x87 registers and the user specified -mno-80387. */ + if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return) + for (i = 0; i < n; i++) + if (regclass[i] == X86_64_X87_CLASS + || regclass[i] == X86_64_X87UP_CLASS + || regclass[i] == X86_64_COMPLEX_X87_CLASS) { - if (!TARGET_SSE_P (opts->x_ix86_isa_flags)) - { - if (TARGET_80387_P (opts->x_target_flags)) - { - warning (0, "SSE instruction set disabled, using 387 arithmetics"); - opts->x_ix86_fpmath = FPMATH_387; - } - } - else if ((opts->x_ix86_fpmath & FPMATH_387) - && !TARGET_80387_P (opts->x_target_flags)) + if (!issued_x87_ret_error) { - warning (0, "387 instruction set disabled, using SSE arithmetics"); - opts->x_ix86_fpmath = FPMATH_SSE; + error ("x87 register return with x87 disabled"); + issued_x87_ret_error = true; } + return NULL; } - } - /* For all chips supporting SSE2, -mfpmath=sse performs better than - fpmath=387. The second is however default at many targets since the - extra 80bit precision of temporaries is considered to be part of ABI. - Overwrite the default at least for -ffast-math. - TODO: -mfpmath=both seems to produce same performing code with bit - smaller binaries. It is however not clear if register allocation is - ready for this setting. 
- Also -mfpmath=387 is overall a lot more compact (bout 4-5%) than SSE - codegen. We may switch to 387 with -ffast-math for size optimized - functions. */ - else if (fast_math_flags_set_p (&global_options) - && TARGET_SSE2_P (opts->x_ix86_isa_flags)) - opts->x_ix86_fpmath = FPMATH_SSE; - else - opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags); - /* Use external vectorized library in vectorizing intrinsics. */ - if (opts_set->x_ix86_veclibabi_type) - switch (opts->x_ix86_veclibabi_type) + /* First construct simple cases. Avoid SCmode, since we want to use + single register to pass this type. */ + if (n == 1 && mode != SCmode) + switch (regclass[0]) { - case ix86_veclibabi_type_svml: - ix86_veclib_handler = ix86_veclibabi_svml; - break; - - case ix86_veclibabi_type_acml: - ix86_veclib_handler = ix86_veclibabi_acml; + case X86_64_INTEGER_CLASS: + case X86_64_INTEGERSI_CLASS: + return gen_rtx_REG (mode, intreg[0]); + case X86_64_SSE_CLASS: + case X86_64_SSESF_CLASS: + case X86_64_SSEDF_CLASS: + if (mode != BLKmode) + return gen_reg_or_parallel (mode, orig_mode, + GET_SSE_REGNO (sse_regno)); break; - + case X86_64_X87_CLASS: + case X86_64_COMPLEX_X87_CLASS: + return gen_rtx_REG (mode, FIRST_STACK_REG); + case X86_64_NO_CLASS: + /* Zero sized array, struct or class. */ + return NULL; default: gcc_unreachable (); } + if (n == 2 + && regclass[0] == X86_64_SSE_CLASS + && regclass[1] == X86_64_SSEUP_CLASS + && mode != BLKmode) + return gen_reg_or_parallel (mode, orig_mode, + GET_SSE_REGNO (sse_regno)); + if (n == 4 + && regclass[0] == X86_64_SSE_CLASS + && regclass[1] == X86_64_SSEUP_CLASS + && regclass[2] == X86_64_SSEUP_CLASS + && regclass[3] == X86_64_SSEUP_CLASS + && mode != BLKmode) + return gen_reg_or_parallel (mode, orig_mode, + GET_SSE_REGNO (sse_regno)); + if (n == 8 + && regclass[0] == X86_64_SSE_CLASS + && regclass[1] == X86_64_SSEUP_CLASS + && regclass[2] == X86_64_SSEUP_CLASS + && regclass[3] == X86_64_SSEUP_CLASS + && regclass[4] == X86_64_SSEUP_CLASS + && regclass[5] == X86_64_SSEUP_CLASS + && regclass[6] == X86_64_SSEUP_CLASS + && regclass[7] == X86_64_SSEUP_CLASS + && mode != BLKmode) + return gen_reg_or_parallel (mode, orig_mode, + GET_SSE_REGNO (sse_regno)); + if (n == 2 + && regclass[0] == X86_64_X87_CLASS + && regclass[1] == X86_64_X87UP_CLASS) + return gen_rtx_REG (XFmode, FIRST_STACK_REG); - if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS] - && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) - opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; - - /* If stack probes are required, the space used for large function - arguments on the stack must also be probed, so enable - -maccumulate-outgoing-args so this happens in the prologue. */ - if (TARGET_STACK_PROBE_P (opts->x_target_flags) - && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) - { - if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS) - warning (0, - main_args_p - ? G_("stack probing requires %<-maccumulate-outgoing-args%> " - "for correctness") - : G_("stack probing requires " - "% for " - "correctness")); - opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; - } - - /* Stack realignment without -maccumulate-outgoing-args requires %ebp, - so enable -maccumulate-outgoing-args when %ebp is fixed. */ - if (fixed_regs[BP_REG] - && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) - { - if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS) - warning (0, - main_args_p - ? 
G_("fixed ebp register requires " - "%<-maccumulate-outgoing-args%>") - : G_("fixed ebp register requires " - "%")); - opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; - } - - /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */ - { - char *p; - ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0); - p = strchr (internal_label_prefix, 'X'); - internal_label_prefix_len = p - internal_label_prefix; - *p = '\0'; - } - - /* When scheduling description is not available, disable scheduler pass - so it won't slow down the compilation and make x87 code slower. */ - if (!TARGET_SCHEDULE) - opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0; - - maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES, - ix86_tune_cost->simultaneous_prefetches, - opts->x_param_values, - opts_set->x_param_values); - maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, - ix86_tune_cost->prefetch_block, - opts->x_param_values, - opts_set->x_param_values); - maybe_set_param_value (PARAM_L1_CACHE_SIZE, - ix86_tune_cost->l1_cache_size, - opts->x_param_values, - opts_set->x_param_values); - maybe_set_param_value (PARAM_L2_CACHE_SIZE, - ix86_tune_cost->l2_cache_size, - opts->x_param_values, - opts_set->x_param_values); - - /* Enable sw prefetching at -O3 for CPUS that prefetching is helpful. */ - if (opts->x_flag_prefetch_loop_arrays < 0 - && HAVE_prefetch - && (opts->x_optimize >= 3 || opts->x_flag_profile_use) - && !opts->x_optimize_size - && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL) - opts->x_flag_prefetch_loop_arrays = 1; - - /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0) - can be opts->x_optimized to ap = __builtin_next_arg (0). */ - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack) - targetm.expand_builtin_va_start = NULL; - - if (TARGET_64BIT_P (opts->x_ix86_isa_flags)) - { - ix86_gen_leave = gen_leave_rex64; - if (Pmode == DImode) - { - ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di; - ix86_gen_tls_local_dynamic_base_64 - = gen_tls_local_dynamic_base_64_di; - } - else + if (n == 2 + && regclass[0] == X86_64_INTEGER_CLASS + && regclass[1] == X86_64_INTEGER_CLASS + && (mode == CDImode || mode == TImode || mode == BLKmode) + && intreg[0] + 1 == intreg[1]) + { + if (mode == BLKmode) { - ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si; - ix86_gen_tls_local_dynamic_base_64 - = gen_tls_local_dynamic_base_64_si; + /* Use TImode for BLKmode values in 2 integer registers. 
*/ + exp[0] = gen_rtx_EXPR_LIST (VOIDmode, + gen_rtx_REG (TImode, intreg[0]), + GEN_INT (0)); + ret = gen_rtx_PARALLEL (mode, rtvec_alloc (1)); + XVECEXP (ret, 0, 0) = exp[0]; + return ret; } + else + return gen_rtx_REG (mode, intreg[0]); } - else - ix86_gen_leave = gen_leave; - - if (Pmode == DImode) - { - ix86_gen_add3 = gen_adddi3; - ix86_gen_sub3 = gen_subdi3; - ix86_gen_sub3_carry = gen_subdi3_carry; - ix86_gen_one_cmpl2 = gen_one_cmpldi2; - ix86_gen_andsp = gen_anddi3; - ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di; - ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi; - ix86_gen_probe_stack_range = gen_probe_stack_rangedi; - ix86_gen_monitor = gen_sse3_monitor_di; - ix86_gen_monitorx = gen_monitorx_di; - ix86_gen_clzero = gen_clzero_di; - } - else - { - ix86_gen_add3 = gen_addsi3; - ix86_gen_sub3 = gen_subsi3; - ix86_gen_sub3_carry = gen_subsi3_carry; - ix86_gen_one_cmpl2 = gen_one_cmplsi2; - ix86_gen_andsp = gen_andsi3; - ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si; - ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi; - ix86_gen_probe_stack_range = gen_probe_stack_rangesi; - ix86_gen_monitor = gen_sse3_monitor_si; - ix86_gen_monitorx = gen_monitorx_si; - ix86_gen_clzero = gen_clzero_si; - } - -#ifdef USE_IX86_CLD - /* Use -mcld by default for 32-bit code if configured with --enable-cld. */ - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)) - opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags; -#endif - /* Set the default value for -mfentry. */ - if (!opts_set->x_flag_fentry) - opts->x_flag_fentry = TARGET_SEH; - else + /* Otherwise figure out the entries of the PARALLEL. */ + for (i = 0; i < n; i++) { - if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic - && opts->x_flag_fentry) - sorry ("%<-mfentry%> isn%'t supported for 32-bit in combination " - "with %<-fpic%>"); - else if (TARGET_SEH && !opts->x_flag_fentry) - sorry ("%<-mno-fentry%> isn%'t compatible with SEH"); - } - - if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES) - sorry ("%<-mcall-ms2sysv-xlogues%> isn%'t currently supported with SEH"); - - if (!(opts_set->x_target_flags & MASK_VZEROUPPER) - && TARGET_EMIT_VZEROUPPER) - opts->x_target_flags |= MASK_VZEROUPPER; - if (!(opts_set->x_target_flags & MASK_STV)) - opts->x_target_flags |= MASK_STV; - /* Disable STV if -mpreferred-stack-boundary={2,3} or - -mincoming-stack-boundary={2,3} or -mstackrealign - the needed - stack realignment will be extra cost the pass doesn't take into - account and the pass can't realign the stack. */ - if (ix86_preferred_stack_boundary < 128 - || ix86_incoming_stack_boundary < 128 - || opts->x_ix86_force_align_arg_pointer) - opts->x_target_flags &= ~MASK_STV; - if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL] - && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD)) - opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD; - if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL] - && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE)) - opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE; - - /* Enable 128-bit AVX instruction generation - for the auto-vectorizer. */ - if (TARGET_AVX128_OPTIMAL - && (opts_set->x_prefer_vector_width_type == PVW_NONE)) - opts->x_prefer_vector_width_type = PVW_AVX128; - - /* Use 256-bit AVX instruction generation - in the auto-vectorizer. 
*/ - if (ix86_tune_features[X86_TUNE_AVX256_OPTIMAL] - && (opts_set->x_prefer_vector_width_type == PVW_NONE)) - opts->x_prefer_vector_width_type = PVW_AVX256; - - if (opts->x_ix86_recip_name) - { - char *p = ASTRDUP (opts->x_ix86_recip_name); - char *q; - unsigned int mask, i; - bool invert; - - while ((q = strtok (p, ",")) != NULL) - { - p = NULL; - if (*q == '!') - { - invert = true; - q++; - } - else - invert = false; + int pos; - if (!strcmp (q, "default")) - mask = RECIP_MASK_ALL; - else - { - for (i = 0; i < ARRAY_SIZE (recip_options); i++) - if (!strcmp (q, recip_options[i].string)) + switch (regclass[i]) + { + case X86_64_NO_CLASS: + break; + case X86_64_INTEGER_CLASS: + case X86_64_INTEGERSI_CLASS: + /* Merge TImodes on aligned occasions here too. */ + if (i * 8 + 8 > bytes) + { + unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT; + if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode)) + /* We've requested 24 bytes we + don't have mode for. Use DImode. */ + tmpmode = DImode; + } + else if (regclass[i] == X86_64_INTEGERSI_CLASS) + tmpmode = SImode; + else + tmpmode = DImode; + exp [nexps++] + = gen_rtx_EXPR_LIST (VOIDmode, + gen_rtx_REG (tmpmode, *intreg), + GEN_INT (i*8)); + intreg++; + break; + case X86_64_SSESF_CLASS: + exp [nexps++] + = gen_rtx_EXPR_LIST (VOIDmode, + gen_rtx_REG (SFmode, + GET_SSE_REGNO (sse_regno)), + GEN_INT (i*8)); + sse_regno++; + break; + case X86_64_SSEDF_CLASS: + exp [nexps++] + = gen_rtx_EXPR_LIST (VOIDmode, + gen_rtx_REG (DFmode, + GET_SSE_REGNO (sse_regno)), + GEN_INT (i*8)); + sse_regno++; + break; + case X86_64_SSE_CLASS: + pos = i; + switch (n) + { + case 1: + tmpmode = DImode; + break; + case 2: + if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS) { - mask = recip_options[i].mask; - break; + tmpmode = TImode; + i++; } - - if (i == ARRAY_SIZE (recip_options)) - { - error ("unknown option for %<-mrecip=%s%>", q); - invert = false; - mask = RECIP_MASK_NONE; - } - } - - opts->x_recip_mask_explicit |= mask; - if (invert) - opts->x_recip_mask &= ~mask; - else - opts->x_recip_mask |= mask; + else + tmpmode = DImode; + break; + case 4: + gcc_assert (i == 0 + && regclass[1] == X86_64_SSEUP_CLASS + && regclass[2] == X86_64_SSEUP_CLASS + && regclass[3] == X86_64_SSEUP_CLASS); + tmpmode = OImode; + i += 3; + break; + case 8: + gcc_assert (i == 0 + && regclass[1] == X86_64_SSEUP_CLASS + && regclass[2] == X86_64_SSEUP_CLASS + && regclass[3] == X86_64_SSEUP_CLASS + && regclass[4] == X86_64_SSEUP_CLASS + && regclass[5] == X86_64_SSEUP_CLASS + && regclass[6] == X86_64_SSEUP_CLASS + && regclass[7] == X86_64_SSEUP_CLASS); + tmpmode = XImode; + i += 7; + break; + default: + gcc_unreachable (); + } + exp [nexps++] + = gen_rtx_EXPR_LIST (VOIDmode, + gen_rtx_REG (tmpmode, + GET_SSE_REGNO (sse_regno)), + GEN_INT (pos*8)); + sse_regno++; + break; + default: + gcc_unreachable (); } } - if (TARGET_RECIP_P (opts->x_target_flags)) - opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit; - else if (opts_set->x_target_flags & MASK_RECIP) - opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit); + /* Empty aligned struct, union or class. */ + if (nexps == 0) + return NULL; - /* Default long double to 64-bit for 32-bit Bionic and to __float128 - for 64-bit Bionic. Also default long double to 64-bit for Intel - MCU psABI. */ - if ((TARGET_HAS_BIONIC || TARGET_IAMCU) - && !(opts_set->x_target_flags - & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128))) - opts->x_target_flags |= (TARGET_64BIT - ? 
MASK_LONG_DOUBLE_128 - : MASK_LONG_DOUBLE_64); + ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps)); + for (i = 0; i < nexps; i++) + XVECEXP (ret, 0, i) = exp [i]; + return ret; +} - /* Only one of them can be active. */ - gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0 - || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0); +/* Update the data in CUM to advance over an argument of mode MODE + and data type TYPE. (TYPE is null for libcalls where that information + may not be available.) - /* Handle stack protector */ - if (!opts_set->x_ix86_stack_protector_guard) + Return a number of integer regsiters advanced over. */ + +static int +function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode, + const_tree type, HOST_WIDE_INT bytes, + HOST_WIDE_INT words) +{ + int res = 0; + bool error_p = false; + + if (TARGET_IAMCU) { -#ifdef TARGET_THREAD_SSP_OFFSET - if (!TARGET_HAS_BIONIC) - opts->x_ix86_stack_protector_guard = SSP_TLS; - else -#endif - opts->x_ix86_stack_protector_guard = SSP_GLOBAL; + /* Intel MCU psABI passes scalars and aggregates no larger than 8 + bytes in registers. */ + if (!VECTOR_MODE_P (mode) && bytes <= 8) + goto pass_in_reg; + return res; } - if (opts_set->x_ix86_stack_protector_guard_offset_str) + switch (mode) { - char *endp; - const char *str = opts->x_ix86_stack_protector_guard_offset_str; + default: + break; - errno = 0; - int64_t offset; + case E_BLKmode: + if (bytes < 0) + break; + /* FALLTHRU */ -#if defined(INT64_T_IS_LONG) - offset = strtol (str, &endp, 0); -#else - offset = strtoll (str, &endp, 0); -#endif + case E_DImode: + case E_SImode: + case E_HImode: + case E_QImode: +pass_in_reg: + cum->words += words; + cum->nregs -= words; + cum->regno += words; + if (cum->nregs >= 0) + res = words; + if (cum->nregs <= 0) + { + cum->nregs = 0; + cfun->machine->arg_reg_available = false; + cum->regno = 0; + } + break; - if (!*str || *endp || errno) - error ("%qs is not a valid number " - "in %<-mstack-protector-guard-offset=%>", str); + case E_OImode: + /* OImode shouldn't be used directly. 
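For illustration (not part of the patch): the classification code above implements the SysV x86-64 psABI, which splits an argument into eightbytes, assigns each a class, and then builds a PARALLEL naming the registers. A small source-level example of what that produces; the register assignments in the comments reflect the psABI itself, not anything introduced by this patch:

/* Two eightbytes: the first is classified SSE, the second INTEGER, so
   construct_container describes the value as a PARALLEL of an XMM and a
   general-purpose register.  */
struct pair { double d; long l; };

double
use_pair (struct pair p)        /* p.d arrives in %xmm0, p.l in %rdi */
{
  return p.d + (double) p.l;
}

struct pair
make_pair (void)                /* returned in %xmm0 (d) and %rax (l) */
{
  struct pair p = { 1.5, 42 };
  return p;
}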
*/ + gcc_unreachable (); - if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000), - HOST_WIDE_INT_C (0x7fffffff))) - error ("%qs is not a valid offset " - "in %<-mstack-protector-guard-offset=%>", str); + case E_DFmode: + if (cum->float_in_sse == -1) + error_p = true; + if (cum->float_in_sse < 2) + break; + /* FALLTHRU */ + case E_SFmode: + if (cum->float_in_sse == -1) + error_p = true; + if (cum->float_in_sse < 1) + break; + /* FALLTHRU */ - opts->x_ix86_stack_protector_guard_offset = offset; - } -#ifdef TARGET_THREAD_SSP_OFFSET - else - opts->x_ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET; -#endif + case E_V8SFmode: + case E_V8SImode: + case E_V64QImode: + case E_V32HImode: + case E_V16SImode: + case E_V8DImode: + case E_V16SFmode: + case E_V8DFmode: + case E_V32QImode: + case E_V16HImode: + case E_V4DFmode: + case E_V4DImode: + case E_TImode: + case E_V16QImode: + case E_V8HImode: + case E_V4SImode: + case E_V2DImode: + case E_V4SFmode: + case E_V2DFmode: + if (!type || !AGGREGATE_TYPE_P (type)) + { + cum->sse_words += words; + cum->sse_nregs -= 1; + cum->sse_regno += 1; + if (cum->sse_nregs <= 0) + { + cum->sse_nregs = 0; + cum->sse_regno = 0; + } + } + break; - if (opts_set->x_ix86_stack_protector_guard_reg_str) + case E_V8QImode: + case E_V4HImode: + case E_V2SImode: + case E_V2SFmode: + case E_V1TImode: + case E_V1DImode: + if (!type || !AGGREGATE_TYPE_P (type)) + { + cum->mmx_words += words; + cum->mmx_nregs -= 1; + cum->mmx_regno += 1; + if (cum->mmx_nregs <= 0) + { + cum->mmx_nregs = 0; + cum->mmx_regno = 0; + } + } + break; + } + if (error_p) { - const char *str = opts->x_ix86_stack_protector_guard_reg_str; - addr_space_t seg = ADDR_SPACE_GENERIC; + cum->float_in_sse = 0; + error ("calling %qD with SSE calling convention without " + "SSE/SSE2 enabled", cum->decl); + sorry ("this is a GCC bug that can be worked around by adding " + "attribute used to function called"); + } - /* Discard optional register prefix. */ - if (str[0] == '%') - str++; + return res; +} - if (strlen (str) == 2 && str[1] == 's') - { - if (str[0] == 'f') - seg = ADDR_SPACE_SEG_FS; - else if (str[0] == 'g') - seg = ADDR_SPACE_SEG_GS; - } +static int +function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode, + const_tree type, HOST_WIDE_INT words, bool named) +{ + int int_nregs, sse_nregs; - if (seg == ADDR_SPACE_GENERIC) - error ("%qs is not a valid base register " - "in %<-mstack-protector-guard-reg=%>", - opts->x_ix86_stack_protector_guard_reg_str); + /* Unnamed 512 and 256bit vector mode parameters are passed on stack. */ + if (!named && (VALID_AVX512F_REG_MODE (mode) + || VALID_AVX256_REG_MODE (mode))) + return 0; - opts->x_ix86_stack_protector_guard_reg = seg; + if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs) + && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs) + { + cum->nregs -= int_nregs; + cum->sse_nregs -= sse_nregs; + cum->regno += int_nregs; + cum->sse_regno += sse_nregs; + return int_nregs; } else { - opts->x_ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG; - - /* The kernel uses a different segment register for performance - reasons; a system call would not have to trash the userspace - segment register, which would be expensive. 
*/ - if (opts->x_ix86_cmodel == CM_KERNEL) - opts->x_ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS; + int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD; + cum->words = ROUND_UP (cum->words, align); + cum->words += words; + return 0; } +} - /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ - if (opts->x_ix86_tune_memcpy_strategy) - { - char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy); - ix86_parse_stringop_strategy_string (str, false); - free (str); - } +static int +function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes, + HOST_WIDE_INT words) +{ + /* Otherwise, this should be passed indirect. */ + gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8); - if (opts->x_ix86_tune_memset_strategy) + cum->words += words; + if (cum->nregs > 0) { - char *str = xstrdup (opts->x_ix86_tune_memset_strategy); - ix86_parse_stringop_strategy_string (str, true); - free (str); + cum->nregs -= 1; + cum->regno += 1; + return 1; } + return 0; +} - /* Save the initial options in case the user does function specific - options. */ - if (main_args_p) - target_option_default_node = target_option_current_node - = build_target_option_node (opts); +/* Update the data in CUM to advance over an argument of mode MODE and + data type TYPE. (TYPE is null for libcalls where that information + may not be available.) */ - if (opts->x_flag_cf_protection != CF_NONE) - opts->x_flag_cf_protection - = (cf_protection_level) (opts->x_flag_cf_protection | CF_SET); +static void +ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode, + const_tree type, bool named) +{ + CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); + HOST_WIDE_INT bytes, words; + int nregs; - if (ix86_tune_features [X86_TUNE_AVOID_128FMA_CHAINS]) - maybe_set_param_value (PARAM_AVOID_FMA_MAX_BITS, 128, - opts->x_param_values, - opts_set->x_param_values); + /* The argument of interrupt handler is a special case and is + handled in ix86_function_arg. */ + if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL) + return; - /* PR86952: jump table usage with retpolines is slow. - The PR provides some numbers about the slowness. */ - if (ix86_indirect_branch != indirect_branch_keep - && !opts_set->x_flag_jump_tables) - opts->x_flag_jump_tables = 0; + if (mode == BLKmode) + bytes = int_size_in_bytes (type); + else + bytes = GET_MODE_SIZE (mode); + words = CEIL (bytes, UNITS_PER_WORD); - return true; -} + if (type) + mode = type_natural_mode (type, NULL, false); -/* Implement the TARGET_OPTION_OVERRIDE hook. */ + if (TARGET_64BIT) + { + enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi; -static void -ix86_option_override (void) -{ - ix86_option_override_internal (true, &global_options, &global_options_set); -} + if (call_abi == MS_ABI) + nregs = function_arg_advance_ms_64 (cum, bytes, words); + else + nregs = function_arg_advance_64 (cum, mode, type, words, named); + } + else + nregs = function_arg_advance_32 (cum, mode, type, bytes, words); -/* Implement the TARGET_OFFLOAD_OPTIONS hook. */ -static char * -ix86_offload_options (void) -{ - if (TARGET_LP64) - return xstrdup ("-foffload-abi=lp64"); - return xstrdup ("-foffload-abi=ilp32"); + if (!nregs) + { + /* Track if there are outgoing arguments on stack. */ + if (cum->caller) + cfun->machine->outgoing_args_on_stack = true; + } } -/* Update register usage after having seen the compiler flags. */ +/* Define where to put the arguments to a function. 
+ Value is zero to push the argument on the stack, + or a hard register in which to store the argument. -static void -ix86_conditional_register_usage (void) + MODE is the argument's machine mode. + TYPE is the data type of the argument (as a tree). + This is null for libcalls where that information may + not be available. + CUM is a variable of type CUMULATIVE_ARGS which gives info about + the preceding args and about the function being called. + NAMED is nonzero if this argument is a named parameter + (otherwise it is an extra parameter matching an ellipsis). */ + +static rtx +function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode, + machine_mode orig_mode, const_tree type, + HOST_WIDE_INT bytes, HOST_WIDE_INT words) { - int i, c_mask; + bool error_p = false; - /* If there are no caller-saved registers, preserve all registers. - except fixed_regs and registers used for function return value - since aggregate_value_p checks call_used_regs[regno] on return - value. */ - if (cfun && cfun->machine->no_caller_saved_registers) - for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) - if (!fixed_regs[i] && !ix86_function_value_regno_p (i)) - call_used_regs[i] = 0; + /* Avoid the AL settings for the Unix64 ABI. */ + if (mode == VOIDmode) + return constm1_rtx; - /* For 32-bit targets, squash the REX registers. */ - if (! TARGET_64BIT) + if (TARGET_IAMCU) { - for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++) - fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; - for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++) - fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; - for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++) - fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; + /* Intel MCU psABI passes scalars and aggregates no larger than 8 + bytes in registers. */ + if (!VECTOR_MODE_P (mode) && bytes <= 8) + goto pass_in_reg; + return NULL_RTX; } - /* See the definition of CALL_USED_REGISTERS in i386.h. */ - c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI); - - CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]); - - for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + switch (mode) { - /* Set/reset conditionally defined registers from - CALL_USED_REGISTERS initializer. */ - if (call_used_regs[i] > 1) - call_used_regs[i] = !!(call_used_regs[i] & c_mask); - - /* Calculate registers of CLOBBERED_REGS register set - as call used registers from GENERAL_REGS register set. */ - if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i) - && call_used_regs[i]) - SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i); - } - - /* If MMX is disabled, squash the registers. */ - if (! TARGET_MMX) - for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) - if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i)) - fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; + default: + break; - /* If SSE is disabled, squash the registers. */ - if (! TARGET_SSE) - for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) - if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i)) - fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; + case E_BLKmode: + if (bytes < 0) + break; + /* FALLTHRU */ + case E_DImode: + case E_SImode: + case E_HImode: + case E_QImode: +pass_in_reg: + if (words <= cum->nregs) + { + int regno = cum->regno; - /* If the FPU is disabled, squash the registers. */ - if (! 
(TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387)) - for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) - if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i)) - fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; + /* Fastcall allocates the first two DWORD (SImode) or + smaller arguments to ECX and EDX if it isn't an + aggregate type . */ + if (cum->fastcall) + { + if (mode == BLKmode + || mode == DImode + || (type && AGGREGATE_TYPE_P (type))) + break; - /* If AVX512F is disabled, squash the registers. */ - if (! TARGET_AVX512F) - { - for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++) - fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; + /* ECX not EAX is the first allocated register. */ + if (regno == AX_REG) + regno = CX_REG; + } + return gen_rtx_REG (mode, regno); + } + break; - for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++) - fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = ""; - } -} + case E_DFmode: + if (cum->float_in_sse == -1) + error_p = true; + if (cum->float_in_sse < 2) + break; + /* FALLTHRU */ + case E_SFmode: + if (cum->float_in_sse == -1) + error_p = true; + if (cum->float_in_sse < 1) + break; + /* FALLTHRU */ + case E_TImode: + /* In 32bit, we pass TImode in xmm registers. */ + case E_V16QImode: + case E_V8HImode: + case E_V4SImode: + case E_V2DImode: + case E_V4SFmode: + case E_V2DFmode: + if (!type || !AGGREGATE_TYPE_P (type)) + { + if (cum->sse_nregs) + return gen_reg_or_parallel (mode, orig_mode, + cum->sse_regno + FIRST_SSE_REG); + } + break; -/* Canonicalize a comparison from one we don't have to one we do have. */ + case E_OImode: + case E_XImode: + /* OImode and XImode shouldn't be used directly. */ + gcc_unreachable (); -static void -ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1, - bool op0_preserve_value) -{ - /* The order of operands in x87 ficom compare is forced by combine in - simplify_comparison () function. Float operator is treated as RTX_OBJ - with a precedence over other operators and is always put in the first - place. Swap condition and operands to match ficom instruction. */ - if (!op0_preserve_value - && GET_CODE (*op0) == FLOAT && MEM_P (XEXP (*op0, 0)) && REG_P (*op1)) - { - enum rtx_code scode = swap_condition ((enum rtx_code) *code); + case E_V64QImode: + case E_V32HImode: + case E_V16SImode: + case E_V8DImode: + case E_V16SFmode: + case E_V8DFmode: + case E_V8SFmode: + case E_V8SImode: + case E_V32QImode: + case E_V16HImode: + case E_V4DFmode: + case E_V4DImode: + if (!type || !AGGREGATE_TYPE_P (type)) + { + if (cum->sse_nregs) + return gen_reg_or_parallel (mode, orig_mode, + cum->sse_regno + FIRST_SSE_REG); + } + break; - /* We are called only for compares that are split to SAHF instruction. - Ensure that we have setcc/jcc insn for the swapped condition. 
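For illustration (not part of the patch): the fastcall handling above applies to 32-bit code only. The register assignment in the comments is the documented fastcall convention, with ECX taking the first register-sized argument and EDX the second, while aggregates and 64-bit integers fall back to the stack:

__attribute__((fastcall)) int
sum2 (int a, int b)     /* with -m32: a in %ecx, b in %edx */
{
  return a + b;
}

__attribute__((fastcall)) int
sum_ll (long long x)    /* DImode: passed on the stack, not in ECX/EDX */
{
  return (int) x;
}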
*/ - if (ix86_fp_compare_code_to_integer (scode) != UNKNOWN) + case E_V8QImode: + case E_V4HImode: + case E_V2SImode: + case E_V2SFmode: + case E_V1TImode: + case E_V1DImode: + if (!type || !AGGREGATE_TYPE_P (type)) { - std::swap (*op0, *op1); - *code = (int) scode; + if (cum->mmx_nregs) + return gen_reg_or_parallel (mode, orig_mode, + cum->mmx_regno + FIRST_MMX_REG); } + break; } -} - -/* Save the current options */ - -static void -ix86_function_specific_save (struct cl_target_option *ptr, - struct gcc_options *opts) -{ - ptr->arch = ix86_arch; - ptr->schedule = ix86_schedule; - ptr->prefetch_sse = x86_prefetch_sse; - ptr->tune = ix86_tune; - ptr->branch_cost = ix86_branch_cost; - ptr->tune_defaulted = ix86_tune_defaulted; - ptr->arch_specified = ix86_arch_specified; - ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit; - ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit; - ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit; - ptr->x_ix86_arch_string = opts->x_ix86_arch_string; - ptr->x_ix86_tune_string = opts->x_ix86_tune_string; - ptr->x_ix86_cmodel = opts->x_ix86_cmodel; - ptr->x_ix86_abi = opts->x_ix86_abi; - ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect; - ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost; - ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes; - ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer; - ptr->x_ix86_force_drap = opts->x_ix86_force_drap; - ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg; - ptr->x_ix86_pmode = opts->x_ix86_pmode; - ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg; - ptr->x_ix86_recip_name = opts->x_ix86_recip_name; - ptr->x_ix86_regparm = opts->x_ix86_regparm; - ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold; - ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx; - ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard; - ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg; - ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect; - ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string; - ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy; - ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy; - ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default; - ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type; - - /* The fields are char but the variables are not; make sure the - values fit in the fields. */ - gcc_assert (ptr->arch == ix86_arch); - gcc_assert (ptr->schedule == ix86_schedule); - gcc_assert (ptr->tune == ix86_tune); - gcc_assert (ptr->branch_cost == ix86_branch_cost); -} - -/* Restore the current options */ - -static void -ix86_function_specific_restore (struct gcc_options *opts, - struct cl_target_option *ptr) -{ - enum processor_type old_tune = ix86_tune; - enum processor_type old_arch = ix86_arch; - unsigned HOST_WIDE_INT ix86_arch_mask; - int i; - - /* We don't change -fPIC. 
*/ - opts->x_flag_pic = flag_pic; - - ix86_arch = (enum processor_type) ptr->arch; - ix86_schedule = (enum attr_cpu) ptr->schedule; - ix86_tune = (enum processor_type) ptr->tune; - x86_prefetch_sse = ptr->prefetch_sse; - opts->x_ix86_branch_cost = ptr->branch_cost; - ix86_tune_defaulted = ptr->tune_defaulted; - ix86_arch_specified = ptr->arch_specified; - opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit; - opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit; - opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit; - opts->x_ix86_arch_string = ptr->x_ix86_arch_string; - opts->x_ix86_tune_string = ptr->x_ix86_tune_string; - opts->x_ix86_cmodel = ptr->x_ix86_cmodel; - opts->x_ix86_abi = ptr->x_ix86_abi; - opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect; - opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost; - opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes; - opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer; - opts->x_ix86_force_drap = ptr->x_ix86_force_drap; - opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg; - opts->x_ix86_pmode = ptr->x_ix86_pmode; - opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg; - opts->x_ix86_recip_name = ptr->x_ix86_recip_name; - opts->x_ix86_regparm = ptr->x_ix86_regparm; - opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold; - opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx; - opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard; - opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg; - opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect; - opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string; - opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy; - opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy; - opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default; - opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type; - ix86_tune_cost = processor_cost_table[ix86_tune]; - /* TODO: ix86_cost should be chosen at instruction or function granuality - so for cold code we use size_cost even in !optimize_size compilation. */ - if (opts->x_optimize_size) - ix86_cost = &ix86_size_cost; - else - ix86_cost = ix86_tune_cost; - - /* Recreate the arch feature tests if the arch changed */ - if (old_arch != ix86_arch) + if (error_p) { - ix86_arch_mask = HOST_WIDE_INT_1U << ix86_arch; - for (i = 0; i < X86_ARCH_LAST; ++i) - ix86_arch_features[i] - = !!(initial_ix86_arch_features[i] & ix86_arch_mask); + cum->float_in_sse = 0; + error ("calling %qD with SSE calling convention without " + "SSE/SSE2 enabled", cum->decl); + sorry ("this is a GCC bug that can be worked around by adding " + "attribute used to function called"); } - /* Recreate the tune optimization tests */ - if (old_tune != ix86_tune) - set_ix86_tune_features (ix86_tune, false); + return NULL_RTX; } -/* Adjust target options after streaming them in. This is mainly about - reconciling them with global options. */ - -static void -ix86_function_specific_post_stream_in (struct cl_target_option *ptr) +static rtx +function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode, + machine_mode orig_mode, const_tree type, bool named) { - /* flag_pic is a global option, but ix86_cmodel is target saved option - partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel - for PIC, or error out. 
*/ - if (flag_pic) - switch (ptr->x_ix86_cmodel) - { - case CM_SMALL: - ptr->x_ix86_cmodel = CM_SMALL_PIC; - break; - - case CM_MEDIUM: - ptr->x_ix86_cmodel = CM_MEDIUM_PIC; - break; - - case CM_LARGE: - ptr->x_ix86_cmodel = CM_LARGE_PIC; - break; - - case CM_KERNEL: - error ("code model %s does not support PIC mode", "kernel"); - break; - - default: - break; - } - else - switch (ptr->x_ix86_cmodel) - { - case CM_SMALL_PIC: - ptr->x_ix86_cmodel = CM_SMALL; - break; + /* Handle a hidden AL argument containing number of registers + for varargs x86-64 functions. */ + if (mode == VOIDmode) + return GEN_INT (cum->maybe_vaarg + ? (cum->sse_nregs < 0 + ? X86_64_SSE_REGPARM_MAX + : cum->sse_regno) + : -1); - case CM_MEDIUM_PIC: - ptr->x_ix86_cmodel = CM_MEDIUM; - break; + switch (mode) + { + default: + break; - case CM_LARGE_PIC: - ptr->x_ix86_cmodel = CM_LARGE; - break; + case E_V8SFmode: + case E_V8SImode: + case E_V32QImode: + case E_V16HImode: + case E_V4DFmode: + case E_V4DImode: + case E_V16SFmode: + case E_V16SImode: + case E_V64QImode: + case E_V32HImode: + case E_V8DFmode: + case E_V8DImode: + /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */ + if (!named) + return NULL; + break; + } - default: - break; - } + return construct_container (mode, orig_mode, type, 0, cum->nregs, + cum->sse_nregs, + &x86_64_int_parameter_registers [cum->regno], + cum->sse_regno); } -/* Print the current options */ - -static void -ix86_function_specific_print (FILE *file, int indent, - struct cl_target_option *ptr) +static rtx +function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode, + machine_mode orig_mode, bool named, + HOST_WIDE_INT bytes) { - char *target_string - = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2, - ptr->x_target_flags, ptr->x_ix86_target_flags, - NULL, NULL, ptr->x_ix86_fpmath, false, true); + unsigned int regno; - gcc_assert (ptr->arch < PROCESSOR_max); - fprintf (file, "%*sarch = %d (%s)\n", - indent, "", - ptr->arch, processor_names[ptr->arch]); + /* We need to add clobber for MS_ABI->SYSV ABI calls in expand_call. + We use value of -2 to specify that current function call is MSABI. */ + if (mode == VOIDmode) + return GEN_INT (-2); - gcc_assert (ptr->tune < PROCESSOR_max); - fprintf (file, "%*stune = %d (%s)\n", - indent, "", - ptr->tune, processor_names[ptr->tune]); + /* If we've run out of registers, it goes on the stack. */ + if (cum->nregs == 0) + return NULL_RTX; - fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost); + regno = x86_64_ms_abi_int_parameter_registers[cum->regno]; - if (target_string) + /* Only floating point modes are passed in anything but integer regs. */ + if (TARGET_SSE && (mode == SFmode || mode == DFmode)) { - fprintf (file, "%*s%s\n", indent, "", target_string); - free (target_string); - } -} - - -/* Inner function to process the attribute((target(...))), take an argument and - set the current options from the argument. If we have a list, recursively go - over the list. 
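For illustration (not part of the patch): the "named" checks above mean that a 256-bit or 512-bit vector bound to a declared parameter travels in a YMM/ZMM register, while the same value passed through an ellipsis goes on the stack. A sketch assuming AVX is enabled (-mavx); the function names are arbitrary:

#include <immintrin.h>

extern void fixed (__m256 v);       /* named: v is passed in %ymm0 */
extern void variadic (int n, ...);  /* a __m256 vararg is passed on the stack */

void
caller (__m256 v)
{
  fixed (v);
  variadic (1, v);
}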
*/ - -static bool -ix86_valid_target_attribute_inner_p (tree fndecl, tree args, char *p_strings[], - struct gcc_options *opts, - struct gcc_options *opts_set, - struct gcc_options *enum_opts_set, - bool target_clone_attr) -{ - char *next_optstr; - bool ret = true; - -#define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 } -#define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 } -#define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 } -#define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M } -#define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M } - - enum ix86_opt_type - { - ix86_opt_unknown, - ix86_opt_yes, - ix86_opt_no, - ix86_opt_str, - ix86_opt_enum, - ix86_opt_isa - }; + if (named) + regno = cum->regno + FIRST_SSE_REG; + else + { + rtx t1, t2; - static const struct - { - const char *string; - size_t len; - enum ix86_opt_type type; - int opt; - int mask; - } attrs[] = { - /* isa options */ - IX86_ATTR_ISA ("pconfig", OPT_mpconfig), - IX86_ATTR_ISA ("wbnoinvd", OPT_mwbnoinvd), - IX86_ATTR_ISA ("sgx", OPT_msgx), - IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps), - IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw), - IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq), - IX86_ATTR_ISA ("avx512vbmi2", OPT_mavx512vbmi2), - IX86_ATTR_ISA ("avx512vnni", OPT_mavx512vnni), - IX86_ATTR_ISA ("avx512bitalg", OPT_mavx512bitalg), - - IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi), - IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma), - IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl), - IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw), - IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq), - IX86_ATTR_ISA ("avx512er", OPT_mavx512er), - IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf), - IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd), - IX86_ATTR_ISA ("avx512f", OPT_mavx512f), - IX86_ATTR_ISA ("avx2", OPT_mavx2), - IX86_ATTR_ISA ("fma", OPT_mfma), - IX86_ATTR_ISA ("xop", OPT_mxop), - IX86_ATTR_ISA ("fma4", OPT_mfma4), - IX86_ATTR_ISA ("f16c", OPT_mf16c), - IX86_ATTR_ISA ("avx", OPT_mavx), - IX86_ATTR_ISA ("sse4", OPT_msse4), - IX86_ATTR_ISA ("sse4.2", OPT_msse4_2), - IX86_ATTR_ISA ("sse4.1", OPT_msse4_1), - IX86_ATTR_ISA ("sse4a", OPT_msse4a), - IX86_ATTR_ISA ("ssse3", OPT_mssse3), - IX86_ATTR_ISA ("sse3", OPT_msse3), - IX86_ATTR_ISA ("aes", OPT_maes), - IX86_ATTR_ISA ("sha", OPT_msha), - IX86_ATTR_ISA ("pclmul", OPT_mpclmul), - IX86_ATTR_ISA ("sse2", OPT_msse2), - IX86_ATTR_ISA ("sse", OPT_msse), - IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa), - IX86_ATTR_ISA ("3dnow", OPT_m3dnow), - IX86_ATTR_ISA ("mmx", OPT_mmmx), - IX86_ATTR_ISA ("rtm", OPT_mrtm), - IX86_ATTR_ISA ("prfchw", OPT_mprfchw), - IX86_ATTR_ISA ("rdseed", OPT_mrdseed), - IX86_ATTR_ISA ("adx", OPT_madx), - IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1), - IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt), - IX86_ATTR_ISA ("xsaves", OPT_mxsaves), - IX86_ATTR_ISA ("xsavec", OPT_mxsavec), - IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt), - IX86_ATTR_ISA ("xsave", OPT_mxsave), - IX86_ATTR_ISA ("abm", OPT_mabm), - IX86_ATTR_ISA ("bmi", OPT_mbmi), - IX86_ATTR_ISA ("bmi2", OPT_mbmi2), - IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt), - IX86_ATTR_ISA ("tbm", OPT_mtbm), - IX86_ATTR_ISA ("popcnt", OPT_mpopcnt), - IX86_ATTR_ISA ("cx16", OPT_mcx16), - IX86_ATTR_ISA ("sahf", OPT_msahf), - IX86_ATTR_ISA ("movbe", OPT_mmovbe), - IX86_ATTR_ISA ("crc32", OPT_mcrc32), - IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase), - IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd), - IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx), - IX86_ATTR_ISA ("clzero", OPT_mclzero), - IX86_ATTR_ISA ("pku", 
OPT_mpku), - IX86_ATTR_ISA ("lwp", OPT_mlwp), - IX86_ATTR_ISA ("hle", OPT_mhle), - IX86_ATTR_ISA ("fxsr", OPT_mfxsr), - IX86_ATTR_ISA ("clwb", OPT_mclwb), - IX86_ATTR_ISA ("rdpid", OPT_mrdpid), - IX86_ATTR_ISA ("gfni", OPT_mgfni), - IX86_ATTR_ISA ("shstk", OPT_mshstk), - IX86_ATTR_ISA ("vaes", OPT_mvaes), - IX86_ATTR_ISA ("vpclmulqdq", OPT_mvpclmulqdq), - IX86_ATTR_ISA ("movdiri", OPT_mmovdiri), - IX86_ATTR_ISA ("movdir64b", OPT_mmovdir64b), - IX86_ATTR_ISA ("waitpkg", OPT_mwaitpkg), - IX86_ATTR_ISA ("cldemote", OPT_mcldemote), - IX86_ATTR_ISA ("ptwrite", OPT_mptwrite), - - /* enum options */ - IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_), - - /* string options */ - IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH), - IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE), - - /* flag options */ - IX86_ATTR_YES ("cld", - OPT_mcld, - MASK_CLD), - - IX86_ATTR_NO ("fancy-math-387", - OPT_mfancy_math_387, - MASK_NO_FANCY_MATH_387), - - IX86_ATTR_YES ("ieee-fp", - OPT_mieee_fp, - MASK_IEEE_FP), - - IX86_ATTR_YES ("inline-all-stringops", - OPT_minline_all_stringops, - MASK_INLINE_ALL_STRINGOPS), - - IX86_ATTR_YES ("inline-stringops-dynamically", - OPT_minline_stringops_dynamically, - MASK_INLINE_STRINGOPS_DYNAMICALLY), - - IX86_ATTR_NO ("align-stringops", - OPT_mno_align_stringops, - MASK_NO_ALIGN_STRINGOPS), - - IX86_ATTR_YES ("recip", - OPT_mrecip, - MASK_RECIP), - }; - - location_t loc - = fndecl == NULL ? UNKNOWN_LOCATION : DECL_SOURCE_LOCATION (fndecl); - const char *attr_name = target_clone_attr ? "target_clone" : "target"; - - /* If this is a list, recurse to get the options. */ - if (TREE_CODE (args) == TREE_LIST) - { - bool ret = true; - - for (; args; args = TREE_CHAIN (args)) - if (TREE_VALUE (args) - && !ix86_valid_target_attribute_inner_p (fndecl, TREE_VALUE (args), - p_strings, opts, opts_set, - enum_opts_set, - target_clone_attr)) - ret = false; - - return ret; + /* Unnamed floating parameters are passed in both the + SSE and integer registers. */ + t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG); + t2 = gen_rtx_REG (mode, regno); + t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx); + t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx); + return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2)); + } } - - else if (TREE_CODE (args) != STRING_CST) + /* Handle aggregated types passed in register. */ + if (orig_mode == BLKmode) { - error_at (loc, "attribute %qs argument is not a string", attr_name); - return false; + if (bytes > 0 && bytes <= 8) + mode = (bytes > 4 ? DImode : SImode); + if (mode == BLKmode) + mode = DImode; } - /* Handle multiple arguments separated by commas. */ - next_optstr = ASTRDUP (TREE_STRING_POINTER (args)); - - while (next_optstr && *next_optstr != '\0') - { - char *p = next_optstr; - char *orig_p = p; - char *comma = strchr (next_optstr, ','); - size_t len, opt_len; - int opt; - bool opt_set_p; - char ch; - unsigned i; - enum ix86_opt_type type = ix86_opt_unknown; - int mask = 0; + return gen_reg_or_parallel (mode, orig_mode, regno); +} - if (comma) - { - *comma = '\0'; - len = comma - next_optstr; - next_optstr = comma + 1; - } - else - { - len = strlen (p); - next_optstr = NULL; - } +/* Return where to put the arguments to a function. + Return zero to push the argument on the stack, or a hard register in which to store the argument. - /* Recognize no-xxx. */ - if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-') - { - opt_set_p = false; - p += 3; - len -= 3; - } - else - opt_set_p = true; - - /* Find the option. 
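For illustration (not part of the patch): the strings in the table above are exactly what users write inside the target attribute, optionally prefixed with "no-", alongside the arch=/tune=/fpmath= forms handled separately. The function names below are arbitrary examples:

__attribute__((target("avx2,fma"))) void with_avx2_fma (void) {}
__attribute__((target("no-sse4.2"))) void without_sse42 (void) {}
__attribute__((target("arch=skylake,tune=generic"))) void for_skylake (void) {}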
*/ - ch = *p; - opt = N_OPTS; - for (i = 0; i < ARRAY_SIZE (attrs); i++) - { - type = attrs[i].type; - opt_len = attrs[i].len; - if (ch == attrs[i].string[0] - && ((type != ix86_opt_str && type != ix86_opt_enum) - ? len == opt_len - : len > opt_len) - && memcmp (p, attrs[i].string, opt_len) == 0) - { - opt = attrs[i].opt; - mask = attrs[i].mask; - break; - } - } + MODE is the argument's machine mode. TYPE is the data type of the + argument. It is null for libcalls where that information may not be + available. CUM gives information about the preceding args and about + the function being called. NAMED is nonzero if this argument is a + named parameter (otherwise it is an extra parameter matching an + ellipsis). */ - /* Process the option. */ - if (opt == N_OPTS) - { - error_at (loc, "attribute %qs argument %qs is unknown", - orig_p, attr_name); - ret = false; - } +static rtx +ix86_function_arg (cumulative_args_t cum_v, machine_mode omode, + const_tree type, bool named) +{ + CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); + machine_mode mode = omode; + HOST_WIDE_INT bytes, words; + rtx arg; - else if (type == ix86_opt_isa) + if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL) + { + gcc_assert (type != NULL_TREE); + if (POINTER_TYPE_P (type)) { - struct cl_decoded_option decoded; - - generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded); - ix86_handle_option (opts, opts_set, - &decoded, input_location); + /* This is the pointer argument. */ + gcc_assert (TYPE_MODE (type) == Pmode); + /* It is at -WORD(AP) in the current frame in interrupt and + exception handlers. */ + arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD); } - - else if (type == ix86_opt_yes || type == ix86_opt_no) + else { - if (type == ix86_opt_no) - opt_set_p = !opt_set_p; - - if (opt_set_p) - opts->x_target_flags |= mask; - else - opts->x_target_flags &= ~mask; + gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION + && TREE_CODE (type) == INTEGER_TYPE + && TYPE_MODE (type) == word_mode); + /* The error code is the word-mode integer argument at + -2 * WORD(AP) in the current frame of the exception + handler. */ + arg = gen_rtx_MEM (word_mode, + plus_constant (Pmode, + arg_pointer_rtx, + -2 * UNITS_PER_WORD)); } + return arg; + } - else if (type == ix86_opt_str) - { - if (p_strings[opt]) - { - error_at (loc, "attribute value %qs was already specified " - "in %qs attribute", orig_p, attr_name); - ret = false; - } - else - p_strings[opt] = xstrdup (p + opt_len); - } + if (mode == BLKmode) + bytes = int_size_in_bytes (type); + else + bytes = GET_MODE_SIZE (mode); + words = CEIL (bytes, UNITS_PER_WORD); - else if (type == ix86_opt_enum) - { - bool arg_ok; - int value; + /* To simplify the code below, represent vector types with a vector mode + even if MMX/SSE are not active. */ + if (type && TREE_CODE (type) == VECTOR_TYPE) + mode = type_natural_mode (type, cum, false); - arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET); - if (arg_ok) - set_option (opts, enum_opts_set, opt, value, - p + opt_len, DK_UNSPECIFIED, input_location, - global_dc); - else - { - error_at (loc, "attribute value %qs is unknown in %qs attribute", - orig_p, attr_name); - ret = false; - } - } + if (TARGET_64BIT) + { + enum calling_abi call_abi = cum ? 
cum->call_abi : ix86_abi; + if (call_abi == MS_ABI) + arg = function_arg_ms_64 (cum, mode, omode, named, bytes); else - gcc_unreachable (); + arg = function_arg_64 (cum, mode, omode, type, named); } + else + arg = function_arg_32 (cum, mode, omode, type, bytes, words); - return ret; -} + /* Track if there are outgoing arguments on stack. */ + if (arg == NULL_RTX && cum->caller) + cfun->machine->outgoing_args_on_stack = true; -/* Release allocated strings. */ -static void -release_options_strings (char **option_strings) -{ - /* Free up memory allocated to hold the strings */ - for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++) - free (option_strings[i]); + return arg; } -/* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */ +/* A C expression that indicates when an argument must be passed by + reference. If nonzero for an argument, a copy of that argument is + made in memory and a pointer to the argument is passed instead of + the argument itself. The pointer is passed in whatever way is + appropriate for passing a pointer to that type. */ -tree -ix86_valid_target_attribute_tree (tree fndecl, tree args, - struct gcc_options *opts, - struct gcc_options *opts_set, - bool target_clone_attr) +static bool +ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode, + const_tree type, bool) { - const char *orig_arch_string = opts->x_ix86_arch_string; - const char *orig_tune_string = opts->x_ix86_tune_string; - enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath; - int orig_tune_defaulted = ix86_tune_defaulted; - int orig_arch_specified = ix86_arch_specified; - char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL }; - tree t = NULL_TREE; - struct cl_target_option *def - = TREE_TARGET_OPTION (target_option_default_node); - struct gcc_options enum_opts_set; - - memset (&enum_opts_set, 0, sizeof (enum_opts_set)); - - /* Process each of the options on the chain. */ - if (!ix86_valid_target_attribute_inner_p (fndecl, args, option_strings, opts, - opts_set, &enum_opts_set, - target_clone_attr)) - return error_mark_node; + CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); - /* If the changed options are different from the default, rerun - ix86_option_override_internal, and then save the options away. - The string options are attribute options, and will be undone - when we copy the save structure. */ - if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags - || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2 - || opts->x_target_flags != def->x_target_flags - || option_strings[IX86_FUNCTION_SPECIFIC_ARCH] - || option_strings[IX86_FUNCTION_SPECIFIC_TUNE] - || enum_opts_set.x_ix86_fpmath) + if (TARGET_64BIT) { - /* If we are using the default tune= or arch=, undo the string assigned, - and use the default. */ - if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]) - { - opts->x_ix86_arch_string - = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]); - - /* If arch= is set, clear all bits in x_ix86_isa_flags, - except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. 
*/ - opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT - | OPTION_MASK_ABI_64 - | OPTION_MASK_ABI_X32 - | OPTION_MASK_CODE16); - opts->x_ix86_isa_flags2 = 0; - } - else if (!orig_arch_specified) - opts->x_ix86_arch_string = NULL; - - if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]) - opts->x_ix86_tune_string - = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]); - else if (orig_tune_defaulted) - opts->x_ix86_tune_string = NULL; - - /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */ - if (enum_opts_set.x_ix86_fpmath) - opts_set->x_ix86_fpmath = (enum fpmath_unit) 1; + enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi; - /* Do any overrides, such as arch=xxx, or tune=xxx support. */ - bool r = ix86_option_override_internal (false, opts, opts_set); - if (!r) + /* See Windows x64 Software Convention. */ + if (call_abi == MS_ABI) { - release_options_strings (option_strings); - return error_mark_node; - } - - /* Add any builtin functions with the new isa if any. */ - ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2); + HOST_WIDE_INT msize = GET_MODE_SIZE (mode); - /* Save the current options unless we are validating options for - #pragma. */ - t = build_target_option_node (opts); + if (type) + { + /* Arrays are passed by reference. */ + if (TREE_CODE (type) == ARRAY_TYPE) + return true; - opts->x_ix86_arch_string = orig_arch_string; - opts->x_ix86_tune_string = orig_tune_string; - opts_set->x_ix86_fpmath = orig_fpmath_set; + if (RECORD_OR_UNION_TYPE_P (type)) + { + /* Structs/unions of sizes other than 8, 16, 32, or 64 bits + are passed by reference. */ + msize = int_size_in_bytes (type); + } + } - release_options_strings (option_strings); + /* __m128 is passed by reference. */ + return msize != 1 && msize != 2 && msize != 4 && msize != 8; + } + else if (type && int_size_in_bytes (type) == -1) + return true; } - return t; + return false; } -/* Hook to validate attribute((target("string"))). */ +/* Return true when TYPE should be 128bit aligned for 32bit argument + passing ABI. XXX: This function is obsolete and is only used for + checking psABI compatibility with previous versions of GCC. */ static bool -ix86_valid_target_attribute_p (tree fndecl, - tree ARG_UNUSED (name), - tree args, - int flags) -{ - struct gcc_options func_options; - tree new_target, new_optimize; - bool ret = true; - - /* attribute((target("default"))) does nothing, beyond - affecting multi-versioning. */ - if (TREE_VALUE (args) - && TREE_CODE (TREE_VALUE (args)) == STRING_CST - && TREE_CHAIN (args) == NULL_TREE - && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0) +ix86_compat_aligned_value_p (const_tree type) +{ + machine_mode mode = TYPE_MODE (type); + if (((TARGET_SSE && SSE_REG_MODE_P (mode)) + || mode == TDmode + || mode == TFmode + || mode == TCmode) + && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128)) return true; + if (TYPE_ALIGN (type) < 128) + return false; - tree old_optimize = build_optimization_node (&global_options); - - /* Get the optimization options of the current function. */ - tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl); - - if (!func_optimize) - func_optimize = old_optimize; - - /* Init func_options. */ - memset (&func_options, 0, sizeof (func_options)); - init_options_struct (&func_options, NULL); - lang_hooks.init_options_struct (&func_options); - - cl_optimization_restore (&func_options, - TREE_OPTIMIZATION (func_optimize)); + if (AGGREGATE_TYPE_P (type)) + { + /* Walk the aggregates recursively. 
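For illustration (not part of the patch): the Windows x64 rule tested above passes only arguments of exactly 1, 2, 4 or 8 bytes by value in a register; everything else, including __m128 and odd-sized aggregates, is copied to memory and a pointer is passed instead. The struct names below are arbitrary:

struct by_value { int a, b; };     /* 8 bytes: fits in one integer register */
struct by_ref   { int a, b, c; };  /* 12 bytes: a pointer to a copy is passed */

int
take_both (struct by_value v, struct by_ref r)  /* MS ABI: v in RCX, &r in RDX */
{
  return v.a + r.c;
}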
*/ + switch (TREE_CODE (type)) + { + case RECORD_TYPE: + case UNION_TYPE: + case QUAL_UNION_TYPE: + { + tree field; - /* Initialize func_options to the default before its target options can - be set. */ - cl_target_option_restore (&func_options, - TREE_TARGET_OPTION (target_option_default_node)); + /* Walk all the structure fields. */ + for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) + { + if (TREE_CODE (field) == FIELD_DECL + && ix86_compat_aligned_value_p (TREE_TYPE (field))) + return true; + } + break; + } - /* FLAGS == 1 is used for target_clones attribute. */ - new_target - = ix86_valid_target_attribute_tree (fndecl, args, &func_options, - &global_options_set, flags == 1); + case ARRAY_TYPE: + /* Just for use if some languages passes arrays by value. */ + if (ix86_compat_aligned_value_p (TREE_TYPE (type))) + return true; + break; - new_optimize = build_optimization_node (&func_options); + default: + gcc_unreachable (); + } + } + return false; +} - if (new_target == error_mark_node) - ret = false; +/* Return the alignment boundary for MODE and TYPE with alignment ALIGN. + XXX: This function is obsolete and is only used for checking psABI + compatibility with previous versions of GCC. */ - else if (fndecl && new_target) +static unsigned int +ix86_compat_function_arg_boundary (machine_mode mode, + const_tree type, unsigned int align) +{ + /* In 32bit, only _Decimal128 and __float128 are aligned to their + natural boundaries. */ + if (!TARGET_64BIT && mode != TDmode && mode != TFmode) { - DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target; + /* i386 ABI defines all arguments to be 4 byte aligned. We have to + make an exception for SSE modes since these require 128bit + alignment. - if (old_optimize != new_optimize) - DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize; + The handling here differs from field_alignment. ICC aligns MMX + arguments to 4 byte boundaries, while structure fields are aligned + to 8 byte boundaries. */ + if (!type) + { + if (!(TARGET_SSE && SSE_REG_MODE_P (mode))) + align = PARM_BOUNDARY; + } + else + { + if (!ix86_compat_aligned_value_p (type)) + align = PARM_BOUNDARY; + } } - - finalize_options_struct (&func_options); - - return ret; + if (align > BIGGEST_ALIGNMENT) + align = BIGGEST_ALIGNMENT; + return align; } - -/* Hook to determine if one function can safely inline another. */ +/* Return true when TYPE should be 128bit aligned for 32bit argument + passing ABI. */ static bool -ix86_can_inline_p (tree caller, tree callee) +ix86_contains_aligned_value_p (const_tree type) { - tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller); - tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee); + machine_mode mode = TYPE_MODE (type); - /* Changes of those flags can be tolerated for always inlines. Lets hope - user knows what he is doing. 
*/ - const unsigned HOST_WIDE_INT always_inline_safe_mask - = (MASK_USE_8BIT_IDIV | MASK_ACCUMULATE_OUTGOING_ARGS - | MASK_NO_ALIGN_STRINGOPS | MASK_AVX256_SPLIT_UNALIGNED_LOAD - | MASK_AVX256_SPLIT_UNALIGNED_STORE | MASK_CLD - | MASK_NO_FANCY_MATH_387 | MASK_IEEE_FP | MASK_INLINE_ALL_STRINGOPS - | MASK_INLINE_STRINGOPS_DYNAMICALLY | MASK_RECIP | MASK_STACK_PROBE - | MASK_STV | MASK_TLS_DIRECT_SEG_REFS | MASK_VZEROUPPER - | MASK_NO_PUSH_ARGS | MASK_OMIT_LEAF_FRAME_POINTER); - - - if (!callee_tree) - callee_tree = target_option_default_node; - if (!caller_tree) - caller_tree = target_option_default_node; - if (callee_tree == caller_tree) - return true; - - struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree); - struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree); - bool ret = false; - bool always_inline - = (DECL_DISREGARD_INLINE_LIMITS (callee) - && lookup_attribute ("always_inline", - DECL_ATTRIBUTES (callee))); - - cgraph_node *callee_node = cgraph_node::get (callee); - /* Callee's isa options should be a subset of the caller's, i.e. a SSE4 - function can inline a SSE2 function but a SSE2 function can't inline - a SSE4 function. */ - if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags) - != callee_opts->x_ix86_isa_flags) - || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2) - != callee_opts->x_ix86_isa_flags2)) - ret = false; - - /* See if we have the same non-isa options. */ - else if ((!always_inline - && caller_opts->x_target_flags != callee_opts->x_target_flags) - || (caller_opts->x_target_flags & ~always_inline_safe_mask) - != (callee_opts->x_target_flags & ~always_inline_safe_mask)) - ret = false; - - /* See if arch, tune, etc. are the same. */ - else if (caller_opts->arch != callee_opts->arch) - ret = false; - - else if (!always_inline && caller_opts->tune != callee_opts->tune) - ret = false; - - else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath - /* If the calle doesn't use FP expressions differences in - ix86_fpmath can be ignored. We are called from FEs - for multi-versioning call optimization, so beware of - ipa_fn_summaries not available. */ - && (! ipa_fn_summaries - || ipa_fn_summaries->get (callee_node) == NULL - || ipa_fn_summaries->get (callee_node)->fp_expressions)) - ret = false; - - else if (!always_inline - && caller_opts->branch_cost != callee_opts->branch_cost) - ret = false; - - else - ret = true; - - return ret; -} - - -/* Remember the last target of ix86_set_current_function. */ -static GTY(()) tree ix86_previous_fndecl; - -/* Set targets globals to the default (or current #pragma GCC target - if active). Invalidate ix86_previous_fndecl cache. */ - -void -ix86_reset_previous_fndecl (void) -{ - tree new_tree = target_option_current_node; - cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree)); - if (TREE_TARGET_GLOBALS (new_tree)) - restore_target_globals (TREE_TARGET_GLOBALS (new_tree)); - else if (new_tree == target_option_default_node) - restore_target_globals (&default_target_globals); - else - TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts (); - ix86_previous_fndecl = NULL_TREE; -} + if (mode == XFmode || mode == XCmode) + return false; -/* Set the func_type field from the function FNDECL. 
*/ + if (TYPE_ALIGN (type) < 128) + return false; -static void -ix86_set_func_type (tree fndecl) -{ - if (cfun->machine->func_type == TYPE_UNKNOWN) + if (AGGREGATE_TYPE_P (type)) { - if (lookup_attribute ("interrupt", - TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))) + /* Walk the aggregates recursively. */ + switch (TREE_CODE (type)) { - if (ix86_function_naked (fndecl)) - error_at (DECL_SOURCE_LOCATION (fndecl), - "interrupt and naked attributes are not compatible"); + case RECORD_TYPE: + case UNION_TYPE: + case QUAL_UNION_TYPE: + { + tree field; - int nargs = 0; - for (tree arg = DECL_ARGUMENTS (fndecl); - arg; - arg = TREE_CHAIN (arg)) - nargs++; - cfun->machine->no_caller_saved_registers = true; - cfun->machine->func_type - = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT; + /* Walk all the structure fields. */ + for (field = TYPE_FIELDS (type); + field; + field = DECL_CHAIN (field)) + { + if (TREE_CODE (field) == FIELD_DECL + && ix86_contains_aligned_value_p (TREE_TYPE (field))) + return true; + } + break; + } - ix86_optimize_mode_switching[X86_DIRFLAG] = 1; + case ARRAY_TYPE: + /* Just for use if some languages passes arrays by value. */ + if (ix86_contains_aligned_value_p (TREE_TYPE (type))) + return true; + break; - /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */ - if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG) - sorry ("only DWARF debug format is supported for interrupt " - "service routine"); - } - else - { - cfun->machine->func_type = TYPE_NORMAL; - if (lookup_attribute ("no_caller_saved_registers", - TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))) - cfun->machine->no_caller_saved_registers = true; + default: + gcc_unreachable (); } } -} - -/* Set the indirect_branch_type field from the function FNDECL. */ + else + return TYPE_ALIGN (type) >= 128; -static void -ix86_set_indirect_branch_type (tree fndecl) -{ - if (cfun->machine->indirect_branch_type == indirect_branch_unset) - { - tree attr = lookup_attribute ("indirect_branch", - DECL_ATTRIBUTES (fndecl)); - if (attr != NULL) - { - tree args = TREE_VALUE (attr); - if (args == NULL) - gcc_unreachable (); - tree cst = TREE_VALUE (args); - if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0) - cfun->machine->indirect_branch_type = indirect_branch_keep; - else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0) - cfun->machine->indirect_branch_type = indirect_branch_thunk; - else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0) - cfun->machine->indirect_branch_type = indirect_branch_thunk_inline; - else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0) - cfun->machine->indirect_branch_type = indirect_branch_thunk_extern; - else - gcc_unreachable (); - } - else - cfun->machine->indirect_branch_type = ix86_indirect_branch; + return false; +} - /* -mcmodel=large is not compatible with -mindirect-branch=thunk - nor -mindirect-branch=thunk-extern. */ - if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC) - && ((cfun->machine->indirect_branch_type - == indirect_branch_thunk_extern) - || (cfun->machine->indirect_branch_type - == indirect_branch_thunk))) - error ("%<-mindirect-branch=%s%> and %<-mcmodel=large%> are not " - "compatible", - ((cfun->machine->indirect_branch_type - == indirect_branch_thunk_extern) - ? 
"thunk-extern" : "thunk")); - - if (cfun->machine->indirect_branch_type != indirect_branch_keep - && (flag_cf_protection & CF_RETURN)) - error ("%<-mindirect-branch%> and %<-fcf-protection%> are not " - "compatible"); - } - - if (cfun->machine->function_return_type == indirect_branch_unset) - { - tree attr = lookup_attribute ("function_return", - DECL_ATTRIBUTES (fndecl)); - if (attr != NULL) - { - tree args = TREE_VALUE (attr); - if (args == NULL) - gcc_unreachable (); - tree cst = TREE_VALUE (args); - if (strcmp (TREE_STRING_POINTER (cst), "keep") == 0) - cfun->machine->function_return_type = indirect_branch_keep; - else if (strcmp (TREE_STRING_POINTER (cst), "thunk") == 0) - cfun->machine->function_return_type = indirect_branch_thunk; - else if (strcmp (TREE_STRING_POINTER (cst), "thunk-inline") == 0) - cfun->machine->function_return_type = indirect_branch_thunk_inline; - else if (strcmp (TREE_STRING_POINTER (cst), "thunk-extern") == 0) - cfun->machine->function_return_type = indirect_branch_thunk_extern; - else - gcc_unreachable (); - } - else - cfun->machine->function_return_type = ix86_function_return; +/* Gives the alignment boundary, in bits, of an argument with the + specified mode and type. */ - /* -mcmodel=large is not compatible with -mfunction-return=thunk - nor -mfunction-return=thunk-extern. */ - if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC) - && ((cfun->machine->function_return_type - == indirect_branch_thunk_extern) - || (cfun->machine->function_return_type - == indirect_branch_thunk))) - error ("%<-mfunction-return=%s%> and %<-mcmodel=large%> are not " - "compatible", - ((cfun->machine->function_return_type - == indirect_branch_thunk_extern) - ? "thunk-extern" : "thunk")); - - if (cfun->machine->function_return_type != indirect_branch_keep - && (flag_cf_protection & CF_RETURN)) - error ("%<-mfunction-return%> and %<-fcf-protection%> are not " - "compatible"); - } -} - -/* Establish appropriate back-end context for processing the function - FNDECL. The argument might be NULL to indicate processing at top - level, outside of any function scope. */ -static void -ix86_set_current_function (tree fndecl) +static unsigned int +ix86_function_arg_boundary (machine_mode mode, const_tree type) { - /* Only change the context if the function changes. This hook is called - several times in the course of compiling a function, and we don't want to - slow things down too much or call target_reinit when it isn't safe. */ - if (fndecl == ix86_previous_fndecl) + unsigned int align; + if (type) { - /* There may be 2 function bodies for the same function FNDECL, - one is extern inline and one isn't. Call ix86_set_func_type - to set the func_type field. */ - if (fndecl != NULL_TREE) - { - ix86_set_func_type (fndecl); - ix86_set_indirect_branch_type (fndecl); - } - return; + /* Since the main variant type is used for call, we convert it to + the main variant type. 
*/ + type = TYPE_MAIN_VARIANT (type); + align = TYPE_ALIGN (type); + if (TYPE_EMPTY_P (type)) + return PARM_BOUNDARY; } - - tree old_tree; - if (ix86_previous_fndecl == NULL_TREE) - old_tree = target_option_current_node; - else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)) - old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl); else - old_tree = target_option_default_node; - - if (fndecl == NULL_TREE) + align = GET_MODE_ALIGNMENT (mode); + if (align < PARM_BOUNDARY) + align = PARM_BOUNDARY; + else { - if (old_tree != target_option_current_node) - ix86_reset_previous_fndecl (); - return; - } - - ix86_set_func_type (fndecl); - ix86_set_indirect_branch_type (fndecl); - - tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl); - if (new_tree == NULL_TREE) - new_tree = target_option_default_node; + static bool warned; + unsigned int saved_align = align; - if (old_tree != new_tree) - { - cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree)); - if (TREE_TARGET_GLOBALS (new_tree)) - restore_target_globals (TREE_TARGET_GLOBALS (new_tree)); - else if (new_tree == target_option_default_node) - restore_target_globals (&default_target_globals); - else - TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts (); - } - ix86_previous_fndecl = fndecl; + if (!TARGET_64BIT) + { + /* i386 ABI defines XFmode arguments to be 4 byte aligned. */ + if (!type) + { + if (mode == XFmode || mode == XCmode) + align = PARM_BOUNDARY; + } + else if (!ix86_contains_aligned_value_p (type)) + align = PARM_BOUNDARY; - static bool prev_no_caller_saved_registers; + if (align < 128) + align = PARM_BOUNDARY; + } - /* 64-bit MS and SYSV ABI have different set of call used registers. - Avoid expensive re-initialization of init_regs each time we switch - function context. */ - if (TARGET_64BIT - && (call_used_regs[SI_REG] - == (cfun->machine->call_abi == MS_ABI))) - reinit_regs (); - /* Need to re-initialize init_regs if caller-saved registers are - changed. */ - else if (prev_no_caller_saved_registers - != cfun->machine->no_caller_saved_registers) - reinit_regs (); - - if (cfun->machine->func_type != TYPE_NORMAL - || cfun->machine->no_caller_saved_registers) - { - /* Don't allow SSE, MMX nor x87 instructions since they - may change processor state. */ - const char *isa; - if (TARGET_SSE) - isa = "SSE"; - else if (TARGET_MMX) - isa = "MMX/3Dnow"; - else if (TARGET_80387) - isa = "80387"; - else - isa = NULL; - if (isa != NULL) - { - if (cfun->machine->func_type != TYPE_NORMAL) - sorry (cfun->machine->func_type == TYPE_EXCEPTION - ? G_("%s instructions aren%'t allowed in an" - " exception service routine") - : G_("%s instructions aren%'t allowed in an" - " interrupt service routine"), - isa); - else - sorry ("%s instructions aren%'t allowed in a function with " - "the % attribute", isa); - /* Don't issue the same error twice. */ - cfun->machine->func_type = TYPE_NORMAL; - cfun->machine->no_caller_saved_registers = false; + if (warn_psabi + && !warned + && align != ix86_compat_function_arg_boundary (mode, type, + saved_align)) + { + warned = true; + inform (input_location, + "the ABI for passing parameters with %d-byte" + " alignment has changed in GCC 4.6", + align / BITS_PER_UNIT); } } - prev_no_caller_saved_registers - = cfun->machine->no_caller_saved_registers; + return align; } - -/* Return true if this goes in large data/bss. */ +/* Return true if N is a possible register number of function value. 
*/ static bool -ix86_in_large_data_p (tree exp) +ix86_function_value_regno_p (const unsigned int regno) { - if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC) - return false; - - if (exp == NULL_TREE) - return false; - - /* Functions are never large data. */ - if (TREE_CODE (exp) == FUNCTION_DECL) - return false; + switch (regno) + { + case AX_REG: + return true; + case DX_REG: + return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI); + case DI_REG: + case SI_REG: + return TARGET_64BIT && ix86_cfun_abi () != MS_ABI; - /* Automatic variables are never large data. */ - if (VAR_P (exp) && !is_global_var (exp)) - return false; + /* Complex values are returned in %st(0)/%st(1) pair. */ + case ST0_REG: + case ST1_REG: + /* TODO: The function should depend on current function ABI but + builtins.c would need updating then. Therefore we use the + default ABI. */ + if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI) + return false; + return TARGET_FLOAT_RETURNS_IN_80387; - if (VAR_P (exp) && DECL_SECTION_NAME (exp)) - { - const char *section = DECL_SECTION_NAME (exp); - if (strcmp (section, ".ldata") == 0 - || strcmp (section, ".lbss") == 0) - return true; - return false; - } - else - { - HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp)); + /* Complex values are returned in %xmm0/%xmm1 pair. */ + case XMM0_REG: + case XMM1_REG: + return TARGET_SSE; - /* If this is an incomplete type with size 0, then we can't put it - in data because it might be too big when completed. Also, - int_size_in_bytes returns -1 if size can vary or is larger than - an integer in which case also it is safer to assume that it goes in - large data. */ - if (size <= 0 || size > ix86_section_threshold) - return true; + case MM0_REG: + if (TARGET_MACHO || TARGET_64BIT) + return false; + return TARGET_MMX; } return false; } -/* i386-specific section flag to mark large sections. */ -#define SECTION_LARGE SECTION_MACH_DEP +/* Define how to find the value returned by a function. + VALTYPE is the data type of the value (as a tree). + If the precise function being called is known, FUNC is its FUNCTION_DECL; + otherwise, FUNC is 0. */ -/* Switch to the appropriate section for output of DECL. - DECL is either a `VAR_DECL' node or a constant of some sort. - RELOC indicates whether forming the initial value of DECL requires - link-time relocations. */ - -ATTRIBUTE_UNUSED static section * -x86_64_elf_select_section (tree decl, int reloc, - unsigned HOST_WIDE_INT align) +static rtx +function_value_32 (machine_mode orig_mode, machine_mode mode, + const_tree fntype, const_tree fn) { - if (ix86_in_large_data_p (decl)) - { - const char *sname = NULL; - unsigned int flags = SECTION_WRITE | SECTION_LARGE; - switch (categorize_decl_for_section (decl, reloc)) - { - case SECCAT_DATA: - sname = ".ldata"; - break; - case SECCAT_DATA_REL: - sname = ".ldata.rel"; - break; - case SECCAT_DATA_REL_LOCAL: - sname = ".ldata.rel.local"; - break; - case SECCAT_DATA_REL_RO: - sname = ".ldata.rel.ro"; - break; - case SECCAT_DATA_REL_RO_LOCAL: - sname = ".ldata.rel.ro.local"; - break; - case SECCAT_BSS: - sname = ".lbss"; - flags |= SECTION_BSS; - break; - case SECCAT_RODATA: - case SECCAT_RODATA_MERGE_STR: - case SECCAT_RODATA_MERGE_STR_INIT: - case SECCAT_RODATA_MERGE_CONST: - sname = ".lrodata"; - flags &= ~SECTION_WRITE; - break; - case SECCAT_SRODATA: - case SECCAT_SDATA: - case SECCAT_SBSS: - gcc_unreachable (); - case SECCAT_TEXT: - case SECCAT_TDATA: - case SECCAT_TBSS: - /* We don't split these for medium model. 
Place them into - default sections and hope for best. */ - break; - } - if (sname) - { - /* We might get called with string constants, but get_named_section - doesn't like them as they are not DECLs. Also, we need to set - flags in that case. */ - if (!DECL_P (decl)) - return get_section (sname, flags, NULL); - return get_named_section (decl, sname, reloc); - } - } - return default_elf_select_section (decl, reloc, align); -} + unsigned int regno; -/* Select a set of attributes for section NAME based on the properties - of DECL and whether or not RELOC indicates that DECL's initializer - might contain runtime relocations. */ + /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where + we normally prevent this case when mmx is not available. However + some ABIs may require the result to be returned like DImode. */ + if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8) + regno = FIRST_MMX_REG; -static unsigned int ATTRIBUTE_UNUSED -x86_64_elf_section_type_flags (tree decl, const char *name, int reloc) -{ - unsigned int flags = default_section_type_flags (decl, name, reloc); + /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where + we prevent this case when sse is not available. However some ABIs + may require the result to be returned like integer TImode. */ + else if (mode == TImode + || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16)) + regno = FIRST_SSE_REG; - if (ix86_in_large_data_p (decl)) - flags |= SECTION_LARGE; + /* 32-byte vector modes in %ymm0. */ + else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32) + regno = FIRST_SSE_REG; - if (decl == NULL_TREE - && (strcmp (name, ".ldata.rel.ro") == 0 - || strcmp (name, ".ldata.rel.ro.local") == 0)) - flags |= SECTION_RELRO; + /* 64-byte vector modes in %zmm0. */ + else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64) + regno = FIRST_SSE_REG; - if (strcmp (name, ".lbss") == 0 - || strncmp (name, ".lbss.", 5) == 0 - || strncmp (name, ".gnu.linkonce.lb.", 16) == 0) - flags |= SECTION_BSS; + /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */ + else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387) + regno = FIRST_FLOAT_REG; + else + /* Most things go in %eax. */ + regno = AX_REG; - return flags; -} + /* Override FP return register with %xmm0 for local functions when + SSE math is enabled or for functions with sseregparm attribute. */ + if ((fn || fntype) && (mode == SFmode || mode == DFmode)) + { + int sse_level = ix86_function_sseregparm (fntype, fn, false); + if (sse_level == -1) + { + error ("calling %qD with SSE calling convention without " + "SSE/SSE2 enabled", fn); + sorry ("this is a GCC bug that can be worked around by adding " + "attribute used to function called"); + } + else if ((sse_level >= 1 && mode == SFmode) + || (sse_level == 2 && mode == DFmode)) + regno = FIRST_SSE_REG; + } -/* Build up a unique section name, expressed as a - STRING_CST node, and assign it to DECL_SECTION_NAME (decl). - RELOC indicates whether the initial value of EXP requires - link-time relocations. */ + /* OImode shouldn't be used directly. */ + gcc_assert (mode != OImode); -static void ATTRIBUTE_UNUSED -x86_64_elf_unique_section (tree decl, int reloc) + return gen_rtx_REG (orig_mode, regno); +} + +static rtx +function_value_64 (machine_mode orig_mode, machine_mode mode, + const_tree valtype) { - if (ix86_in_large_data_p (decl)) + rtx ret; + + /* Handle libcalls, which don't provide a type node. 
*/ + if (valtype == NULL) { - const char *prefix = NULL; - /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */ - bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP; + unsigned int regno; - switch (categorize_decl_for_section (decl, reloc)) + switch (mode) { - case SECCAT_DATA: - case SECCAT_DATA_REL: - case SECCAT_DATA_REL_LOCAL: - case SECCAT_DATA_REL_RO: - case SECCAT_DATA_REL_RO_LOCAL: - prefix = one_only ? ".ld" : ".ldata"; - break; - case SECCAT_BSS: - prefix = one_only ? ".lb" : ".lbss"; - break; - case SECCAT_RODATA: - case SECCAT_RODATA_MERGE_STR: - case SECCAT_RODATA_MERGE_STR_INIT: - case SECCAT_RODATA_MERGE_CONST: - prefix = one_only ? ".lr" : ".lrodata"; + case E_SFmode: + case E_SCmode: + case E_DFmode: + case E_DCmode: + case E_TFmode: + case E_SDmode: + case E_DDmode: + case E_TDmode: + regno = FIRST_SSE_REG; break; - case SECCAT_SRODATA: - case SECCAT_SDATA: - case SECCAT_SBSS: - gcc_unreachable (); - case SECCAT_TEXT: - case SECCAT_TDATA: - case SECCAT_TBSS: - /* We don't split these for medium model. Place them into - default sections and hope for best. */ + case E_XFmode: + case E_XCmode: + regno = FIRST_FLOAT_REG; break; + case E_TCmode: + return NULL; + default: + regno = AX_REG; } - if (prefix) - { - const char *name, *linkonce; - char *string; - name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)); - name = targetm.strip_name_encoding (name); + return gen_rtx_REG (mode, regno); + } + else if (POINTER_TYPE_P (valtype)) + { + /* Pointers are always returned in word_mode. */ + mode = word_mode; + } - /* If we're using one_only, then there needs to be a .gnu.linkonce - prefix to the section name. */ - linkonce = one_only ? ".gnu.linkonce" : ""; + ret = construct_container (mode, orig_mode, valtype, 1, + X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX, + x86_64_int_return_registers, 0); - string = ACONCAT ((linkonce, prefix, ".", name, NULL)); + /* For zero sized structures, construct_container returns NULL, but we + need to keep rest of compiler happy by returning meaningful value. */ + if (!ret) + ret = gen_rtx_REG (orig_mode, AX_REG); - set_decl_section_name (decl, string); - return; - } - } - default_unique_section (decl, reloc); + return ret; } -#ifdef COMMON_ASM_OP - -#ifndef LARGECOMM_SECTION_ASM_OP -#define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t" -#endif +static rtx +function_value_ms_32 (machine_mode orig_mode, machine_mode mode, + const_tree fntype, const_tree fn, const_tree valtype) +{ + unsigned int regno; -/* This says how to output assembler code to declare an - uninitialized external linkage data object. + /* Floating point return values in %st(0) + (unless -mno-fp-ret-in-387 or aggregate type of up to 8 bytes). */ + if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387 + && (GET_MODE_SIZE (mode) > 8 + || valtype == NULL_TREE || !AGGREGATE_TYPE_P (valtype))) + { + regno = FIRST_FLOAT_REG; + return gen_rtx_REG (orig_mode, regno); + } + else + return function_value_32(orig_mode, mode, fntype,fn); +} - For medium model x86-64 we need to use LARGECOMM_SECTION_ASM_OP opcode for - large objects. 
*/ -void -x86_elf_aligned_decl_common (FILE *file, tree decl, - const char *name, unsigned HOST_WIDE_INT size, - int align) +static rtx +function_value_ms_64 (machine_mode orig_mode, machine_mode mode, + const_tree valtype) { - if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC) - && size > (unsigned int)ix86_section_threshold) + unsigned int regno = AX_REG; + + if (TARGET_SSE) { - switch_to_section (get_named_section (decl, ".lbss", 0)); - fputs (LARGECOMM_SECTION_ASM_OP, file); + switch (GET_MODE_SIZE (mode)) + { + case 16: + if (valtype != NULL_TREE + && !VECTOR_INTEGER_TYPE_P (valtype) + && !VECTOR_INTEGER_TYPE_P (valtype) + && !INTEGRAL_TYPE_P (valtype) + && !VECTOR_FLOAT_TYPE_P (valtype)) + break; + if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode)) + && !COMPLEX_MODE_P (mode)) + regno = FIRST_SSE_REG; + break; + case 8: + case 4: + if (valtype != NULL_TREE && AGGREGATE_TYPE_P (valtype)) + break; + if (mode == SFmode || mode == DFmode) + regno = FIRST_SSE_REG; + break; + default: + break; + } } - else - fputs (COMMON_ASM_OP, file); - assemble_name (file, name); - fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n", - size, align / BITS_PER_UNIT); + return gen_rtx_REG (orig_mode, regno); } -#endif - -/* Utility function for targets to use in implementing - ASM_OUTPUT_ALIGNED_BSS. */ -void -x86_output_aligned_bss (FILE *file, tree decl, const char *name, - unsigned HOST_WIDE_INT size, int align) +static rtx +ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl, + machine_mode orig_mode, machine_mode mode) { - if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC) - && size > (unsigned int)ix86_section_threshold) - switch_to_section (get_named_section (decl, ".lbss", 0)); + const_tree fn, fntype; + + fn = NULL_TREE; + if (fntype_or_decl && DECL_P (fntype_or_decl)) + fn = fntype_or_decl; + fntype = fn ? TREE_TYPE (fn) : fntype_or_decl; + + if (ix86_function_type_abi (fntype) == MS_ABI) + { + if (TARGET_64BIT) + return function_value_ms_64 (orig_mode, mode, valtype); + else + return function_value_ms_32 (orig_mode, mode, fntype, fn, valtype); + } + else if (TARGET_64BIT) + return function_value_64 (orig_mode, mode, valtype); else - switch_to_section (bss_section); - ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT)); -#ifdef ASM_DECLARE_OBJECT_NAME - last_assemble_variable_decl = decl; - ASM_DECLARE_OBJECT_NAME (file, name, decl); -#else - /* Standard thing is just output label for the object. */ - ASM_OUTPUT_LABEL (file, name); -#endif /* ASM_DECLARE_OBJECT_NAME */ - ASM_OUTPUT_SKIP (file, size ? size : 1); + return function_value_32 (orig_mode, mode, fntype, fn); } - -/* Decide whether we must probe the stack before any space allocation - on this target. It's essentially TARGET_STACK_PROBE except when - -fstack-check causes the stack to be already probed differently. */ -bool -ix86_target_stack_probe (void) +static rtx +ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool) { - /* Do not probe the stack twice if static stack checking is enabled. */ - if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK) - return false; + machine_mode mode, orig_mode; - return TARGET_STACK_PROBE; + orig_mode = TYPE_MODE (valtype); + mode = type_natural_mode (valtype, NULL, true); + return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode); } - -/* Decide whether we can make a sibling call to a function. DECL is the - declaration of the function being targeted by the call and EXP is the - CALL_EXPR representing the call. 
*/ - -static bool -ix86_function_ok_for_sibcall (tree decl, tree exp) -{ - tree type, decl_or_type; - rtx a, b; - bool bind_global = decl && !targetm.binds_local_p (decl); - if (ix86_function_naked (current_function_decl)) - return false; - - /* Sibling call isn't OK if there are no caller-saved registers - since all registers must be preserved before return. */ - if (cfun->machine->no_caller_saved_registers) - return false; - - /* If we are generating position-independent code, we cannot sibcall - optimize direct calls to global functions, as the PLT requires - %ebx be live. (Darwin does not have a PLT.) */ - if (!TARGET_MACHO - && !TARGET_64BIT - && flag_pic - && flag_plt - && bind_global) - return false; - - /* If we need to align the outgoing stack, then sibcalling would - unalign the stack, which may break the called function. */ - if (ix86_minimum_incoming_stack_boundary (true) - < PREFERRED_STACK_BOUNDARY) - return false; - - if (decl) - { - decl_or_type = decl; - type = TREE_TYPE (decl); - } - else - { - /* We're looking at the CALL_EXPR, we need the type of the function. */ - type = CALL_EXPR_FN (exp); /* pointer expression */ - type = TREE_TYPE (type); /* pointer type */ - type = TREE_TYPE (type); /* function type */ - decl_or_type = type; - } +/* Pointer function arguments and return values are promoted to + word_mode for normal functions. */ - /* Check that the return value locations are the same. Like - if we are returning floats on the 80387 register stack, we cannot - make a sibcall from a function that doesn't return a float to a - function that does or, conversely, from a function that does return - a float to a function that doesn't; the necessary stack adjustment - would not be executed. This is also the place we notice - differences in the return value ABI. Note that it is ok for one - of the functions to have void return type as long as the return - value of the other is passed in a register. */ - a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false); - b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)), - cfun->decl, false); - if (STACK_REG_P (a) || STACK_REG_P (b)) +static machine_mode +ix86_promote_function_mode (const_tree type, machine_mode mode, + int *punsignedp, const_tree fntype, + int for_return) +{ + if (cfun->machine->func_type == TYPE_NORMAL + && type != NULL_TREE + && POINTER_TYPE_P (type)) { - if (!rtx_equal_p (a, b)) - return false; + *punsignedp = POINTERS_EXTEND_UNSIGNED; + return word_mode; } - else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl)))) - ; - else if (!rtx_equal_p (a, b)) - return false; + return default_promote_function_mode (type, mode, punsignedp, fntype, + for_return); +} - if (TARGET_64BIT) - { - /* The SYSV ABI has more call-clobbered registers; - disallow sibcalls from MS to SYSV. */ - if (cfun->machine->call_abi == MS_ABI - && ix86_function_type_abi (type) == SYSV_ABI) - return false; - } - else - { - /* If this call is indirect, we'll need to be able to use a - call-clobbered register for the address of the target function. - Make sure that all such registers are not used for passing - parameters. Note that DLLIMPORT functions and call to global - function via GOT slot are indirect. */ - if (!decl - || (bind_global && flag_pic && !flag_plt) - || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)) - || flag_force_indirect_call) - { - /* Check if regparm >= 3 since arg_reg_available is set to - false if regparm == 0. If regparm is 1 or 2, there is - always a call-clobbered register available. 
+/* Return true if a structure, union or array with MODE containing FIELD + should be accessed using BLKmode. */ - ??? The symbol indirect call doesn't need a call-clobbered - register. But we don't know if this is a symbol indirect - call or not here. */ - if (ix86_function_regparm (type, decl) >= 3 - && !cfun->machine->arg_reg_available) - return false; - } - } +static bool +ix86_member_type_forces_blk (const_tree field, machine_mode mode) +{ + /* Union with XFmode must be in BLKmode. */ + return (mode == XFmode + && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE + || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE)); +} - /* Otherwise okay. That also includes certain types of indirect calls. */ - return true; +rtx +ix86_libcall_value (machine_mode mode) +{ + return ix86_function_value_1 (NULL, NULL, mode, mode); } -/* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall", - and "sseregparm" calling convention attributes; - arguments as in struct attribute_spec.handler. */ +/* Return true iff type is returned in memory. */ -static tree -ix86_handle_cconv_attribute (tree *node, tree name, tree args, int, - bool *no_add_attrs) +static bool +ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED) { - if (TREE_CODE (*node) != FUNCTION_TYPE - && TREE_CODE (*node) != METHOD_TYPE - && TREE_CODE (*node) != FIELD_DECL - && TREE_CODE (*node) != TYPE_DECL) - { - warning (OPT_Wattributes, "%qE attribute only applies to functions", - name); - *no_add_attrs = true; - return NULL_TREE; - } +#ifdef SUBTARGET_RETURN_IN_MEMORY + return SUBTARGET_RETURN_IN_MEMORY (type, fntype); +#else + const machine_mode mode = type_natural_mode (type, NULL, true); + HOST_WIDE_INT size; - /* Can combine regparm with all attributes but fastcall, and thiscall. */ - if (is_attribute_p ("regparm", name)) + if (TARGET_64BIT) { - tree cst; - - if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) - { - error ("fastcall and regparm attributes are not compatible"); - } - - if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) + if (ix86_function_type_abi (fntype) == MS_ABI) { - error ("regparam and thiscall attributes are not compatible"); - } + size = int_size_in_bytes (type); - cst = TREE_VALUE (args); - if (TREE_CODE (cst) != INTEGER_CST) - { - warning (OPT_Wattributes, - "%qE attribute requires an integer constant argument", - name); - *no_add_attrs = true; + /* __m128 is returned in xmm0. */ + if ((!type || VECTOR_INTEGER_TYPE_P (type) + || INTEGRAL_TYPE_P (type) + || VECTOR_FLOAT_TYPE_P (type)) + && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode)) + && !COMPLEX_MODE_P (mode) + && (GET_MODE_SIZE (mode) == 16 || size == 16)) + return false; + + /* Otherwise, the size must be exactly in [1248]. */ + return size != 1 && size != 2 && size != 4 && size != 8; } - else if (compare_tree_int (cst, REGPARM_MAX) > 0) + else { - warning (OPT_Wattributes, "argument to %qE attribute larger than %d", - name, REGPARM_MAX); - *no_add_attrs = true; - } + int needed_intregs, needed_sseregs; - return NULL_TREE; + return examine_argument (mode, type, 1, + &needed_intregs, &needed_sseregs); + } } - - if (TARGET_64BIT) + else { - /* Do not warn when emulating the MS ABI. 
*/ - if ((TREE_CODE (*node) != FUNCTION_TYPE - && TREE_CODE (*node) != METHOD_TYPE) - || ix86_function_type_abi (*node) != MS_ABI) - warning (OPT_Wattributes, "%qE attribute ignored", - name); - *no_add_attrs = true; - return NULL_TREE; - } + size = int_size_in_bytes (type); - /* Can combine fastcall with stdcall (redundant) and sseregparm. */ - if (is_attribute_p ("fastcall", name)) - { - if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) - { - error ("fastcall and cdecl attributes are not compatible"); - } - if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) - { - error ("fastcall and stdcall attributes are not compatible"); - } - if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node))) - { - error ("fastcall and regparm attributes are not compatible"); - } - if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) - { - error ("fastcall and thiscall attributes are not compatible"); - } - } + /* Intel MCU psABI returns scalars and aggregates no larger than 8 + bytes in registers. */ + if (TARGET_IAMCU) + return VECTOR_MODE_P (mode) || size < 0 || size > 8; - /* Can combine stdcall with fastcall (redundant), regparm and - sseregparm. */ - else if (is_attribute_p ("stdcall", name)) - { - if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) - { - error ("stdcall and cdecl attributes are not compatible"); - } - if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) - { - error ("stdcall and fastcall attributes are not compatible"); - } - if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) - { - error ("stdcall and thiscall attributes are not compatible"); - } - } + if (mode == BLKmode) + return true; - /* Can combine cdecl with regparm and sseregparm. */ - else if (is_attribute_p ("cdecl", name)) - { - if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) - { - error ("stdcall and cdecl attributes are not compatible"); - } - if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) - { - error ("fastcall and cdecl attributes are not compatible"); - } - if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) - { - error ("cdecl and thiscall attributes are not compatible"); - } - } - else if (is_attribute_p ("thiscall", name)) - { - if (TREE_CODE (*node) != METHOD_TYPE && pedantic) - warning (OPT_Wattributes, "%qE attribute is used for non-class method", - name); - if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) - { - error ("stdcall and thiscall attributes are not compatible"); - } - if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) - { - error ("fastcall and thiscall attributes are not compatible"); - } - if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) + if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8) + return false; + + if (VECTOR_MODE_P (mode) || mode == TImode) { - error ("cdecl and thiscall attributes are not compatible"); - } - } + /* User-created vectors small enough to fit in EAX. */ + if (size < 8) + return false; - /* Can combine sseregparm with all attributes. */ + /* Unless ABI prescibes otherwise, + MMX/3dNow values are returned in MM0 if available. */ + + if (size == 8) + return TARGET_VECT8_RETURNS || !TARGET_MMX; - return NULL_TREE; -} + /* SSE values are returned in XMM0 if available. */ + if (size == 16) + return !TARGET_SSE; -/* The transactional memory builtins are implicitly regparm or fastcall - depending on the ABI. Override the generic do-nothing attribute that - these builtins were declared with, and replace it with one of the two - attributes that we expect elsewhere. 
*/ + /* AVX values are returned in YMM0 if available. */ + if (size == 32) + return !TARGET_AVX; -static tree -ix86_handle_tm_regparm_attribute (tree *node, tree, tree, - int flags, bool *no_add_attrs) -{ - tree alt; + /* AVX512F values are returned in ZMM0 if available. */ + if (size == 64) + return !TARGET_AVX512F; + } - /* In no case do we want to add the placeholder attribute. */ - *no_add_attrs = true; + if (mode == XFmode) + return false; - /* The 64-bit ABI is unchanged for transactional memory. */ - if (TARGET_64BIT) - return NULL_TREE; + if (size > 12) + return true; - /* ??? Is there a better way to validate 32-bit windows? We have - cfun->machine->call_abi, but that seems to be set only for 64-bit. */ - if (CHECK_STACK_LIMIT > 0) - alt = tree_cons (get_identifier ("fastcall"), NULL, NULL); - else - { - alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL); - alt = tree_cons (get_identifier ("regparm"), alt, NULL); - } - decl_attributes (node, alt, flags); + /* OImode shouldn't be used directly. */ + gcc_assert (mode != OImode); - return NULL_TREE; + return false; + } +#endif } -/* This function determines from TYPE the calling-convention. */ + +/* Create the va_list data type. */ -unsigned int -ix86_get_callcvt (const_tree type) +static tree +ix86_build_builtin_va_list_64 (void) { - unsigned int ret = 0; - bool is_stdarg; - tree attrs; + tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl; - if (TARGET_64BIT) - return IX86_CALLCVT_CDECL; + record = lang_hooks.types.make_type (RECORD_TYPE); + type_decl = build_decl (BUILTINS_LOCATION, + TYPE_DECL, get_identifier ("__va_list_tag"), record); - attrs = TYPE_ATTRIBUTES (type); - if (attrs != NULL_TREE) - { - if (lookup_attribute ("cdecl", attrs)) - ret |= IX86_CALLCVT_CDECL; - else if (lookup_attribute ("stdcall", attrs)) - ret |= IX86_CALLCVT_STDCALL; - else if (lookup_attribute ("fastcall", attrs)) - ret |= IX86_CALLCVT_FASTCALL; - else if (lookup_attribute ("thiscall", attrs)) - ret |= IX86_CALLCVT_THISCALL; + f_gpr = build_decl (BUILTINS_LOCATION, + FIELD_DECL, get_identifier ("gp_offset"), + unsigned_type_node); + f_fpr = build_decl (BUILTINS_LOCATION, + FIELD_DECL, get_identifier ("fp_offset"), + unsigned_type_node); + f_ovf = build_decl (BUILTINS_LOCATION, + FIELD_DECL, get_identifier ("overflow_arg_area"), + ptr_type_node); + f_sav = build_decl (BUILTINS_LOCATION, + FIELD_DECL, get_identifier ("reg_save_area"), + ptr_type_node); - /* Regparam isn't allowed for thiscall and fastcall. 
*/ - if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0) - { - if (lookup_attribute ("regparm", attrs)) - ret |= IX86_CALLCVT_REGPARM; - if (lookup_attribute ("sseregparm", attrs)) - ret |= IX86_CALLCVT_SSEREGPARM; - } + va_list_gpr_counter_field = f_gpr; + va_list_fpr_counter_field = f_fpr; - if (IX86_BASE_CALLCVT(ret) != 0) - return ret; - } + DECL_FIELD_CONTEXT (f_gpr) = record; + DECL_FIELD_CONTEXT (f_fpr) = record; + DECL_FIELD_CONTEXT (f_ovf) = record; + DECL_FIELD_CONTEXT (f_sav) = record; - is_stdarg = stdarg_p (type); - if (TARGET_RTD && !is_stdarg) - return IX86_CALLCVT_STDCALL | ret; + TYPE_STUB_DECL (record) = type_decl; + TYPE_NAME (record) = type_decl; + TYPE_FIELDS (record) = f_gpr; + DECL_CHAIN (f_gpr) = f_fpr; + DECL_CHAIN (f_fpr) = f_ovf; + DECL_CHAIN (f_ovf) = f_sav; - if (ret != 0 - || is_stdarg - || TREE_CODE (type) != METHOD_TYPE - || ix86_function_type_abi (type) != MS_ABI) - return IX86_CALLCVT_CDECL | ret; + layout_type (record); - return IX86_CALLCVT_THISCALL; + TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"), + NULL_TREE, TYPE_ATTRIBUTES (record)); + + /* The correct type is an array type of one element. */ + return build_array_type (record, build_index_type (size_zero_node)); } -/* Return 0 if the attributes for two types are incompatible, 1 if they - are compatible, and 2 if they are nearly compatible (which causes a - warning to be generated). */ +/* Setup the builtin va_list data type and for 64-bit the additional + calling convention specific va_list data types. */ -static int -ix86_comp_type_attributes (const_tree type1, const_tree type2) +static tree +ix86_build_builtin_va_list (void) { - unsigned int ccvt1, ccvt2; - - if (TREE_CODE (type1) != FUNCTION_TYPE - && TREE_CODE (type1) != METHOD_TYPE) - return 1; - - ccvt1 = ix86_get_callcvt (type1); - ccvt2 = ix86_get_callcvt (type2); - if (ccvt1 != ccvt2) - return 0; - if (ix86_function_regparm (type1, NULL) - != ix86_function_regparm (type2, NULL)) - return 0; + if (TARGET_64BIT) + { + /* Initialize ABI specific va_list builtin types. - return 1; -} - -/* Return the regparm value for a function with the indicated TYPE and DECL. - DECL may be NULL when calling function indirectly - or considering a libcall. */ + In lto1, we can encounter two va_list types: + - one as a result of the type-merge across TUs, and + - the one constructed here. + These two types will not have the same TYPE_MAIN_VARIANT, and therefore + a type identity check in canonical_va_list_type based on + TYPE_MAIN_VARIANT (which we used to have) will not work. + Instead, we tag each va_list_type_node with its unique attribute, and + look for the attribute in the type identity check in + canonical_va_list_type. -static int -ix86_function_regparm (const_tree type, const_tree decl) -{ - tree attr; - int regparm; - unsigned int ccvt; + Tagging sysv_va_list_type_node directly with the attribute is + problematic since it's a array of one record, which will degrade into a + pointer to record when used as parameter (see build_va_arg comments for + an example), dropping the attribute in the process. So we tag the + record instead. */ - if (TARGET_64BIT) - return (ix86_function_type_abi (type) == SYSV_ABI - ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX); - ccvt = ix86_get_callcvt (type); - regparm = ix86_regparm; + /* For SYSV_ABI we use an array of one record. */ + sysv_va_list_type_node = ix86_build_builtin_va_list_64 (); + + /* For MS_ABI we use plain pointer to argument area. 
*/ + tree char_ptr_type = build_pointer_type (char_type_node); + tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE, + TYPE_ATTRIBUTES (char_ptr_type)); + ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr); - if ((ccvt & IX86_CALLCVT_REGPARM) != 0) + return ((ix86_abi == MS_ABI) + ? ms_va_list_type_node + : sysv_va_list_type_node); + } + else { - attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type)); - if (attr) - { - regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))); - return regparm; - } + /* For i386 we use plain pointer to argument area. */ + return build_pointer_type (char_type_node); } - else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) - return 2; - else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) - return 1; +} - /* Use register calling convention for local functions when possible. */ - if (decl - && TREE_CODE (decl) == FUNCTION_DECL) - { - cgraph_node *target = cgraph_node::get (decl); - if (target) - target = target->function_symbol (); +/* Worker function for TARGET_SETUP_INCOMING_VARARGS. */ - /* Caller and callee must agree on the calling convention, so - checking here just optimize means that with - __attribute__((optimize (...))) caller could use regparm convention - and callee not, or vice versa. Instead look at whether the callee - is optimized or not. */ - if (target && opt_for_fn (target->decl, optimize) - && !(profile_flag && !flag_fentry)) - { - cgraph_local_info *i = &target->local; - if (i && i->local && i->can_change_signature) - { - int local_regparm, globals = 0, regno; +static void +setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum) +{ + rtx save_area, mem; + alias_set_type set; + int i, max; - /* Make sure no regparm register is taken by a - fixed register variable. */ - for (local_regparm = 0; local_regparm < REGPARM_MAX; - local_regparm++) - if (fixed_regs[local_regparm]) - break; + /* GPR size of varargs save area. */ + if (cfun->va_list_gpr_size) + ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD; + else + ix86_varargs_gpr_size = 0; - /* We don't want to use regparm(3) for nested functions as - these use a static chain pointer in the third argument. */ - if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl)) - local_regparm = 2; + /* FPR size of varargs save area. We don't need it if we don't pass + anything in SSE registers. */ + if (TARGET_SSE && cfun->va_list_fpr_size) + ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16; + else + ix86_varargs_fpr_size = 0; - /* Save a register for the split stack. */ - if (flag_split_stack) - { - if (local_regparm == 3) - local_regparm = 2; - else if (local_regparm == 2 - && DECL_STATIC_CHAIN (target->decl)) - local_regparm = 1; - } + if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size) + return; - /* Each fixed register usage increases register pressure, - so less registers should be used for argument passing. - This functionality can be overriden by an explicit - regparm value. */ - for (regno = AX_REG; regno <= DI_REG; regno++) - if (fixed_regs[regno]) - globals++; + save_area = frame_pointer_rtx; + set = get_varargs_alias_set (); - local_regparm - = globals < local_regparm ? 
local_regparm - globals : 0; + max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD; + if (max > X86_64_REGPARM_MAX) + max = X86_64_REGPARM_MAX; - if (local_regparm > regparm) - regparm = local_regparm; - } - } + for (i = cum->regno; i < max; i++) + { + mem = gen_rtx_MEM (word_mode, + plus_constant (Pmode, save_area, i * UNITS_PER_WORD)); + MEM_NOTRAP_P (mem) = 1; + set_mem_alias_set (mem, set); + emit_move_insn (mem, + gen_rtx_REG (word_mode, + x86_64_int_parameter_registers[i])); } - return regparm; -} - -/* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and - DFmode (2) arguments in SSE registers for a function with the - indicated TYPE and DECL. DECL may be NULL when calling function - indirectly or considering a libcall. Return -1 if any FP parameter - should be rejected by error. This is used in siutation we imply SSE - calling convetion but the function is called from another function with - SSE disabled. Otherwise return 0. */ - -static int -ix86_function_sseregparm (const_tree type, const_tree decl, bool warn) -{ - gcc_assert (!TARGET_64BIT); - - /* Use SSE registers to pass SFmode and DFmode arguments if requested - by the sseregparm attribute. */ - if (TARGET_SSEREGPARM - || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type)))) + if (ix86_varargs_fpr_size) { - if (!TARGET_SSE) - { - if (warn) - { - if (decl) - error ("calling %qD with attribute sseregparm without " - "SSE/SSE2 enabled", decl); - else - error ("calling %qT with attribute sseregparm without " - "SSE/SSE2 enabled", type); - } - return 0; - } + machine_mode smode; + rtx_code_label *label; + rtx test; - return 2; - } + /* Now emit code to save SSE registers. The AX parameter contains number + of SSE parameter registers used to call this function, though all we + actually check here is the zero/non-zero status. */ - if (!decl) - return 0; + label = gen_label_rtx (); + test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx); + emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1), + label)); - cgraph_node *target = cgraph_node::get (decl); - if (target) - target = target->function_symbol (); + /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if + we used movdqa (i.e. TImode) instead? Perhaps even better would + be if we could determine the real mode of the data, via a hook + into pass_stdarg. Ignore all that for now. */ + smode = V4SFmode; + if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode)) + crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode); - /* For local functions, pass up to SSE_REGPARM_MAX SFmode - (and DFmode for SSE2) arguments in SSE registers. */ - if (target - /* TARGET_SSE_MATH */ - && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE) - && opt_for_fn (target->decl, optimize) - && !(profile_flag && !flag_fentry)) - { - cgraph_local_info *i = &target->local; - if (i && i->local && i->can_change_signature) + max = cum->sse_regno + cfun->va_list_fpr_size / 16; + if (max > X86_64_SSE_REGPARM_MAX) + max = X86_64_SSE_REGPARM_MAX; + + for (i = cum->sse_regno; i < max; ++i) { - /* Refuse to produce wrong code when local function with SSE enabled - is called from SSE disabled function. - FIXME: We need a way to detect these cases cross-ltrans partition - and avoid using SSE calling conventions on local functions called - from function with SSE disabled. For now at least delay the - warning until we know we are going to produce wrong code. 
- See PR66047 */ - if (!TARGET_SSE && warn) - return -1; - return TARGET_SSE2_P (target_opts_for_fn (target->decl) - ->x_ix86_isa_flags) ? 2 : 1; + mem = plus_constant (Pmode, save_area, + i * 16 + ix86_varargs_gpr_size); + mem = gen_rtx_MEM (smode, mem); + MEM_NOTRAP_P (mem) = 1; + set_mem_alias_set (mem, set); + set_mem_align (mem, GET_MODE_ALIGNMENT (smode)); + + emit_move_insn (mem, gen_rtx_REG (smode, GET_SSE_REGNO (i))); } - } - return 0; + emit_label (label); + } } -/* Return true if EAX is live at the start of the function. Used by - ix86_expand_prologue to determine if we need special help before - calling allocate_stack_worker. */ - -static bool -ix86_eax_live_at_start_p (void) +static void +setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum) { - /* Cheat. Don't bother working forward from ix86_function_regparm - to the function type to whether an actual argument is located in - eax. Instead just look at cfg info, which is still close enough - to correct at this point. This gives false positives for broken - functions that might use uninitialized data that happens to be - allocated in eax, but who cares? */ - return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0); -} + alias_set_type set = get_varargs_alias_set (); + int i; -static bool -ix86_keep_aggregate_return_pointer (tree fntype) -{ - tree attr; + /* Reset to zero, as there might be a sysv vaarg used + before. */ + ix86_varargs_gpr_size = 0; + ix86_varargs_fpr_size = 0; - if (!TARGET_64BIT) + for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++) { - attr = lookup_attribute ("callee_pop_aggregate_return", - TYPE_ATTRIBUTES (fntype)); - if (attr) - return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0); + rtx reg, mem; - /* For 32-bit MS-ABI the default is to keep aggregate - return pointer. */ - if (ix86_function_type_abi (fntype) == MS_ABI) - return true; + mem = gen_rtx_MEM (Pmode, + plus_constant (Pmode, virtual_incoming_args_rtx, + i * UNITS_PER_WORD)); + MEM_NOTRAP_P (mem) = 1; + set_mem_alias_set (mem, set); + + reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]); + emit_move_insn (mem, reg); } - return KEEP_AGGREGATE_RETURN_POINTER != 0; } -/* Value is the number of bytes of arguments automatically - popped when returning from a subroutine call. - FUNDECL is the declaration node of the function (as a tree), - FUNTYPE is the data type of the function (as a tree), - or for a library call it is an identifier node for the subroutine name. - SIZE is the number of bytes of arguments passed on the stack. - - On the 80386, the RTD insn may be used to pop them if the number - of args is fixed, but if the number is variable then the caller - must pop them all. RTD can't be used for library calls now - because the library is compiled with the Unix compiler. - Use of RTD is a selectable option, since it is incompatible with - standard Unix calling sequences. If the option is not selected, - the caller must always pop the args. - - The attribute stdcall is equivalent to RTD on a per module basis. */ - -static poly_int64 -ix86_return_pops_args (tree fundecl, tree funtype, poly_int64 size) +static void +ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode, + tree type, int *, int no_rtl) { - unsigned int ccvt; + CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); + CUMULATIVE_ARGS next_cum; + tree fntype; - /* None of the 64-bit ABIs pop arguments. */ - if (TARGET_64BIT) - return 0; + /* This argument doesn't appear to be used anymore. 
Which is good, + because the old code here didn't suppress rtl generation. */ + gcc_assert (!no_rtl); - ccvt = ix86_get_callcvt (funtype); + if (!TARGET_64BIT) + return; - if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL - | IX86_CALLCVT_THISCALL)) != 0 - && ! stdarg_p (funtype)) - return size; + fntype = TREE_TYPE (current_function_decl); - /* Lose any fake structure return argument if it is passed on the stack. */ - if (aggregate_value_p (TREE_TYPE (funtype), fundecl) - && !ix86_keep_aggregate_return_pointer (funtype)) - { - int nregs = ix86_function_regparm (funtype, fundecl); - if (nregs == 0) - return GET_MODE_SIZE (Pmode); - } + /* For varargs, we do not want to skip the dummy va_dcl argument. + For stdargs, we do want to skip the last named argument. */ + next_cum = *cum; + if (stdarg_p (fntype)) + ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type, + true); - return 0; + if (cum->call_abi == MS_ABI) + setup_incoming_varargs_ms_64 (&next_cum); + else + setup_incoming_varargs_64 (&next_cum); } -/* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */ +static void +ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v, + machine_mode mode, + tree type, + int *pretend_size ATTRIBUTE_UNUSED, + int no_rtl) +{ + CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); + CUMULATIVE_ARGS next_cum; + tree fntype; + int max; + + gcc_assert (!no_rtl); + + /* Do nothing if we use plain pointer to argument area. */ + if (!TARGET_64BIT || cum->call_abi == MS_ABI) + return; + + fntype = TREE_TYPE (current_function_decl); + + /* For varargs, we do not want to skip the dummy va_dcl argument. + For stdargs, we do want to skip the last named argument. */ + next_cum = *cum; + if (stdarg_p (fntype)) + ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type, + true); + + max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD; + if (max > X86_64_REGPARM_MAX) + max = X86_64_REGPARM_MAX; +} + + +/* Checks if TYPE is of kind va_list char *. */ static bool -ix86_legitimate_combined_insn (rtx_insn *insn) +is_va_list_char_pointer (tree type) { - int i; + tree canonic; - /* Check operand constraints in case hard registers were propagated - into insn pattern. This check prevents combine pass from - generating insn patterns with invalid hard register operands. - These invalid insns can eventually confuse reload to error out - with a spill failure. See also PRs 46829 and 46843. */ + /* For 32-bit it is always true. */ + if (!TARGET_64BIT) + return true; + canonic = ix86_canonical_va_list_type (type); + return (canonic == ms_va_list_type_node + || (ix86_abi == MS_ABI && canonic == va_list_type_node)); +} - gcc_assert (INSN_CODE (insn) >= 0); +/* Implement va_start. */ - extract_insn (insn); - preprocess_constraints (insn); +static void +ix86_va_start (tree valist, rtx nextarg) +{ + HOST_WIDE_INT words, n_gpr, n_fpr; + tree f_gpr, f_fpr, f_ovf, f_sav; + tree gpr, fpr, ovf, sav, t; + tree type; + rtx ovf_rtx; - int n_operands = recog_data.n_operands; - int n_alternatives = recog_data.n_alternatives; - for (i = 0; i < n_operands; i++) + if (flag_split_stack + && cfun->machine->split_stack_varargs_pointer == NULL_RTX) { - rtx op = recog_data.operand[i]; - machine_mode mode = GET_MODE (op); - const operand_alternative *op_alt; - int offset = 0; - bool win; - int j; + unsigned int scratch_regno; - /* A unary operator may be accepted by the predicate, but it - is irrelevant for matching constraints. 
*/ - if (UNARY_P (op)) - op = XEXP (op, 0); + /* When we are splitting the stack, we can't refer to the stack + arguments using internal_arg_pointer, because they may be on + the old stack. The split stack prologue will arrange to + leave a pointer to the old stack arguments in a scratch + register, which we here copy to a pseudo-register. The split + stack prologue can't set the pseudo-register directly because + it (the prologue) runs before any registers have been saved. */ - if (SUBREG_P (op)) + scratch_regno = split_stack_prologue_scratch_regno (); + if (scratch_regno != INVALID_REGNUM) { - if (REG_P (SUBREG_REG (op)) - && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER) - offset = subreg_regno_offset (REGNO (SUBREG_REG (op)), - GET_MODE (SUBREG_REG (op)), - SUBREG_BYTE (op), - GET_MODE (op)); - op = SUBREG_REG (op); - } + rtx reg; + rtx_insn *seq; - if (!(REG_P (op) && HARD_REGISTER_P (op))) - continue; + reg = gen_reg_rtx (Pmode); + cfun->machine->split_stack_varargs_pointer = reg; - op_alt = recog_op_alt; + start_sequence (); + emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno)); + seq = get_insns (); + end_sequence (); - /* Operand has no constraints, anything is OK. */ - win = !n_alternatives; + push_topmost_sequence (); + emit_insn_after (seq, entry_of_function ()); + pop_topmost_sequence (); + } + } - alternative_mask preferred = get_preferred_alternatives (insn); - for (j = 0; j < n_alternatives; j++, op_alt += n_operands) + /* Only 64bit target needs something special. */ + if (is_va_list_char_pointer (TREE_TYPE (valist))) + { + if (cfun->machine->split_stack_varargs_pointer == NULL_RTX) + std_expand_builtin_va_start (valist, nextarg); + else { - if (!TEST_BIT (preferred, j)) - continue; - if (op_alt[i].anything_ok - || (op_alt[i].matches != -1 - && operands_match_p - (recog_data.operand[i], - recog_data.operand[op_alt[i].matches])) - || reg_fits_class_p (op, op_alt[i].cl, offset, mode)) - { - win = true; - break; - } - } + rtx va_r, next; - if (!win) - return false; + va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE); + next = expand_binop (ptr_mode, add_optab, + cfun->machine->split_stack_varargs_pointer, + crtl->args.arg_offset_rtx, + NULL_RTX, 0, OPTAB_LIB_WIDEN); + convert_move (va_r, next, 0); + } + return; } - return true; -} - -/* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */ + f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node)); + f_fpr = DECL_CHAIN (f_gpr); + f_ovf = DECL_CHAIN (f_fpr); + f_sav = DECL_CHAIN (f_ovf); -static unsigned HOST_WIDE_INT -ix86_asan_shadow_offset (void) -{ - return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44) - : HOST_WIDE_INT_C (0x7fff8000)) - : (HOST_WIDE_INT_1 << 29); -} - -/* Argument support functions. */ + valist = build_simple_mem_ref (valist); + TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node); + /* The following should be folded into the MEM_REF offset. */ + gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist), + f_gpr, NULL_TREE); + fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist), + f_fpr, NULL_TREE); + ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist), + f_ovf, NULL_TREE); + sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist), + f_sav, NULL_TREE); -/* Return true when register may be used to pass function parameters. */ -bool -ix86_function_arg_regno_p (int regno) -{ - int i; - enum calling_abi call_abi; - const int *parm_regs; + /* Count number of gp and fp argument registers used. 
*/ + words = crtl->args.info.words; + n_gpr = crtl->args.info.regno; + n_fpr = crtl->args.info.sse_regno; - if (!TARGET_64BIT) + if (cfun->va_list_gpr_size) { - if (TARGET_MACHO) - return (regno < REGPARM_MAX - || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno])); - else - return (regno < REGPARM_MAX - || (TARGET_MMX && MMX_REGNO_P (regno) - && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX)) - || (TARGET_SSE && SSE_REGNO_P (regno) - && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))); + type = TREE_TYPE (gpr); + t = build2 (MODIFY_EXPR, type, + gpr, build_int_cst (type, n_gpr * 8)); + TREE_SIDE_EFFECTS (t) = 1; + expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); } - if (TARGET_SSE && SSE_REGNO_P (regno) - && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)) - return true; - - /* TODO: The function should depend on current function ABI but - builtins.c would need updating then. Therefore we use the - default ABI. */ - call_abi = ix86_cfun_abi (); - - /* RAX is used as hidden argument to va_arg functions. */ - if (call_abi == SYSV_ABI && regno == AX_REG) - return true; + if (TARGET_SSE && cfun->va_list_fpr_size) + { + type = TREE_TYPE (fpr); + t = build2 (MODIFY_EXPR, type, fpr, + build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX)); + TREE_SIDE_EFFECTS (t) = 1; + expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); + } - if (call_abi == MS_ABI) - parm_regs = x86_64_ms_abi_int_parameter_registers; + /* Find the overflow area. */ + type = TREE_TYPE (ovf); + if (cfun->machine->split_stack_varargs_pointer == NULL_RTX) + ovf_rtx = crtl->args.internal_arg_pointer; else - parm_regs = x86_64_int_parameter_registers; + ovf_rtx = cfun->machine->split_stack_varargs_pointer; + t = make_tree (type, ovf_rtx); + if (words != 0) + t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD); - for (i = 0; i < (call_abi == MS_ABI - ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++) - if (regno == parm_regs[i]) - return true; - return false; -} - -/* Return if we do not know how to pass TYPE solely in registers. */ + t = build2 (MODIFY_EXPR, type, ovf, t); + TREE_SIDE_EFFECTS (t) = 1; + expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); -static bool -ix86_must_pass_in_stack (machine_mode mode, const_tree type) -{ - if (must_pass_in_stack_var_size_or_pad (mode, type)) - return true; + if (ix86_varargs_gpr_size || ix86_varargs_fpr_size) + { + /* Find the register save area. + Prologue of the function save it right above stack frame. */ + type = TREE_TYPE (sav); + t = make_tree (type, frame_pointer_rtx); + if (!ix86_varargs_gpr_size) + t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX); - /* For 32-bit, we want TImode aggregates to go on the stack. But watch out! - The layout_type routine is crafty and tries to trick us into passing - currently unsupported vector types on the stack by using TImode. */ - return (!TARGET_64BIT && mode == TImode - && type && TREE_CODE (type) != VECTOR_TYPE); + t = build2 (MODIFY_EXPR, type, sav, t); + TREE_SIDE_EFFECTS (t) = 1; + expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); + } } -/* It returns the size, in bytes, of the area reserved for arguments passed - in registers for the function represented by fndecl dependent to the used - abi format. 
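For reference, the va_list record that ix86_va_start above fills in follows the SysV x86-64 psABI layout; a minimal sketch, with field names taken from the psABI rather than from this patch, is:

  typedef struct {
    unsigned int gp_offset;      /* next unused GP slot in reg_save_area; set to 8 * n_gpr            */
    unsigned int fp_offset;      /* next unused SSE slot; set to 8 * X86_64_REGPARM_MAX + 16 * n_fpr  */
    void *overflow_arg_area;     /* first stack-passed (overflow) argument                            */
    void *reg_save_area;         /* block where the prologue dumped the argument registers            */
  } sketch_va_list[1];           /* va_list is an array of one such record                            */

The assignments in ix86_va_start above initialize exactly these four fields from crtl->args.info and the frame pointer.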
*/ -int -ix86_reg_parm_stack_space (const_tree fndecl) -{ - enum calling_abi call_abi = SYSV_ABI; - if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL) - call_abi = ix86_function_abi (fndecl); - else - call_abi = ix86_function_type_abi (fndecl); - if (TARGET_64BIT && call_abi == MS_ABI) - return 32; - return 0; -} +/* Implement va_arg. */ -/* We add this as a workaround in order to use libc_has_function - hook in i386.md. */ -bool -ix86_libc_has_function (enum function_class fn_class) +static tree +ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, + gimple_seq *post_p) { - return targetm.libc_has_function (fn_class); -} + static const int intreg[6] = { 0, 1, 2, 3, 4, 5 }; + tree f_gpr, f_fpr, f_ovf, f_sav; + tree gpr, fpr, ovf, sav, t; + int size, rsize; + tree lab_false, lab_over = NULL_TREE; + tree addr, t2; + rtx container; + int indirect_p = 0; + tree ptrtype; + machine_mode nat_mode; + unsigned int arg_boundary; -/* Returns value SYSV_ABI, MS_ABI dependent on fntype, - specifying the call abi used. */ -enum calling_abi -ix86_function_type_abi (const_tree fntype) -{ - enum calling_abi abi = ix86_abi; + /* Only 64bit target needs something special. */ + if (is_va_list_char_pointer (TREE_TYPE (valist))) + return std_gimplify_va_arg_expr (valist, type, pre_p, post_p); - if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE) - return abi; + f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node)); + f_fpr = DECL_CHAIN (f_gpr); + f_ovf = DECL_CHAIN (f_fpr); + f_sav = DECL_CHAIN (f_ovf); - if (abi == SYSV_ABI - && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype))) + gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), + valist, f_gpr, NULL_TREE); + + fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE); + ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE); + sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE); + + indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false); + if (indirect_p) + type = build_pointer_type (type); + size = arg_int_size_in_bytes (type); + rsize = CEIL (size, UNITS_PER_WORD); + + nat_mode = type_natural_mode (type, NULL, false); + switch (nat_mode) { - static int warned; - if (TARGET_X32 && !warned) + case E_V8SFmode: + case E_V8SImode: + case E_V32QImode: + case E_V16HImode: + case E_V4DFmode: + case E_V4DImode: + case E_V16SFmode: + case E_V16SImode: + case E_V64QImode: + case E_V32HImode: + case E_V8DFmode: + case E_V8DImode: + /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */ + if (!TARGET_64BIT_MS_ABI) { - error ("X32 does not support ms_abi attribute"); - warned = 1; + container = NULL; + break; } + /* FALLTHRU */ - abi = MS_ABI; + default: + container = construct_container (nat_mode, TYPE_MODE (type), + type, 0, X86_64_REGPARM_MAX, + X86_64_SSE_REGPARM_MAX, intreg, + 0); + break; } - else if (abi == MS_ABI - && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype))) - abi = SYSV_ABI; - - return abi; -} -static enum calling_abi -ix86_function_abi (const_tree fndecl) -{ - return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi; -} + /* Pull the value out of the saved registers. */ -/* Returns value SYSV_ABI, MS_ABI dependent on cfun, - specifying the call abi used. */ -enum calling_abi -ix86_cfun_abi (void) -{ - return cfun ? 
cfun->machine->call_abi : ix86_abi; -} + addr = create_tmp_var (ptr_type_node, "addr"); -static bool -ix86_function_ms_hook_prologue (const_tree fn) -{ - if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn))) + if (container) { - if (decl_function_context (fn) != NULL_TREE) - error_at (DECL_SOURCE_LOCATION (fn), - "ms_hook_prologue is not compatible with nested function"); - else - return true; - } - return false; -} - -static bool -ix86_function_naked (const_tree fn) -{ - if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn))) - return true; - - return false; -} - -/* Write the extra assembler code needed to declare a function properly. */ + int needed_intregs, needed_sseregs; + bool need_temp; + tree int_addr, sse_addr; -void -ix86_asm_output_function_label (FILE *asm_out_file, const char *fname, - tree decl) -{ - bool is_ms_hook = ix86_function_ms_hook_prologue (decl); + lab_false = create_artificial_label (UNKNOWN_LOCATION); + lab_over = create_artificial_label (UNKNOWN_LOCATION); - if (is_ms_hook) - { - int i, filler_count = (TARGET_64BIT ? 32 : 16); - unsigned int filler_cc = 0xcccccccc; + examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs); - for (i = 0; i < filler_count; i += 4) - fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc); - } + need_temp = (!REG_P (container) + && ((needed_intregs && TYPE_ALIGN (type) > 64) + || TYPE_ALIGN (type) > 128)); -#ifdef SUBTARGET_ASM_UNWIND_INIT - SUBTARGET_ASM_UNWIND_INIT (asm_out_file); -#endif + /* In case we are passing structure, verify that it is consecutive block + on the register save area. If not we need to do moves. */ + if (!need_temp && !REG_P (container)) + { + /* Verify that all registers are strictly consecutive */ + if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0)))) + { + int i; - ASM_OUTPUT_LABEL (asm_out_file, fname); + for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++) + { + rtx slot = XVECEXP (container, 0, i); + if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i + || INTVAL (XEXP (slot, 1)) != i * 16) + need_temp = true; + } + } + else + { + int i; - /* Output magic byte marker, if hot-patch attribute is set. */ - if (is_ms_hook) - { - if (TARGET_64BIT) + for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++) + { + rtx slot = XVECEXP (container, 0, i); + if (REGNO (XEXP (slot, 0)) != (unsigned int) i + || INTVAL (XEXP (slot, 1)) != i * 8) + need_temp = true; + } + } + } + if (!need_temp) { - /* leaq [%rsp + 0], %rsp */ - fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n", - asm_out_file); + int_addr = addr; + sse_addr = addr; } else { - /* movl.s %edi, %edi - push %ebp - movl.s %esp, %ebp */ - fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file); + int_addr = create_tmp_var (ptr_type_node, "int_addr"); + sse_addr = create_tmp_var (ptr_type_node, "sse_addr"); } - } -} - -/* Implementation of call abi switching target hook. Specific to FNDECL - the specific call register sets are set. See also - ix86_conditional_register_usage for more details. */ -void -ix86_call_abi_override (const_tree fndecl) -{ - cfun->machine->call_abi = ix86_function_abi (fndecl); -} -/* Return 1 if pseudo register should be created and used to hold - GOT address for PIC code. */ -bool -ix86_use_pseudo_pic_reg (void) -{ - if ((TARGET_64BIT - && (ix86_cmodel == CM_SMALL_PIC - || TARGET_PECOFF)) - || !flag_pic) - return false; - return true; -} + /* First ensure that we fit completely in registers. 
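In rough pseudo-C, the sequence being built from here on for a register-classifiable argument amounts to the following. This is an illustrative sketch only: the real output is GIMPLE, align/ROUND_UP are shorthand, and the constants 48 and 176 assume the usual X86_64_REGPARM_MAX == 6 and X86_64_SSE_REGPARM_MAX == 8.

  if (ap->gp_offset > 48 - 8 * needed_intregs
      || ap->fp_offset > 176 - 16 * needed_sseregs)
    goto overflow;                                  /* lab_false */
  addr = ap->reg_save_area + ap->gp_offset;         /* int_addr; SSE pieces use fp_offset */
  ap->gp_offset += 8 * needed_intregs;
  ap->fp_offset += 16 * needed_sseregs;
  goto done;                                        /* lab_over */
 overflow:
  addr = align (ap->overflow_arg_area, arg_boundary);
  ap->overflow_arg_area = addr + ROUND_UP (size, UNITS_PER_WORD);
 done:
  /* The value is then read through addr, possibly after the piecewise
     copy into va_arg_tmp that the need_temp path in the surrounding
     code takes care of.  */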
*/ + if (needed_intregs) + { + t = build_int_cst (TREE_TYPE (gpr), + (X86_64_REGPARM_MAX - needed_intregs + 1) * 8); + t = build2 (GE_EXPR, boolean_type_node, gpr, t); + t2 = build1 (GOTO_EXPR, void_type_node, lab_false); + t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE); + gimplify_and_add (t, pre_p); + } + if (needed_sseregs) + { + t = build_int_cst (TREE_TYPE (fpr), + (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16 + + X86_64_REGPARM_MAX * 8); + t = build2 (GE_EXPR, boolean_type_node, fpr, t); + t2 = build1 (GOTO_EXPR, void_type_node, lab_false); + t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE); + gimplify_and_add (t, pre_p); + } -/* Initialize large model PIC register. */ + /* Compute index to start of area used for integer regs. */ + if (needed_intregs) + { + /* int_addr = gpr + sav; */ + t = fold_build_pointer_plus (sav, gpr); + gimplify_assign (int_addr, t, pre_p); + } + if (needed_sseregs) + { + /* sse_addr = fpr + sav; */ + t = fold_build_pointer_plus (sav, fpr); + gimplify_assign (sse_addr, t, pre_p); + } + if (need_temp) + { + int i, prev_size = 0; + tree temp = create_tmp_var (type, "va_arg_tmp"); -static void -ix86_init_large_pic_reg (unsigned int tmp_regno) -{ - rtx_code_label *label; - rtx tmp_reg; + /* addr = &temp; */ + t = build1 (ADDR_EXPR, build_pointer_type (type), temp); + gimplify_assign (addr, t, pre_p); - gcc_assert (Pmode == DImode); - label = gen_label_rtx (); - emit_label (label); - LABEL_PRESERVE_P (label) = 1; - tmp_reg = gen_rtx_REG (Pmode, tmp_regno); - gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno); - emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx, - label)); - emit_insn (gen_set_got_offset_rex64 (tmp_reg, label)); - emit_insn (ix86_gen_add3 (pic_offset_table_rtx, - pic_offset_table_rtx, tmp_reg)); - const char *name = LABEL_NAME (label); - PUT_CODE (label, NOTE); - NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL; - NOTE_DELETED_LABEL_NAME (label) = name; -} + for (i = 0; i < XVECLEN (container, 0); i++) + { + rtx slot = XVECEXP (container, 0, i); + rtx reg = XEXP (slot, 0); + machine_mode mode = GET_MODE (reg); + tree piece_type; + tree addr_type; + tree daddr_type; + tree src_addr, src; + int src_offset; + tree dest_addr, dest; + int cur_size = GET_MODE_SIZE (mode); -/* Create and initialize PIC register if required. 
*/ -static void -ix86_init_pic_reg (void) -{ - edge entry_edge; - rtx_insn *seq; + gcc_assert (prev_size <= INTVAL (XEXP (slot, 1))); + prev_size = INTVAL (XEXP (slot, 1)); + if (prev_size + cur_size > size) + { + cur_size = size - prev_size; + unsigned int nbits = cur_size * BITS_PER_UNIT; + if (!int_mode_for_size (nbits, 1).exists (&mode)) + mode = QImode; + } + piece_type = lang_hooks.types.type_for_mode (mode, 1); + if (mode == GET_MODE (reg)) + addr_type = build_pointer_type (piece_type); + else + addr_type = build_pointer_type_for_mode (piece_type, ptr_mode, + true); + daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode, + true); - if (!ix86_use_pseudo_pic_reg ()) - return; + if (SSE_REGNO_P (REGNO (reg))) + { + src_addr = sse_addr; + src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16; + } + else + { + src_addr = int_addr; + src_offset = REGNO (reg) * 8; + } + src_addr = fold_convert (addr_type, src_addr); + src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset); - start_sequence (); + dest_addr = fold_convert (daddr_type, addr); + dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size); + if (cur_size == GET_MODE_SIZE (mode)) + { + src = build_va_arg_indirect_ref (src_addr); + dest = build_va_arg_indirect_ref (dest_addr); - if (TARGET_64BIT) - { - if (ix86_cmodel == CM_LARGE_PIC) - ix86_init_large_pic_reg (R11_REG); - else - emit_insn (gen_set_got_rex64 (pic_offset_table_rtx)); + gimplify_assign (dest, src, pre_p); + } + else + { + tree copy + = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY), + 3, dest_addr, src_addr, + size_int (cur_size)); + gimplify_and_add (copy, pre_p); + } + prev_size += cur_size; + } + } + + if (needed_intregs) + { + t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr, + build_int_cst (TREE_TYPE (gpr), needed_intregs * 8)); + gimplify_assign (gpr, t, pre_p); + } + + if (needed_sseregs) + { + t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr, + build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16)); + gimplify_assign (unshare_expr (fpr), t, pre_p); + } + + gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over)); + + gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false)); } - else + + /* ... otherwise out of the overflow area. */ + + /* When we align parameter on stack for caller, if the parameter + alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be + aligned at MAX_SUPPORTED_STACK_ALIGNMENT. We will match callee + here with caller. */ + arg_boundary = ix86_function_arg_boundary (VOIDmode, type); + if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT) + arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT; + + /* Care for on-stack alignment if needed. */ + if (arg_boundary <= 64 || size == 0) + t = ovf; + else { - /* If there is future mcount call in the function it is more profitable - to emit SET_GOT into ABI defined REAL_PIC_OFFSET_TABLE_REGNUM. */ - rtx reg = crtl->profile - ? 
gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM) - : pic_offset_table_rtx; - rtx_insn *insn = emit_insn (gen_set_got (reg)); - RTX_FRAME_RELATED_P (insn) = 1; - if (crtl->profile) - emit_move_insn (pic_offset_table_rtx, reg); - add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX); + HOST_WIDE_INT align = arg_boundary / 8; + t = fold_build_pointer_plus_hwi (ovf, align - 1); + t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, + build_int_cst (TREE_TYPE (t), -align)); } - seq = get_insns (); - end_sequence (); + gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue); + gimplify_assign (addr, t, pre_p); - entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun)); - insert_insn_on_edge (seq, entry_edge); - commit_one_edge_insertion (entry_edge); -} + t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD); + gimplify_assign (unshare_expr (ovf), t, pre_p); -/* Initialize a variable CUM of type CUMULATIVE_ARGS - for a call to a function whose data type is FNTYPE. - For a library call, FNTYPE is 0. */ + if (container) + gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over)); -void -init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */ - tree fntype, /* tree ptr for function decl */ - rtx libname, /* SYMBOL_REF of library name or 0 */ - tree fndecl, - int caller) + ptrtype = build_pointer_type_for_mode (type, ptr_mode, true); + addr = fold_convert (ptrtype, addr); + + if (indirect_p) + addr = build_va_arg_indirect_ref (addr); + return build_va_arg_indirect_ref (addr); +} + +/* Return true if OPNUM's MEM should be matched + in movabs* patterns. */ + +bool +ix86_check_movabs (rtx insn, int opnum) { - struct cgraph_local_info *i = NULL; - struct cgraph_node *target = NULL; + rtx set, mem; - memset (cum, 0, sizeof (*cum)); + set = PATTERN (insn); + if (GET_CODE (set) == PARALLEL) + set = XVECEXP (set, 0, 0); + gcc_assert (GET_CODE (set) == SET); + mem = XEXP (set, opnum); + while (SUBREG_P (mem)) + mem = SUBREG_REG (mem); + gcc_assert (MEM_P (mem)); + return volatile_ok || !MEM_VOLATILE_P (mem); +} - if (fndecl) +/* Return false if INSN contains a MEM with a non-default address space. */ +bool +ix86_check_no_addr_space (rtx insn) +{ + subrtx_var_iterator::array_type array; + FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL) { - target = cgraph_node::get (fndecl); - if (target) - { - target = target->function_symbol (); - i = cgraph_node::local_info (target->decl); - cum->call_abi = ix86_function_abi (target->decl); - } - else - cum->call_abi = ix86_function_abi (fndecl); + rtx x = *iter; + if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x))) + return false; } - else - cum->call_abi = ix86_function_type_abi (fntype); + return true; +} + +/* Initialize the table of extra 80387 mathematical constants. */ - cum->caller = caller; +static void +init_ext_80387_constants (void) +{ + static const char * cst[5] = + { + "0.3010299956639811952256464283594894482", /* 0: fldlg2 */ + "0.6931471805599453094286904741849753009", /* 1: fldln2 */ + "1.4426950408889634073876517827983434472", /* 2: fldl2e */ + "3.3219280948873623478083405569094566090", /* 3: fldl2t */ + "3.1415926535897932385128089594061862044", /* 4: fldpi */ + }; + int i; - /* Set up the number of registers to use for passing arguments. */ - cum->nregs = ix86_regparm; - if (TARGET_64BIT) - { - cum->nregs = (cum->call_abi == SYSV_ABI - ? 
X86_64_REGPARM_MAX - : X86_64_MS_REGPARM_MAX); - } - if (TARGET_SSE) + for (i = 0; i < 5; i++) { - cum->sse_nregs = SSE_REGPARM_MAX; - if (TARGET_64BIT) - { - cum->sse_nregs = (cum->call_abi == SYSV_ABI - ? X86_64_SSE_REGPARM_MAX - : X86_64_MS_SSE_REGPARM_MAX); - } + real_from_string (&ext_80387_constants_table[i], cst[i]); + /* Ensure each constant is rounded to XFmode precision. */ + real_convert (&ext_80387_constants_table[i], + XFmode, &ext_80387_constants_table[i]); } - if (TARGET_MMX) - cum->mmx_nregs = MMX_REGPARM_MAX; - cum->warn_avx512f = true; - cum->warn_avx = true; - cum->warn_sse = true; - cum->warn_mmx = true; - /* Because type might mismatch in between caller and callee, we need to - use actual type of function for local calls. - FIXME: cgraph_analyze can be told to actually record if function uses - va_start so for local functions maybe_vaarg can be made aggressive - helping K&R code. - FIXME: once typesytem is fixed, we won't need this code anymore. */ - if (i && i->local && i->can_change_signature) - fntype = TREE_TYPE (target->decl); - cum->stdarg = stdarg_p (fntype); - cum->maybe_vaarg = (fntype - ? (!prototype_p (fntype) || stdarg_p (fntype)) - : !libname); + ext_80387_constants_init = 1; +} - cum->decl = fndecl; +/* Return non-zero if the constant is something that + can be loaded with a special instruction. */ - cum->warn_empty = !warn_abi || cum->stdarg; - if (!cum->warn_empty && fntype) - { - function_args_iterator iter; - tree argtype; - bool seen_empty_type = false; - FOREACH_FUNCTION_ARGS (fntype, argtype, iter) - { - if (argtype == error_mark_node || VOID_TYPE_P (argtype)) - break; - if (TYPE_EMPTY_P (argtype)) - seen_empty_type = true; - else if (seen_empty_type) - { - cum->warn_empty = true; - break; - } - } - } +int +standard_80387_constant_p (rtx x) +{ + machine_mode mode = GET_MODE (x); - if (!TARGET_64BIT) + const REAL_VALUE_TYPE *r; + + if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode))) + return -1; + + if (x == CONST0_RTX (mode)) + return 1; + if (x == CONST1_RTX (mode)) + return 2; + + r = CONST_DOUBLE_REAL_VALUE (x); + + /* For XFmode constants, try to find a special 80387 instruction when + optimizing for size or on those CPUs that benefit from them. */ + if (mode == XFmode + && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS)) { - /* If there are variable arguments, then we won't pass anything - in registers in 32-bit mode. */ - if (stdarg_p (fntype)) - { - cum->nregs = 0; - /* Since in 32-bit, variable arguments are always passed on - stack, there is scratch register available for indirect - sibcall. */ - cfun->machine->arg_reg_available = true; - cum->sse_nregs = 0; - cum->mmx_nregs = 0; - cum->warn_avx512f = false; - cum->warn_avx = false; - cum->warn_sse = false; - cum->warn_mmx = false; - return; - } + int i; - /* Use ecx and edx registers if function has fastcall attribute, - else look for regparm information. */ - if (fntype) - { - unsigned int ccvt = ix86_get_callcvt (fntype); - if ((ccvt & IX86_CALLCVT_THISCALL) != 0) - { - cum->nregs = 1; - cum->fastcall = 1; /* Same first register as in fastcall. */ - } - else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) - { - cum->nregs = 2; - cum->fastcall = 1; - } - else - cum->nregs = ix86_function_regparm (fntype, fndecl); - } + if (! ext_80387_constants_init) + init_ext_80387_constants (); - /* Set up the number of SSE registers used for passing SFmode - and DFmode arguments. Warn for mismatching ABI. 
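As a concrete illustration of the 32-bit conventions selected just above (the declarations are hypothetical, not part of this change):

  int __attribute__((fastcall)) f (int a, int b, int c);
       /* a in %ecx, b in %edx, c on the stack   (cum->nregs == 2) */
  int __attribute__((thiscall)) g (void *obj, int a);
       /* obj in %ecx, a on the stack            (cum->nregs == 1) */
  int __attribute__((regparm(3))) h (int a, int b, int c);
       /* a in %eax, b in %edx, c in %ecx        (ix86_function_regparm) */

Variadic 32-bit functions fall outside all of these: as the code above notes, their arguments always go on the stack.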
*/ - cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true); + for (i = 0; i < 5; i++) + if (real_identical (r, &ext_80387_constants_table[i])) + return i + 3; } - cfun->machine->arg_reg_available = (cum->nregs > 0); -} + /* Load of the constant -0.0 or -1.0 will be split as + fldz;fchs or fld1;fchs sequence. */ + if (real_isnegzero (r)) + return 8; + if (real_identical (r, &dconstm1)) + return 9; -/* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE. - But in the case of vector types, it is some vector mode. + return 0; +} - When we have only some of our vector isa extensions enabled, then there - are some modes for which vector_mode_supported_p is false. For these - modes, the generic vector support in gcc will choose some non-vector mode - in order to implement the type. By computing the natural mode, we'll - select the proper ABI location for the operand and not depend on whatever - the middle-end decides to do with these vector types. +/* Return the opcode of the special instruction to be used to load + the constant X. */ - The midde-end can't deal with the vector types > 16 bytes. In this - case, we return the original mode and warn ABI change if CUM isn't - NULL. +const char * +standard_80387_constant_opcode (rtx x) +{ + switch (standard_80387_constant_p (x)) + { + case 1: + return "fldz"; + case 2: + return "fld1"; + case 3: + return "fldlg2"; + case 4: + return "fldln2"; + case 5: + return "fldl2e"; + case 6: + return "fldl2t"; + case 7: + return "fldpi"; + case 8: + case 9: + return "#"; + default: + gcc_unreachable (); + } +} - If INT_RETURN is true, warn ABI change if the vector mode isn't - available for function return value. */ +/* Return the CONST_DOUBLE representing the 80387 constant that is + loaded by the specified special instruction. The argument IDX + matches the return value from standard_80387_constant_p. */ -static machine_mode -type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum, - bool in_return) +rtx +standard_80387_constant_rtx (int idx) { - machine_mode mode = TYPE_MODE (type); + int i; - if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode)) + if (! ext_80387_constants_init) + init_ext_80387_constants (); + + switch (idx) { - HOST_WIDE_INT size = int_size_in_bytes (type); - if ((size == 8 || size == 16 || size == 32 || size == 64) - /* ??? Generic code allows us to create width 1 vectors. Ignore. */ - && TYPE_VECTOR_SUBPARTS (type) > 1) - { - machine_mode innermode = TYPE_MODE (TREE_TYPE (type)); + case 3: + case 4: + case 5: + case 6: + case 7: + i = idx - 3; + break; - /* There are no XFmode vector modes. */ - if (innermode == XFmode) - return mode; + default: + gcc_unreachable (); + } - if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE) - mode = MIN_MODE_VECTOR_FLOAT; - else - mode = MIN_MODE_VECTOR_INT; + return const_double_from_real_value (ext_80387_constants_table[i], + XFmode); +} - /* Get the mode which has this inner mode and number of units. */ - FOR_EACH_MODE_FROM (mode, mode) - if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type) - && GET_MODE_INNER (mode) == innermode) - { - if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU) - { - static bool warnedavx512f; - static bool warnedavx512f_ret; +/* Return 1 if X is all bits 0 and 2 if X is all bits 1 + in supported SSE/AVX vector mode. 
*/ - if (cum && cum->warn_avx512f && !warnedavx512f) - { - if (warning (OPT_Wpsabi, "AVX512F vector argument " - "without AVX512F enabled changes the ABI")) - warnedavx512f = true; - } - else if (in_return && !warnedavx512f_ret) - { - if (warning (OPT_Wpsabi, "AVX512F vector return " - "without AVX512F enabled changes the ABI")) - warnedavx512f_ret = true; - } - - return TYPE_MODE (type); - } - else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU) - { - static bool warnedavx; - static bool warnedavx_ret; +int +standard_sse_constant_p (rtx x, machine_mode pred_mode) +{ + machine_mode mode; - if (cum && cum->warn_avx && !warnedavx) - { - if (warning (OPT_Wpsabi, "AVX vector argument " - "without AVX enabled changes the ABI")) - warnedavx = true; - } - else if (in_return && !warnedavx_ret) - { - if (warning (OPT_Wpsabi, "AVX vector return " - "without AVX enabled changes the ABI")) - warnedavx_ret = true; - } + if (!TARGET_SSE) + return 0; - return TYPE_MODE (type); - } - else if (((size == 8 && TARGET_64BIT) || size == 16) - && !TARGET_SSE - && !TARGET_IAMCU) - { - static bool warnedsse; - static bool warnedsse_ret; + mode = GET_MODE (x); - if (cum && cum->warn_sse && !warnedsse) - { - if (warning (OPT_Wpsabi, "SSE vector argument " - "without SSE enabled changes the ABI")) - warnedsse = true; - } - else if (!TARGET_64BIT && in_return && !warnedsse_ret) - { - if (warning (OPT_Wpsabi, "SSE vector return " - "without SSE enabled changes the ABI")) - warnedsse_ret = true; - } - } - else if ((size == 8 && !TARGET_64BIT) - && (!cfun - || cfun->machine->func_type == TYPE_NORMAL) - && !TARGET_MMX - && !TARGET_IAMCU) - { - static bool warnedmmx; - static bool warnedmmx_ret; + if (x == const0_rtx || const0_operand (x, mode)) + return 1; - if (cum && cum->warn_mmx && !warnedmmx) - { - if (warning (OPT_Wpsabi, "MMX vector argument " - "without MMX enabled changes the ABI")) - warnedmmx = true; - } - else if (in_return && !warnedmmx_ret) - { - if (warning (OPT_Wpsabi, "MMX vector return " - "without MMX enabled changes the ABI")) - warnedmmx_ret = true; - } - } - return mode; - } + if (x == constm1_rtx || vector_all_ones_operand (x, mode)) + { + /* VOIDmode integer constant, get mode from the predicate. */ + if (mode == VOIDmode) + mode = pred_mode; + switch (GET_MODE_SIZE (mode)) + { + case 64: + if (TARGET_AVX512F) + return 2; + break; + case 32: + if (TARGET_AVX2) + return 2; + break; + case 16: + if (TARGET_SSE2) + return 2; + break; + case 0: + /* VOIDmode */ gcc_unreachable (); + default: + break; } } - return mode; + return 0; } -/* We want to pass a value in REGNO whose "natural" mode is MODE. However, - this may not agree with the mode that the type system has chosen for the - register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can - go ahead and use it. Otherwise we have to build a PARALLEL instead. */ +/* Return the opcode of the special instruction to be used to load + the constant operands[1] into operands[0]. 
*/ -static rtx -gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode, - unsigned int regno) +const char * +standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) { - rtx tmp; + machine_mode mode; + rtx x = operands[1]; - if (orig_mode != BLKmode) - tmp = gen_rtx_REG (orig_mode, regno); - else + gcc_assert (TARGET_SSE); + + mode = GET_MODE (x); + + if (x == const0_rtx || const0_operand (x, mode)) { - tmp = gen_rtx_REG (mode, regno); - tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx); - tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp)); + switch (get_attr_mode (insn)) + { + case MODE_TI: + if (!EXT_REX_SSE_REG_P (operands[0])) + return "%vpxor\t%0, %d0"; + /* FALLTHRU */ + case MODE_XI: + case MODE_OI: + if (EXT_REX_SSE_REG_P (operands[0])) + return (TARGET_AVX512VL + ? "vpxord\t%x0, %x0, %x0" + : "vpxord\t%g0, %g0, %g0"); + return "vpxor\t%x0, %x0, %x0"; + + case MODE_V2DF: + if (!EXT_REX_SSE_REG_P (operands[0])) + return "%vxorpd\t%0, %d0"; + /* FALLTHRU */ + case MODE_V8DF: + case MODE_V4DF: + if (!EXT_REX_SSE_REG_P (operands[0])) + return "vxorpd\t%x0, %x0, %x0"; + else if (TARGET_AVX512DQ) + return (TARGET_AVX512VL + ? "vxorpd\t%x0, %x0, %x0" + : "vxorpd\t%g0, %g0, %g0"); + else + return (TARGET_AVX512VL + ? "vpxorq\t%x0, %x0, %x0" + : "vpxorq\t%g0, %g0, %g0"); + + case MODE_V4SF: + if (!EXT_REX_SSE_REG_P (operands[0])) + return "%vxorps\t%0, %d0"; + /* FALLTHRU */ + case MODE_V16SF: + case MODE_V8SF: + if (!EXT_REX_SSE_REG_P (operands[0])) + return "vxorps\t%x0, %x0, %x0"; + else if (TARGET_AVX512DQ) + return (TARGET_AVX512VL + ? "vxorps\t%x0, %x0, %x0" + : "vxorps\t%g0, %g0, %g0"); + else + return (TARGET_AVX512VL + ? "vpxord\t%x0, %x0, %x0" + : "vpxord\t%g0, %g0, %g0"); + + default: + gcc_unreachable (); + } } + else if (x == constm1_rtx || vector_all_ones_operand (x, mode)) + { + enum attr_mode insn_mode = get_attr_mode (insn); + + switch (insn_mode) + { + case MODE_XI: + case MODE_V8DF: + case MODE_V16SF: + gcc_assert (TARGET_AVX512F); + return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}"; - return tmp; -} + case MODE_OI: + case MODE_V4DF: + case MODE_V8SF: + gcc_assert (TARGET_AVX2); + /* FALLTHRU */ + case MODE_TI: + case MODE_V2DF: + case MODE_V4SF: + gcc_assert (TARGET_SSE2); + if (!EXT_REX_SSE_REG_P (operands[0])) + return (TARGET_AVX + ? "vpcmpeqd\t%0, %0, %0" + : "pcmpeqd\t%0, %0"); + else if (TARGET_AVX512VL) + return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"; + else + return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}"; -/* x86-64 register passing implementation. See x86-64 ABI for details. Goal - of this code is to classify each 8bytes of incoming argument by the register - class and assign registers accordingly. */ + default: + gcc_unreachable (); + } + } -/* Return the union class of CLASS1 and CLASS2. - See the x86-64 PS ABI for details. */ + gcc_unreachable (); +} -static enum x86_64_reg_class -merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2) +/* Returns true if INSN can be transformed from a memory load + to a supported FP constant load. */ + +bool +ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst) { - /* Rule #1: If both classes are equal, this is the resulting class. */ - if (class1 == class2) - return class1; + rtx src = find_constant_src (insn); - /* Rule #2: If one of the classes is NO_CLASS, the resulting class is - the other class. 
*/ - if (class1 == X86_64_NO_CLASS) - return class2; - if (class2 == X86_64_NO_CLASS) - return class1; + gcc_assert (REG_P (dst)); - /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */ - if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS) - return X86_64_MEMORY_CLASS; + if (src == NULL + || (SSE_REGNO_P (REGNO (dst)) + && standard_sse_constant_p (src, GET_MODE (dst)) != 1) + || (STACK_REGNO_P (REGNO (dst)) + && standard_80387_constant_p (src) < 1)) + return false; - /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */ - if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS) - || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS)) - return X86_64_INTEGERSI_CLASS; - if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS - || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS) - return X86_64_INTEGER_CLASS; + return true; +} - /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class, - MEMORY is used. */ - if (class1 == X86_64_X87_CLASS - || class1 == X86_64_X87UP_CLASS - || class1 == X86_64_COMPLEX_X87_CLASS - || class2 == X86_64_X87_CLASS - || class2 == X86_64_X87UP_CLASS - || class2 == X86_64_COMPLEX_X87_CLASS) - return X86_64_MEMORY_CLASS; +/* Returns true if OP contains a symbol reference */ - /* Rule #6: Otherwise class SSE is used. */ - return X86_64_SSE_CLASS; -} +bool +symbolic_reference_mentioned_p (rtx op) +{ + const char *fmt; + int i; -/* Classify the argument of type TYPE and mode MODE. - CLASSES will be filled by the register class used to pass each word - of the operand. The number of words is returned. In case the parameter - should be passed in memory, 0 is returned. As a special case for zero - sized containers, classes[0] will be NO_CLASS and 1 is returned. + if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF) + return true; - BIT_OFFSET is used internally for handling records and specifies offset - of the offset in bits modulo 512 to avoid overflow cases. + fmt = GET_RTX_FORMAT (GET_CODE (op)); + for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--) + { + if (fmt[i] == 'E') + { + int j; - See the x86-64 PS ABI for details. -*/ + for (j = XVECLEN (op, i) - 1; j >= 0; j--) + if (symbolic_reference_mentioned_p (XVECEXP (op, i, j))) + return true; + } -static int -classify_argument (machine_mode mode, const_tree type, - enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset) + else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i))) + return true; + } + + return false; +} + +/* Return true if it is appropriate to emit `ret' instructions in the + body of a function. Do this only if the epilogue is simple, needing a + couple of insns. Prior to reloading, we can't tell how many registers + must be saved, so return false then. Return false if there is no frame + marker to de-allocate. */ + +bool +ix86_can_use_return_insn_p (void) { - HOST_WIDE_INT bytes - = mode == BLKmode ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode); - int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD); + if (ix86_function_naked (current_function_decl)) + return false; - /* Variable sized entities are always passed/returned in memory. */ - if (bytes < 0) + /* Don't use `ret' instruction in interrupt handler. */ + if (! 
reload_completed + || frame_pointer_needed + || cfun->machine->func_type != TYPE_NORMAL) return 0; - if (mode != VOIDmode - && targetm.calls.must_pass_in_stack (mode, type)) + /* Don't allow more than 32k pop, since that's all we can do + with one instruction. */ + if (crtl->args.pops_args && crtl->args.size >= 32768) return 0; - if (type && AGGREGATE_TYPE_P (type)) - { - int i; - tree field; - enum x86_64_reg_class subclasses[MAX_CLASSES]; + struct ix86_frame &frame = cfun->machine->frame; + return (frame.stack_pointer_offset == UNITS_PER_WORD + && (frame.nregs + frame.nsseregs) == 0); +} + +/* Value should be nonzero if functions must have frame pointers. + Zero means the frame pointer need not be set up (and parms may + be accessed via the stack pointer) in functions that seem suitable. */ - /* On x86-64 we pass structures larger than 64 bytes on the stack. */ - if (bytes > 64) - return 0; +static bool +ix86_frame_pointer_required (void) +{ + /* If we accessed previous frames, then the generated code expects + to be able to access the saved ebp value in our frame. */ + if (cfun->machine->accesses_prev_frame) + return true; - for (i = 0; i < words; i++) - classes[i] = X86_64_NO_CLASS; + /* Several x86 os'es need a frame pointer for other reasons, + usually pertaining to setjmp. */ + if (SUBTARGET_FRAME_POINTER_REQUIRED) + return true; - /* Zero sized arrays or structures are NO_CLASS. We return 0 to - signalize memory class, so handle it as special case. */ - if (!words) - { - classes[0] = X86_64_NO_CLASS; - return 1; - } + /* For older 32-bit runtimes setjmp requires valid frame-pointer. */ + if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp) + return true; - /* Classify each field of record and merge classes. */ - switch (TREE_CODE (type)) - { - case RECORD_TYPE: - /* And now merge the fields of structure. */ - for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) - { - if (TREE_CODE (field) == FIELD_DECL) - { - int num; + /* Win64 SEH, very large frames need a frame-pointer as maximum stack + allocation is 4GB. */ + if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE) + return true; - if (TREE_TYPE (field) == error_mark_node) - continue; + /* SSE saves require frame-pointer when stack is misaligned. */ + if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128) + return true; + + /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER + turns off the frame pointer by default. Turn it back on now if + we've not got a leaf function. */ + if (TARGET_OMIT_LEAF_FRAME_POINTER + && (!crtl->is_leaf + || ix86_current_function_calls_tls_descriptor)) + return true; - /* Bitfields are always classified as integer. Handle them - early, since later code would consider them to be - misaligned integers. */ - if (DECL_BIT_FIELD (field)) - { - for (i = (int_bit_position (field) - + (bit_offset % 64)) / 8 / 8; - i < ((int_bit_position (field) + (bit_offset % 64)) - + tree_to_shwi (DECL_SIZE (field)) - + 63) / 8 / 8; i++) - classes[i] - = merge_classes (X86_64_INTEGER_CLASS, classes[i]); - } - else - { - int pos; + if (crtl->profile && !flag_fentry) + return true; - type = TREE_TYPE (field); + return false; +} - /* Flexible array member is ignored. */ - if (TYPE_MODE (type) == BLKmode - && TREE_CODE (type) == ARRAY_TYPE - && TYPE_SIZE (type) == NULL_TREE - && TYPE_DOMAIN (type) != NULL_TREE - && (TYPE_MAX_VALUE (TYPE_DOMAIN (type)) - == NULL_TREE)) - { - static bool warned; +/* Record that the current function accesses previous call frames. 
*/ - if (!warned && warn_psabi) - { - warned = true; - inform (input_location, - "the ABI of passing struct with" - " a flexible array member has" - " changed in GCC 4.4"); - } - continue; - } - num = classify_argument (TYPE_MODE (type), type, - subclasses, - (int_bit_position (field) - + bit_offset) % 512); - if (!num) - return 0; - pos = (int_bit_position (field) - + (bit_offset % 64)) / 8 / 8; - for (i = 0; i < num && (i + pos) < words; i++) - classes[i + pos] - = merge_classes (subclasses[i], classes[i + pos]); - } - } - } - break; +void +ix86_setup_frame_addresses (void) +{ + cfun->machine->accesses_prev_frame = 1; +} + +#ifndef USE_HIDDEN_LINKONCE +# if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0) +# define USE_HIDDEN_LINKONCE 1 +# else +# define USE_HIDDEN_LINKONCE 0 +# endif +#endif - case ARRAY_TYPE: - /* Arrays are handled as small records. */ - { - int num; - num = classify_argument (TYPE_MODE (TREE_TYPE (type)), - TREE_TYPE (type), subclasses, bit_offset); - if (!num) - return 0; +/* Label count for call and return thunks. It is used to make unique + labels in call and return thunks. */ +static int indirectlabelno; - /* The partial classes are now full classes. */ - if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4) - subclasses[0] = X86_64_SSE_CLASS; - if (subclasses[0] == X86_64_INTEGERSI_CLASS - && !((bit_offset % 64) == 0 && bytes == 4)) - subclasses[0] = X86_64_INTEGER_CLASS; +/* True if call thunk function is needed. */ +static bool indirect_thunk_needed = false; - for (i = 0; i < words; i++) - classes[i] = subclasses[i % num]; +/* Bit masks of integer registers, which contain branch target, used + by call thunk functions. */ +static int indirect_thunks_used; - break; - } - case UNION_TYPE: - case QUAL_UNION_TYPE: - /* Unions are similar to RECORD_TYPE but offset is always 0. - */ - for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) - { - if (TREE_CODE (field) == FIELD_DECL) - { - int num; +/* True if return thunk function is needed. */ +static bool indirect_return_needed = false; - if (TREE_TYPE (field) == error_mark_node) - continue; +/* True if return thunk function via CX is needed. */ +static bool indirect_return_via_cx; - num = classify_argument (TYPE_MODE (TREE_TYPE (field)), - TREE_TYPE (field), subclasses, - bit_offset); - if (!num) - return 0; - for (i = 0; i < num && i < words; i++) - classes[i] = merge_classes (subclasses[i], classes[i]); - } - } - break; +#ifndef INDIRECT_LABEL +# define INDIRECT_LABEL "LIND" +#endif - default: - gcc_unreachable (); - } +/* Indicate what prefix is needed for an indirect branch. */ +enum indirect_thunk_prefix +{ + indirect_thunk_prefix_none, + indirect_thunk_prefix_nt +}; - if (words > 2) - { - /* When size > 16 bytes, if the first one isn't - X86_64_SSE_CLASS or any other ones aren't - X86_64_SSEUP_CLASS, everything should be passed in - memory. */ - if (classes[0] != X86_64_SSE_CLASS) - return 0; +/* Return the prefix needed for an indirect branch INSN. */ - for (i = 1; i < words; i++) - if (classes[i] != X86_64_SSEUP_CLASS) - return 0; +enum indirect_thunk_prefix +indirect_thunk_need_prefix (rtx_insn *insn) +{ + enum indirect_thunk_prefix need_prefix; + if ((cfun->machine->indirect_branch_type + == indirect_branch_thunk_extern) + && ix86_notrack_prefixed_insn_p (insn)) + { + /* NOTRACK prefix is only used with external thunk so that it + can be properly updated to support CET at run-time. 
*/ + need_prefix = indirect_thunk_prefix_nt; + } + else + need_prefix = indirect_thunk_prefix_none; + return need_prefix; +} + +/* Fills in the label name that should be used for the indirect thunk. */ + +static void +indirect_thunk_name (char name[32], unsigned int regno, + enum indirect_thunk_prefix need_prefix, + bool ret_p) +{ + if (regno != INVALID_REGNUM && regno != CX_REG && ret_p) + gcc_unreachable (); + + if (USE_HIDDEN_LINKONCE) + { + const char *prefix; + + if (need_prefix == indirect_thunk_prefix_nt + && regno != INVALID_REGNUM) + { + /* NOTRACK prefix is only used with external thunk via + register so that NOTRACK prefix can be added to indirect + branch via register to support CET at run-time. */ + prefix = "_nt"; } + else + prefix = ""; - /* Final merger cleanup. */ - for (i = 0; i < words; i++) + const char *ret = ret_p ? "return" : "indirect"; + + if (regno != INVALID_REGNUM) { - /* If one class is MEMORY, everything should be passed in - memory. */ - if (classes[i] == X86_64_MEMORY_CLASS) - return 0; + const char *reg_prefix; + if (LEGACY_INT_REGNO_P (regno)) + reg_prefix = TARGET_64BIT ? "r" : "e"; + else + reg_prefix = ""; + sprintf (name, "__x86_%s_thunk%s_%s%s", + ret, prefix, reg_prefix, reg_names[regno]); + } + else + sprintf (name, "__x86_%s_thunk%s", ret, prefix); + } + else + { + if (regno != INVALID_REGNUM) + ASM_GENERATE_INTERNAL_LABEL (name, "LITR", regno); + else + { + if (ret_p) + ASM_GENERATE_INTERNAL_LABEL (name, "LRT", 0); + else + ASM_GENERATE_INTERNAL_LABEL (name, "LIT", 0); + } + } +} - /* The X86_64_SSEUP_CLASS should be always preceded by - X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */ - if (classes[i] == X86_64_SSEUP_CLASS - && classes[i - 1] != X86_64_SSE_CLASS - && classes[i - 1] != X86_64_SSEUP_CLASS) - { - /* The first one should never be X86_64_SSEUP_CLASS. */ - gcc_assert (i != 0); - classes[i] = X86_64_SSE_CLASS; - } +/* Output a call and return thunk for indirect branch. If REGNO != -1, + the function address is in REGNO and the call and return thunk looks like: - /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS, - everything should be passed in memory. */ - if (classes[i] == X86_64_X87UP_CLASS - && (classes[i - 1] != X86_64_X87_CLASS)) - { - static bool warned; + call L2 + L1: + pause + lfence + jmp L1 + L2: + mov %REG, (%sp) + ret - /* The first one should never be X86_64_X87UP_CLASS. */ - gcc_assert (i != 0); - if (!warned && warn_psabi) - { - warned = true; - inform (input_location, - "the ABI of passing union with long double" - " has changed in GCC 4.4"); - } - return 0; - } + Otherwise, the function address is on the top of stack and the + call and return thunk looks like: + + call L2 + L1: + pause + lfence + jmp L1 + L2: + lea WORD_SIZE(%sp), %sp + ret + */ + +static void +output_indirect_thunk (unsigned int regno) +{ + char indirectlabel1[32]; + char indirectlabel2[32]; + + ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, INDIRECT_LABEL, + indirectlabelno++); + ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, INDIRECT_LABEL, + indirectlabelno++); + + /* Call */ + fputs ("\tcall\t", asm_out_file); + assemble_name_raw (asm_out_file, indirectlabel2); + fputc ('\n', asm_out_file); + + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1); + + /* AMD and Intel CPUs prefer each a different instruction as loop filler. + Usage of both pause + lfence is compromise solution. */ + fprintf (asm_out_file, "\tpause\n\tlfence\n"); + + /* Jump. 
*/ + fputs ("\tjmp\t", asm_out_file); + assemble_name_raw (asm_out_file, indirectlabel1); + fputc ('\n', asm_out_file); + + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2); + + /* The above call insn pushed a word to stack. Adjust CFI info. */ + if (flag_asynchronous_unwind_tables && dwarf2out_do_frame ()) + { + if (! dwarf2out_do_cfi_asm ()) + { + dw_cfi_ref xcfi = ggc_cleared_alloc (); + xcfi->dw_cfi_opc = DW_CFA_advance_loc4; + xcfi->dw_cfi_oprnd1.dw_cfi_addr = ggc_strdup (indirectlabel2); + vec_safe_push (cfun->fde->dw_fde_cfi, xcfi); } - return words; + dw_cfi_ref xcfi = ggc_cleared_alloc (); + xcfi->dw_cfi_opc = DW_CFA_def_cfa_offset; + xcfi->dw_cfi_oprnd1.dw_cfi_offset = 2 * UNITS_PER_WORD; + vec_safe_push (cfun->fde->dw_fde_cfi, xcfi); + dwarf2out_emit_cfi (xcfi); } - /* Compute alignment needed. We align all types to natural boundaries with - exception of XFmode that is aligned to 64bits. */ - if (mode != VOIDmode && mode != BLKmode) + if (regno != INVALID_REGNUM) { - int mode_alignment = GET_MODE_BITSIZE (mode); - - if (mode == XFmode) - mode_alignment = 128; - else if (mode == XCmode) - mode_alignment = 256; - if (COMPLEX_MODE_P (mode)) - mode_alignment /= 2; - /* Misaligned fields are always returned in memory. */ - if (bit_offset % mode_alignment) - return 0; + /* MOV. */ + rtx xops[2]; + xops[0] = gen_rtx_MEM (word_mode, stack_pointer_rtx); + xops[1] = gen_rtx_REG (word_mode, regno); + output_asm_insn ("mov\t{%1, %0|%0, %1}", xops); } - - /* for V1xx modes, just use the base mode */ - if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode - && GET_MODE_UNIT_SIZE (mode) == bytes) - mode = GET_MODE_INNER (mode); - - /* Classification of atomic types. */ - switch (mode) + else { - case E_SDmode: - case E_DDmode: - classes[0] = X86_64_SSE_CLASS; - return 1; - case E_TDmode: - classes[0] = X86_64_SSE_CLASS; - classes[1] = X86_64_SSEUP_CLASS; - return 2; - case E_DImode: - case E_SImode: - case E_HImode: - case E_QImode: - case E_CSImode: - case E_CHImode: - case E_CQImode: - { - int size = bit_offset + (int) GET_MODE_BITSIZE (mode); + /* LEA. */ + rtx xops[2]; + xops[0] = stack_pointer_rtx; + xops[1] = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); + output_asm_insn ("lea\t{%E1, %0|%0, %E1}", xops); + } - /* Analyze last 128 bits only. */ - size = (size - 1) & 0x7f; + fputs ("\tret\n", asm_out_file); +} - if (size < 32) - { - classes[0] = X86_64_INTEGERSI_CLASS; - return 1; - } - else if (size < 64) - { - classes[0] = X86_64_INTEGER_CLASS; - return 1; - } - else if (size < 64+32) - { - classes[0] = X86_64_INTEGER_CLASS; - classes[1] = X86_64_INTEGERSI_CLASS; - return 2; - } - else if (size < 64+64) - { - classes[0] = classes[1] = X86_64_INTEGER_CLASS; - return 2; - } - else - gcc_unreachable (); - } - case E_CDImode: - case E_TImode: - classes[0] = classes[1] = X86_64_INTEGER_CLASS; - return 2; - case E_COImode: - case E_OImode: - /* OImode shouldn't be used directly. 
*/ - gcc_unreachable (); - case E_CTImode: - return 0; - case E_SFmode: - if (!(bit_offset % 64)) - classes[0] = X86_64_SSESF_CLASS; - else - classes[0] = X86_64_SSE_CLASS; - return 1; - case E_DFmode: - classes[0] = X86_64_SSEDF_CLASS; - return 1; - case E_XFmode: - classes[0] = X86_64_X87_CLASS; - classes[1] = X86_64_X87UP_CLASS; - return 2; - case E_TFmode: - classes[0] = X86_64_SSE_CLASS; - classes[1] = X86_64_SSEUP_CLASS; - return 2; - case E_SCmode: - classes[0] = X86_64_SSE_CLASS; - if (!(bit_offset % 64)) - return 1; - else - { - static bool warned; - - if (!warned && warn_psabi) - { - warned = true; - inform (input_location, - "the ABI of passing structure with complex float" - " member has changed in GCC 4.4"); - } - classes[1] = X86_64_SSESF_CLASS; - return 2; - } - case E_DCmode: - classes[0] = X86_64_SSEDF_CLASS; - classes[1] = X86_64_SSEDF_CLASS; - return 2; - case E_XCmode: - classes[0] = X86_64_COMPLEX_X87_CLASS; - return 1; - case E_TCmode: - /* This modes is larger than 16 bytes. */ - return 0; - case E_V8SFmode: - case E_V8SImode: - case E_V32QImode: - case E_V16HImode: - case E_V4DFmode: - case E_V4DImode: - classes[0] = X86_64_SSE_CLASS; - classes[1] = X86_64_SSEUP_CLASS; - classes[2] = X86_64_SSEUP_CLASS; - classes[3] = X86_64_SSEUP_CLASS; - return 4; - case E_V8DFmode: - case E_V16SFmode: - case E_V8DImode: - case E_V16SImode: - case E_V32HImode: - case E_V64QImode: - classes[0] = X86_64_SSE_CLASS; - classes[1] = X86_64_SSEUP_CLASS; - classes[2] = X86_64_SSEUP_CLASS; - classes[3] = X86_64_SSEUP_CLASS; - classes[4] = X86_64_SSEUP_CLASS; - classes[5] = X86_64_SSEUP_CLASS; - classes[6] = X86_64_SSEUP_CLASS; - classes[7] = X86_64_SSEUP_CLASS; - return 8; - case E_V4SFmode: - case E_V4SImode: - case E_V16QImode: - case E_V8HImode: - case E_V2DFmode: - case E_V2DImode: - classes[0] = X86_64_SSE_CLASS; - classes[1] = X86_64_SSEUP_CLASS; - return 2; - case E_V1TImode: - case E_V1DImode: - case E_V2SFmode: - case E_V2SImode: - case E_V4HImode: - case E_V8QImode: - classes[0] = X86_64_SSE_CLASS; - return 1; - case E_BLKmode: - case E_VOIDmode: - return 0; - default: - gcc_assert (VECTOR_MODE_P (mode)); +/* Output a funtion with a call and return thunk for indirect branch. + If REGNO != INVALID_REGNUM, the function address is in REGNO. + Otherwise, the function address is on the top of stack. Thunk is + used for function return if RET_P is true. */ - if (bytes > 16) - return 0; +static void +output_indirect_thunk_function (enum indirect_thunk_prefix need_prefix, + unsigned int regno, bool ret_p) +{ + char name[32]; + tree decl; - gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT); + /* Create __x86_indirect_thunk. 
*/ + indirect_thunk_name (name, regno, need_prefix, ret_p); + decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, + get_identifier (name), + build_function_type_list (void_type_node, NULL_TREE)); + DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL, + NULL_TREE, void_type_node); + TREE_PUBLIC (decl) = 1; + TREE_STATIC (decl) = 1; + DECL_IGNORED_P (decl) = 1; - if (bit_offset + GET_MODE_BITSIZE (mode) <= 32) - classes[0] = X86_64_INTEGERSI_CLASS; - else - classes[0] = X86_64_INTEGER_CLASS; - classes[1] = X86_64_INTEGER_CLASS; - return 1 + (bytes > 8); +#if TARGET_MACHO + if (TARGET_MACHO) + { + switch_to_section (darwin_sections[picbase_thunk_section]); + fputs ("\t.weak_definition\t", asm_out_file); + assemble_name (asm_out_file, name); + fputs ("\n\t.private_extern\t", asm_out_file); + assemble_name (asm_out_file, name); + putc ('\n', asm_out_file); + ASM_OUTPUT_LABEL (asm_out_file, name); + DECL_WEAK (decl) = 1; } -} - -/* Examine the argument and return set number of register required in each - class. Return true iff parameter should be passed in memory. */ - -static bool -examine_argument (machine_mode mode, const_tree type, int in_return, - int *int_nregs, int *sse_nregs) -{ - enum x86_64_reg_class regclass[MAX_CLASSES]; - int n = classify_argument (mode, type, regclass, 0); + else +#endif + if (USE_HIDDEN_LINKONCE) + { + cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl)); - *int_nregs = 0; - *sse_nregs = 0; + targetm.asm_out.unique_section (decl, 0); + switch_to_section (get_named_section (decl, NULL, 0)); - if (!n) - return true; - for (n--; n >= 0; n--) - switch (regclass[n]) + targetm.asm_out.globalize_label (asm_out_file, name); + fputs ("\t.hidden\t", asm_out_file); + assemble_name (asm_out_file, name); + putc ('\n', asm_out_file); + ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl); + } + else { - case X86_64_INTEGER_CLASS: - case X86_64_INTEGERSI_CLASS: - (*int_nregs)++; - break; - case X86_64_SSE_CLASS: - case X86_64_SSESF_CLASS: - case X86_64_SSEDF_CLASS: - (*sse_nregs)++; - break; - case X86_64_NO_CLASS: - case X86_64_SSEUP_CLASS: - break; - case X86_64_X87_CLASS: - case X86_64_X87UP_CLASS: - case X86_64_COMPLEX_X87_CLASS: - if (!in_return) - return true; - break; - case X86_64_MEMORY_CLASS: - gcc_unreachable (); + switch_to_section (text_section); + ASM_OUTPUT_LABEL (asm_out_file, name); } - return false; + DECL_INITIAL (decl) = make_node (BLOCK); + current_function_decl = decl; + allocate_struct_function (decl, false); + init_function_start (decl); + /* We're about to hide the function body from callees of final_* by + emitting it directly; tell them we're a thunk, if they care. */ + cfun->is_thunk = true; + first_function_block_is_cold = false; + /* Make sure unwind info is emitted for the thunk if needed. */ + final_start_function (emit_barrier (), asm_out_file, 1); + + output_indirect_thunk (regno); + + final_end_function (); + init_insn_lengths (); + free_after_compilation (cfun); + set_cfun (NULL); + current_function_decl = NULL; } -/* Construct container for the argument used by GCC interface. See - FUNCTION_ARG for the detailed description. */ +static int pic_labels_used; -static rtx -construct_container (machine_mode mode, machine_mode orig_mode, - const_tree type, int in_return, int nintregs, int nsseregs, - const int *intreg, int sse_regno) +/* Fills in the label name that should be used for a pc thunk for + the given register. 
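To make the two thunk families concrete, here is a hypothetical call site; the symbol names are the ones produced by indirect_thunk_name above and get_pc_thunk_name below, and the options mentioned in the comments are the existing -mindirect-branch / -mfunction-return controls.

  extern void (*handler) (void);

  void
  dispatch (void)
  {
    handler ();   /* With -mindirect-branch=thunk the compiler emits
                     "call __x86_indirect_thunk_rax" (or the variant for
                     whichever register holds the target) instead of an
                     indirect "call *%rax"; with -mfunction-return=thunk
                     the final "ret" becomes "jmp __x86_return_thunk".  */
  }

The separate __x86.get_pc_thunk.<reg> helpers emitted below serve 32-bit -fpic code: they load the caller's return address into the chosen register so it can be used as the PIC base.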
*/ + +static void +get_pc_thunk_name (char name[32], unsigned int regno) { - /* The following variables hold the static issued_error state. */ - static bool issued_sse_arg_error; - static bool issued_sse_ret_error; - static bool issued_x87_ret_error; + gcc_assert (!TARGET_64BIT); - machine_mode tmpmode; - int bytes - = mode == BLKmode ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode); - enum x86_64_reg_class regclass[MAX_CLASSES]; - int n; - int i; - int nexps = 0; - int needed_sseregs, needed_intregs; - rtx exp[MAX_CLASSES]; - rtx ret; + if (USE_HIDDEN_LINKONCE) + sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]); + else + ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno); +} - n = classify_argument (mode, type, regclass, 0); - if (!n) - return NULL; - if (examine_argument (mode, type, in_return, &needed_intregs, - &needed_sseregs)) - return NULL; - if (needed_intregs > nintregs || needed_sseregs > nsseregs) - return NULL; - /* We allowed the user to turn off SSE for kernel mode. Don't crash if - some less clueful developer tries to use floating-point anyway. */ - if (needed_sseregs && !TARGET_SSE) +/* This function generates code for -fpic that loads %ebx with + the return address of the caller and then returns. */ + +static void +ix86_code_end (void) +{ + rtx xops[2]; + unsigned int regno; + + if (indirect_return_needed) + output_indirect_thunk_function (indirect_thunk_prefix_none, + INVALID_REGNUM, true); + if (indirect_return_via_cx) + output_indirect_thunk_function (indirect_thunk_prefix_none, + CX_REG, true); + if (indirect_thunk_needed) + output_indirect_thunk_function (indirect_thunk_prefix_none, + INVALID_REGNUM, false); + + for (regno = FIRST_REX_INT_REG; regno <= LAST_REX_INT_REG; regno++) { - if (in_return) + unsigned int i = regno - FIRST_REX_INT_REG + LAST_INT_REG + 1; + if ((indirect_thunks_used & (1 << i))) + output_indirect_thunk_function (indirect_thunk_prefix_none, + regno, false); + } + + for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++) + { + char name[32]; + tree decl; + + if ((indirect_thunks_used & (1 << regno))) + output_indirect_thunk_function (indirect_thunk_prefix_none, + regno, false); + + if (!(pic_labels_used & (1 << regno))) + continue; + + get_pc_thunk_name (name, regno); + + decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, + get_identifier (name), + build_function_type_list (void_type_node, NULL_TREE)); + DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL, + NULL_TREE, void_type_node); + TREE_PUBLIC (decl) = 1; + TREE_STATIC (decl) = 1; + DECL_IGNORED_P (decl) = 1; + +#if TARGET_MACHO + if (TARGET_MACHO) { - if (!issued_sse_ret_error) - { - error ("SSE register return with SSE disabled"); - issued_sse_ret_error = true; - } + switch_to_section (darwin_sections[picbase_thunk_section]); + fputs ("\t.weak_definition\t", asm_out_file); + assemble_name (asm_out_file, name); + fputs ("\n\t.private_extern\t", asm_out_file); + assemble_name (asm_out_file, name); + putc ('\n', asm_out_file); + ASM_OUTPUT_LABEL (asm_out_file, name); + DECL_WEAK (decl) = 1; } - else if (!issued_sse_arg_error) + else +#endif + if (USE_HIDDEN_LINKONCE) { - error ("SSE register argument with SSE disabled"); - issued_sse_arg_error = true; - } - return NULL; - } + cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl)); - /* Likewise, error if the ABI requires us to return values in the - x87 registers and the user specified -mno-80387. 
*/ - if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return) - for (i = 0; i < n; i++) - if (regclass[i] == X86_64_X87_CLASS - || regclass[i] == X86_64_X87UP_CLASS - || regclass[i] == X86_64_COMPLEX_X87_CLASS) + targetm.asm_out.unique_section (decl, 0); + switch_to_section (get_named_section (decl, NULL, 0)); + + targetm.asm_out.globalize_label (asm_out_file, name); + fputs ("\t.hidden\t", asm_out_file); + assemble_name (asm_out_file, name); + putc ('\n', asm_out_file); + ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl); + } + else { - if (!issued_x87_ret_error) - { - error ("x87 register return with x87 disabled"); - issued_x87_ret_error = true; - } - return NULL; + switch_to_section (text_section); + ASM_OUTPUT_LABEL (asm_out_file, name); } - /* First construct simple cases. Avoid SCmode, since we want to use - single register to pass this type. */ - if (n == 1 && mode != SCmode) - switch (regclass[0]) - { - case X86_64_INTEGER_CLASS: - case X86_64_INTEGERSI_CLASS: - return gen_rtx_REG (mode, intreg[0]); - case X86_64_SSE_CLASS: - case X86_64_SSESF_CLASS: - case X86_64_SSEDF_CLASS: - if (mode != BLKmode) - return gen_reg_or_parallel (mode, orig_mode, - GET_SSE_REGNO (sse_regno)); - break; - case X86_64_X87_CLASS: - case X86_64_COMPLEX_X87_CLASS: - return gen_rtx_REG (mode, FIRST_STACK_REG); - case X86_64_NO_CLASS: - /* Zero sized array, struct or class. */ - return NULL; - default: - gcc_unreachable (); - } - if (n == 2 - && regclass[0] == X86_64_SSE_CLASS - && regclass[1] == X86_64_SSEUP_CLASS - && mode != BLKmode) - return gen_reg_or_parallel (mode, orig_mode, - GET_SSE_REGNO (sse_regno)); - if (n == 4 - && regclass[0] == X86_64_SSE_CLASS - && regclass[1] == X86_64_SSEUP_CLASS - && regclass[2] == X86_64_SSEUP_CLASS - && regclass[3] == X86_64_SSEUP_CLASS - && mode != BLKmode) - return gen_reg_or_parallel (mode, orig_mode, - GET_SSE_REGNO (sse_regno)); - if (n == 8 - && regclass[0] == X86_64_SSE_CLASS - && regclass[1] == X86_64_SSEUP_CLASS - && regclass[2] == X86_64_SSEUP_CLASS - && regclass[3] == X86_64_SSEUP_CLASS - && regclass[4] == X86_64_SSEUP_CLASS - && regclass[5] == X86_64_SSEUP_CLASS - && regclass[6] == X86_64_SSEUP_CLASS - && regclass[7] == X86_64_SSEUP_CLASS - && mode != BLKmode) - return gen_reg_or_parallel (mode, orig_mode, - GET_SSE_REGNO (sse_regno)); - if (n == 2 - && regclass[0] == X86_64_X87_CLASS - && regclass[1] == X86_64_X87UP_CLASS) - return gen_rtx_REG (XFmode, FIRST_STACK_REG); + DECL_INITIAL (decl) = make_node (BLOCK); + current_function_decl = decl; + allocate_struct_function (decl, false); + init_function_start (decl); + /* We're about to hide the function body from callees of final_* by + emitting it directly; tell them we're a thunk, if they care. */ + cfun->is_thunk = true; + first_function_block_is_cold = false; + /* Make sure unwind info is emitted for the thunk if needed. */ + final_start_function (emit_barrier (), asm_out_file, 1); - if (n == 2 - && regclass[0] == X86_64_INTEGER_CLASS - && regclass[1] == X86_64_INTEGER_CLASS - && (mode == CDImode || mode == TImode || mode == BLKmode) - && intreg[0] + 1 == intreg[1]) - { - if (mode == BLKmode) + /* Pad stack IP move with 4 instructions (two NOPs count + as one instruction). */ + if (TARGET_PAD_SHORT_FUNCTION) { - /* Use TImode for BLKmode values in 2 integer registers. 
*/ - exp[0] = gen_rtx_EXPR_LIST (VOIDmode, - gen_rtx_REG (TImode, intreg[0]), - GEN_INT (0)); - ret = gen_rtx_PARALLEL (mode, rtvec_alloc (1)); - XVECEXP (ret, 0, 0) = exp[0]; - return ret; + int i = 8; + + while (i--) + fputs ("\tnop\n", asm_out_file); } - else - return gen_rtx_REG (mode, intreg[0]); + + xops[0] = gen_rtx_REG (Pmode, regno); + xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx); + output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops); + output_asm_insn ("%!ret", NULL); + final_end_function (); + init_insn_lengths (); + free_after_compilation (cfun); + set_cfun (NULL); + current_function_decl = NULL; } - /* Otherwise figure out the entries of the PARALLEL. */ - for (i = 0; i < n; i++) + if (flag_split_stack) + file_end_indicate_split_stack (); +} + +/* Emit code for the SET_GOT patterns. */ + +const char * +output_set_got (rtx dest, rtx label) +{ + rtx xops[3]; + + xops[0] = dest; + + if (TARGET_VXWORKS_RTP && flag_pic) { - int pos; + /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */ + xops[2] = gen_rtx_MEM (Pmode, + gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE)); + output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops); - switch (regclass[i]) - { - case X86_64_NO_CLASS: - break; - case X86_64_INTEGER_CLASS: - case X86_64_INTEGERSI_CLASS: - /* Merge TImodes on aligned occasions here too. */ - if (i * 8 + 8 > bytes) - { - unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT; - if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode)) - /* We've requested 24 bytes we - don't have mode for. Use DImode. */ - tmpmode = DImode; - } - else if (regclass[i] == X86_64_INTEGERSI_CLASS) - tmpmode = SImode; - else - tmpmode = DImode; - exp [nexps++] - = gen_rtx_EXPR_LIST (VOIDmode, - gen_rtx_REG (tmpmode, *intreg), - GEN_INT (i*8)); - intreg++; - break; - case X86_64_SSESF_CLASS: - exp [nexps++] - = gen_rtx_EXPR_LIST (VOIDmode, - gen_rtx_REG (SFmode, - GET_SSE_REGNO (sse_regno)), - GEN_INT (i*8)); - sse_regno++; - break; - case X86_64_SSEDF_CLASS: - exp [nexps++] - = gen_rtx_EXPR_LIST (VOIDmode, - gen_rtx_REG (DFmode, - GET_SSE_REGNO (sse_regno)), - GEN_INT (i*8)); - sse_regno++; - break; - case X86_64_SSE_CLASS: - pos = i; - switch (n) - { - case 1: - tmpmode = DImode; - break; - case 2: - if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS) - { - tmpmode = TImode; - i++; - } - else - tmpmode = DImode; - break; - case 4: - gcc_assert (i == 0 - && regclass[1] == X86_64_SSEUP_CLASS - && regclass[2] == X86_64_SSEUP_CLASS - && regclass[3] == X86_64_SSEUP_CLASS); - tmpmode = OImode; - i += 3; - break; - case 8: - gcc_assert (i == 0 - && regclass[1] == X86_64_SSEUP_CLASS - && regclass[2] == X86_64_SSEUP_CLASS - && regclass[3] == X86_64_SSEUP_CLASS - && regclass[4] == X86_64_SSEUP_CLASS - && regclass[5] == X86_64_SSEUP_CLASS - && regclass[6] == X86_64_SSEUP_CLASS - && regclass[7] == X86_64_SSEUP_CLASS); - tmpmode = XImode; - i += 7; - break; - default: - gcc_unreachable (); - } - exp [nexps++] - = gen_rtx_EXPR_LIST (VOIDmode, - gen_rtx_REG (tmpmode, - GET_SSE_REGNO (sse_regno)), - GEN_INT (pos*8)); - sse_regno++; - break; - default: - gcc_unreachable (); - } + /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register. + Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as + an unadorned address. */ + xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX); + SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL; + output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops); + return ""; } - /* Empty aligned struct, union or class. 
*/ - if (nexps == 0) - return NULL; - - ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps)); - for (i = 0; i < nexps; i++) - XVECEXP (ret, 0, i) = exp [i]; - return ret; -} + xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME); -/* Update the data in CUM to advance over an argument of mode MODE - and data type TYPE. (TYPE is null for libcalls where that information - may not be available.) + if (flag_pic) + { + char name[32]; + get_pc_thunk_name (name, REGNO (dest)); + pic_labels_used |= 1 << REGNO (dest); - Return a number of integer regsiters advanced over. */ + xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name)); + xops[2] = gen_rtx_MEM (QImode, xops[2]); + output_asm_insn ("%!call\t%X2", xops); -static int -function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode, - const_tree type, HOST_WIDE_INT bytes, - HOST_WIDE_INT words) -{ - int res = 0; - bool error_p = false; +#if TARGET_MACHO + /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here. + This is what will be referenced by the Mach-O PIC subsystem. */ + if (machopic_should_output_picbase_label () || !label) + ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME); - if (TARGET_IAMCU) - { - /* Intel MCU psABI passes scalars and aggregates no larger than 8 - bytes in registers. */ - if (!VECTOR_MODE_P (mode) && bytes <= 8) - goto pass_in_reg; - return res; + /* When we are restoring the pic base at the site of a nonlocal label, + and we decided to emit the pic base above, we will still output a + local label used for calculating the correction offset (even though + the offset will be 0 in that case). */ + if (label) + targetm.asm_out.internal_label (asm_out_file, "L", + CODE_LABEL_NUMBER (label)); +#endif } - - switch (mode) + else { - default: - break; + if (TARGET_MACHO) + /* We don't need a pic base, we're not producing pic. */ + gcc_unreachable (); - case E_BLKmode: - if (bytes < 0) - break; - /* FALLTHRU */ + xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ()); + output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops); + targetm.asm_out.internal_label (asm_out_file, "L", + CODE_LABEL_NUMBER (XEXP (xops[2], 0))); + } - case E_DImode: - case E_SImode: - case E_HImode: - case E_QImode: -pass_in_reg: - cum->words += words; - cum->nregs -= words; - cum->regno += words; - if (cum->nregs >= 0) - res = words; - if (cum->nregs <= 0) - { - cum->nregs = 0; - cfun->machine->arg_reg_available = false; - cum->regno = 0; - } - break; + if (!TARGET_MACHO) + output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops); - case E_OImode: - /* OImode shouldn't be used directly. */ - gcc_unreachable (); + return ""; +} - case E_DFmode: - if (cum->float_in_sse == -1) - error_p = true; - if (cum->float_in_sse < 2) - break; - /* FALLTHRU */ - case E_SFmode: - if (cum->float_in_sse == -1) - error_p = true; - if (cum->float_in_sse < 1) - break; - /* FALLTHRU */ +/* Generate an "push" pattern for input ARG. 
*/ - case E_V8SFmode: - case E_V8SImode: - case E_V64QImode: - case E_V32HImode: - case E_V16SImode: - case E_V8DImode: - case E_V16SFmode: - case E_V8DFmode: - case E_V32QImode: - case E_V16HImode: - case E_V4DFmode: - case E_V4DImode: - case E_TImode: - case E_V16QImode: - case E_V8HImode: - case E_V4SImode: - case E_V2DImode: - case E_V4SFmode: - case E_V2DFmode: - if (!type || !AGGREGATE_TYPE_P (type)) - { - cum->sse_words += words; - cum->sse_nregs -= 1; - cum->sse_regno += 1; - if (cum->sse_nregs <= 0) - { - cum->sse_nregs = 0; - cum->sse_regno = 0; - } - } - break; +rtx +gen_push (rtx arg) +{ + struct machine_function *m = cfun->machine; - case E_V8QImode: - case E_V4HImode: - case E_V2SImode: - case E_V2SFmode: - case E_V1TImode: - case E_V1DImode: - if (!type || !AGGREGATE_TYPE_P (type)) - { - cum->mmx_words += words; - cum->mmx_nregs -= 1; - cum->mmx_regno += 1; - if (cum->mmx_nregs <= 0) - { - cum->mmx_nregs = 0; - cum->mmx_regno = 0; - } - } - break; - } - if (error_p) - { - cum->float_in_sse = 0; - error ("calling %qD with SSE calling convention without " - "SSE/SSE2 enabled", cum->decl); - sorry ("this is a GCC bug that can be worked around by adding " - "attribute used to function called"); - } + if (m->fs.cfa_reg == stack_pointer_rtx) + m->fs.cfa_offset += UNITS_PER_WORD; + m->fs.sp_offset += UNITS_PER_WORD; - return res; + if (REG_P (arg) && GET_MODE (arg) != word_mode) + arg = gen_rtx_REG (word_mode, REGNO (arg)); + + return gen_rtx_SET (gen_rtx_MEM (word_mode, + gen_rtx_PRE_DEC (Pmode, + stack_pointer_rtx)), + arg); } -static int -function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode, - const_tree type, HOST_WIDE_INT words, bool named) -{ - int int_nregs, sse_nregs; +/* Generate an "pop" pattern for input ARG. */ - /* Unnamed 512 and 256bit vector mode parameters are passed on stack. */ - if (!named && (VALID_AVX512F_REG_MODE (mode) - || VALID_AVX256_REG_MODE (mode))) - return 0; +rtx +gen_pop (rtx arg) +{ + if (REG_P (arg) && GET_MODE (arg) != word_mode) + arg = gen_rtx_REG (word_mode, REGNO (arg)); - if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs) - && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs) - { - cum->nregs -= int_nregs; - cum->sse_nregs -= sse_nregs; - cum->regno += int_nregs; - cum->sse_regno += sse_nregs; - return int_nregs; - } - else - { - int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD; - cum->words = ROUND_UP (cum->words, align); - cum->words += words; - return 0; - } + return gen_rtx_SET (arg, + gen_rtx_MEM (word_mode, + gen_rtx_POST_INC (Pmode, + stack_pointer_rtx))); } -static int -function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes, - HOST_WIDE_INT words) +/* Return >= 0 if there is an unused call-clobbered register available + for the entire function. */ + +static unsigned int +ix86_select_alt_pic_regnum (void) { - /* Otherwise, this should be passed indirect. */ - gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8); + if (ix86_use_pseudo_pic_reg ()) + return INVALID_REGNUM; - cum->words += words; - if (cum->nregs > 0) + if (crtl->is_leaf + && !crtl->profile + && !ix86_current_function_calls_tls_descriptor) { - cum->nregs -= 1; - cum->regno += 1; - return 1; + int i, drap; + /* Can't use the same register for both PIC and DRAP. 
*/ + if (crtl->drap_reg) + drap = REGNO (crtl->drap_reg); + else + drap = -1; + for (i = 2; i >= 0; --i) + if (i != drap && !df_regs_ever_live_p (i)) + return i; } - return 0; + + return INVALID_REGNUM; } -/* Update the data in CUM to advance over an argument of mode MODE and - data type TYPE. (TYPE is null for libcalls where that information - may not be available.) */ +/* Return true if REGNO is used by the epilogue. */ -static void -ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode, - const_tree type, bool named) +bool +ix86_epilogue_uses (int regno) { - CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); - HOST_WIDE_INT bytes, words; - int nregs; + /* If there are no caller-saved registers, we preserve all registers, + except for MMX and x87 registers which aren't supported when saving + and restoring registers. Don't explicitly save SP register since + it is always preserved. */ + return (epilogue_completed + && cfun->machine->no_caller_saved_registers + && !fixed_regs[regno] + && !STACK_REGNO_P (regno) + && !MMX_REGNO_P (regno)); +} - /* The argument of interrupt handler is a special case and is - handled in ix86_function_arg. */ - if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL) - return; +/* Return nonzero if register REGNO can be used as a scratch register + in peephole2. */ - if (mode == BLKmode) - bytes = int_size_in_bytes (type); - else - bytes = GET_MODE_SIZE (mode); - words = CEIL (bytes, UNITS_PER_WORD); +static bool +ix86_hard_regno_scratch_ok (unsigned int regno) +{ + /* If there are no caller-saved registers, we can't use any register + as a scratch register after epilogue and use REGNO as scratch + register only if it has been used before to avoid saving and + restoring it. */ + return (!cfun->machine->no_caller_saved_registers + || (!epilogue_completed + && df_regs_ever_live_p (regno))); +} - if (type) - mode = type_natural_mode (type, NULL, false); +/* Return TRUE if we need to save REGNO. */ - if (TARGET_64BIT) +bool +ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined) +{ + /* If there are no caller-saved registers, we preserve all registers, + except for MMX and x87 registers which aren't supported when saving + and restoring registers. Don't explicitly save SP register since + it is always preserved. */ + if (cfun->machine->no_caller_saved_registers) { - enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi; + /* Don't preserve registers used for function return value. */ + rtx reg = crtl->return_rtx; + if (reg) + { + unsigned int i = REGNO (reg); + unsigned int nregs = REG_NREGS (reg); + while (nregs-- > 0) + if ((i + nregs) == regno) + return false; + } - if (call_abi == MS_ABI) - nregs = function_arg_advance_ms_64 (cum, bytes, words); - else - nregs = function_arg_advance_64 (cum, mode, type, words, named); + return (df_regs_ever_live_p (regno) + && !fixed_regs[regno] + && !STACK_REGNO_P (regno) + && !MMX_REGNO_P (regno) + && (regno != HARD_FRAME_POINTER_REGNUM + || !frame_pointer_needed)); } - else - nregs = function_arg_advance_32 (cum, mode, type, bytes, words); - if (!nregs) + if (regno == REAL_PIC_OFFSET_TABLE_REGNUM + && pic_offset_table_rtx) { - /* Track if there are outgoing arguments on stack. */ - if (cum->caller) - cfun->machine->outgoing_args_on_stack = true; + if (ix86_use_pseudo_pic_reg ()) + { + /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to + _mcount in prologue. 
*/ + if (!TARGET_64BIT && flag_pic && crtl->profile) + return true; + } + else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM) + || crtl->profile + || crtl->calls_eh_return + || crtl->uses_const_pool + || cfun->has_nonlocal_label) + return ix86_select_alt_pic_regnum () == INVALID_REGNUM; } -} - -/* Define where to put the arguments to a function. - Value is zero to push the argument on the stack, - or a hard register in which to store the argument. - - MODE is the argument's machine mode. - TYPE is the data type of the argument (as a tree). - This is null for libcalls where that information may - not be available. - CUM is a variable of type CUMULATIVE_ARGS which gives info about - the preceding args and about the function being called. - NAMED is nonzero if this argument is a named parameter - (otherwise it is an extra parameter matching an ellipsis). */ -static rtx -function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode, - machine_mode orig_mode, const_tree type, - HOST_WIDE_INT bytes, HOST_WIDE_INT words) -{ - bool error_p = false; - - /* Avoid the AL settings for the Unix64 ABI. */ - if (mode == VOIDmode) - return constm1_rtx; - - if (TARGET_IAMCU) + if (crtl->calls_eh_return && maybe_eh_return) { - /* Intel MCU psABI passes scalars and aggregates no larger than 8 - bytes in registers. */ - if (!VECTOR_MODE_P (mode) && bytes <= 8) - goto pass_in_reg; - return NULL_RTX; + unsigned i; + for (i = 0; ; i++) + { + unsigned test = EH_RETURN_DATA_REGNO (i); + if (test == INVALID_REGNUM) + break; + if (test == regno) + return true; + } } - switch (mode) + if (ignore_outlined && cfun->machine->call_ms2sysv) { - default: - break; + unsigned count = cfun->machine->call_ms2sysv_extra_regs + + xlogue_layout::MIN_REGS; + if (xlogue_layout::is_stub_managed_reg (regno, count)) + return false; + } - case E_BLKmode: - if (bytes < 0) - break; - /* FALLTHRU */ - case E_DImode: - case E_SImode: - case E_HImode: - case E_QImode: -pass_in_reg: - if (words <= cum->nregs) - { - int regno = cum->regno; + if (crtl->drap_reg + && regno == REGNO (crtl->drap_reg) + && !cfun->machine->no_drap_save_restore) + return true; - /* Fastcall allocates the first two DWORD (SImode) or - smaller arguments to ECX and EDX if it isn't an - aggregate type . */ - if (cum->fastcall) - { - if (mode == BLKmode - || mode == DImode - || (type && AGGREGATE_TYPE_P (type))) - break; + return (df_regs_ever_live_p (regno) + && !call_used_regs[regno] + && !fixed_regs[regno] + && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed)); +} - /* ECX not EAX is the first allocated register. */ - if (regno == AX_REG) - regno = CX_REG; - } - return gen_rtx_REG (mode, regno); - } - break; +/* Return number of saved general prupose registers. */ - case E_DFmode: - if (cum->float_in_sse == -1) - error_p = true; - if (cum->float_in_sse < 2) - break; - /* FALLTHRU */ - case E_SFmode: - if (cum->float_in_sse == -1) - error_p = true; - if (cum->float_in_sse < 1) - break; - /* FALLTHRU */ - case E_TImode: - /* In 32bit, we pass TImode in xmm registers. */ - case E_V16QImode: - case E_V8HImode: - case E_V4SImode: - case E_V2DImode: - case E_V4SFmode: - case E_V2DFmode: - if (!type || !AGGREGATE_TYPE_P (type)) - { - if (cum->sse_nregs) - return gen_reg_or_parallel (mode, orig_mode, - cum->sse_regno + FIRST_SSE_REG); - } - break; +static int +ix86_nsaved_regs (void) +{ + int nregs = 0; + int regno; - case E_OImode: - case E_XImode: - /* OImode and XImode shouldn't be used directly. 
*/ - gcc_unreachable (); + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) + nregs ++; + return nregs; +} - case E_V64QImode: - case E_V32HImode: - case E_V16SImode: - case E_V8DImode: - case E_V16SFmode: - case E_V8DFmode: - case E_V8SFmode: - case E_V8SImode: - case E_V32QImode: - case E_V16HImode: - case E_V4DFmode: - case E_V4DImode: - if (!type || !AGGREGATE_TYPE_P (type)) - { - if (cum->sse_nregs) - return gen_reg_or_parallel (mode, orig_mode, - cum->sse_regno + FIRST_SSE_REG); - } - break; +/* Return number of saved SSE registers. */ - case E_V8QImode: - case E_V4HImode: - case E_V2SImode: - case E_V2SFmode: - case E_V1TImode: - case E_V1DImode: - if (!type || !AGGREGATE_TYPE_P (type)) - { - if (cum->mmx_nregs) - return gen_reg_or_parallel (mode, orig_mode, - cum->mmx_regno + FIRST_MMX_REG); - } - break; - } - if (error_p) - { - cum->float_in_sse = 0; - error ("calling %qD with SSE calling convention without " - "SSE/SSE2 enabled", cum->decl); - sorry ("this is a GCC bug that can be worked around by adding " - "attribute used to function called"); - } +static int +ix86_nsaved_sseregs (void) +{ + int nregs = 0; + int regno; - return NULL_RTX; + if (!TARGET_64BIT_MS_ABI) + return 0; + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true)) + nregs ++; + return nregs; } -static rtx -function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode, - machine_mode orig_mode, const_tree type, bool named) +/* Given FROM and TO register numbers, say whether this elimination is + allowed. If stack alignment is needed, we can only replace argument + pointer with hard frame pointer, or replace frame pointer with stack + pointer. Otherwise, frame pointer elimination is automatically + handled and all other eliminations are valid. */ + +static bool +ix86_can_eliminate (const int from, const int to) { - /* Handle a hidden AL argument containing number of registers - for varargs x86-64 functions. */ - if (mode == VOIDmode) - return GEN_INT (cum->maybe_vaarg - ? (cum->sse_nregs < 0 - ? X86_64_SSE_REGPARM_MAX - : cum->sse_regno) - : -1); + if (stack_realign_fp) + return ((from == ARG_POINTER_REGNUM + && to == HARD_FRAME_POINTER_REGNUM) + || (from == FRAME_POINTER_REGNUM + && to == STACK_POINTER_REGNUM)); + else + return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true; +} - switch (mode) +/* Return the offset between two registers, one to be eliminated, and the other + its replacement, at the start of a routine. */ + +HOST_WIDE_INT +ix86_initial_elimination_offset (int from, int to) +{ + struct ix86_frame &frame = cfun->machine->frame; + + if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM) + return frame.hard_frame_pointer_offset; + else if (from == FRAME_POINTER_REGNUM + && to == HARD_FRAME_POINTER_REGNUM) + return frame.hard_frame_pointer_offset - frame.frame_pointer_offset; + else { - default: - break; + gcc_assert (to == STACK_POINTER_REGNUM); - case E_V8SFmode: - case E_V8SImode: - case E_V32QImode: - case E_V16HImode: - case E_V4DFmode: - case E_V4DImode: - case E_V16SFmode: - case E_V16SImode: - case E_V64QImode: - case E_V32HImode: - case E_V8DFmode: - case E_V8DImode: - /* Unnamed 256 and 512bit vector mode parameters are passed on stack. 
*/ - if (!named) - return NULL; - break; - } + if (from == ARG_POINTER_REGNUM) + return frame.stack_pointer_offset; - return construct_container (mode, orig_mode, type, 0, cum->nregs, - cum->sse_nregs, - &x86_64_int_parameter_registers [cum->regno], - cum->sse_regno); + gcc_assert (from == FRAME_POINTER_REGNUM); + return frame.stack_pointer_offset - frame.frame_pointer_offset; + } } +/* In a dynamically-aligned function, we can't know the offset from + stack pointer to frame pointer, so we must ensure that setjmp + eliminates fp against the hard fp (%ebp) rather than trying to + index from %esp up to the top of the frame across a gap that is + of unknown (at compile-time) size. */ static rtx -function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode, - machine_mode orig_mode, bool named, - HOST_WIDE_INT bytes) +ix86_builtin_setjmp_frame_value (void) { - unsigned int regno; - - /* We need to add clobber for MS_ABI->SYSV ABI calls in expand_call. - We use value of -2 to specify that current function call is MSABI. */ - if (mode == VOIDmode) - return GEN_INT (-2); - - /* If we've run out of registers, it goes on the stack. */ - if (cum->nregs == 0) - return NULL_RTX; - - regno = x86_64_ms_abi_int_parameter_registers[cum->regno]; - - /* Only floating point modes are passed in anything but integer regs. */ - if (TARGET_SSE && (mode == SFmode || mode == DFmode)) - { - if (named) - regno = cum->regno + FIRST_SSE_REG; - else - { - rtx t1, t2; + return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx; +} - /* Unnamed floating parameters are passed in both the - SSE and integer registers. */ - t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG); - t2 = gen_rtx_REG (mode, regno); - t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx); - t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx); - return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2)); - } - } - /* Handle aggregated types passed in register. */ - if (orig_mode == BLKmode) +/* Emits a warning for unsupported msabi to sysv pro/epilogues. */ +void warn_once_call_ms2sysv_xlogues (const char *feature) +{ + static bool warned_once = false; + if (!warned_once) { - if (bytes > 0 && bytes <= 8) - mode = (bytes > 4 ? DImode : SImode); - if (mode == BLKmode) - mode = DImode; + warning (0, "%<-mcall-ms2sysv-xlogues%> is not compatible with %s", + feature); + warned_once = true; } +} - return gen_reg_or_parallel (mode, orig_mode, regno); +/* Return the probing interval for -fstack-clash-protection. */ + +static HOST_WIDE_INT +get_probe_interval (void) +{ + if (flag_stack_clash_protection) + return (HOST_WIDE_INT_1U + << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL)); + else + return (HOST_WIDE_INT_1U << STACK_CHECK_PROBE_INTERVAL_EXP); } -/* Return where to put the arguments to a function. - Return zero to push the argument on the stack, or a hard register in which to store the argument. +/* When using -fsplit-stack, the allocation routines set a field in + the TCB to the bottom of the stack plus this much space, measured + in bytes. */ - MODE is the argument's machine mode. TYPE is the data type of the - argument. It is null for libcalls where that information may not be - available. CUM gives information about the preceding args and about - the function being called. NAMED is nonzero if this argument is a - named parameter (otherwise it is an extra parameter matching an - ellipsis). 
*/ +#define SPLIT_STACK_AVAILABLE 256 -static rtx -ix86_function_arg (cumulative_args_t cum_v, machine_mode omode, - const_tree type, bool named) +/* Fill structure ix86_frame about frame of currently computed function. */ + +static void +ix86_compute_frame_layout (void) { - CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); - machine_mode mode = omode; - HOST_WIDE_INT bytes, words; - rtx arg; + struct ix86_frame *frame = &cfun->machine->frame; + struct machine_function *m = cfun->machine; + unsigned HOST_WIDE_INT stack_alignment_needed; + HOST_WIDE_INT offset; + unsigned HOST_WIDE_INT preferred_alignment; + HOST_WIDE_INT size = get_frame_size (); + HOST_WIDE_INT to_allocate; - if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL) + /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit + * ms_abi functions that call a sysv function. We now need to prune away + * cases where it should be disabled. */ + if (TARGET_64BIT && m->call_ms2sysv) { - gcc_assert (type != NULL_TREE); - if (POINTER_TYPE_P (type)) + gcc_assert (TARGET_64BIT_MS_ABI); + gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES); + gcc_assert (!TARGET_SEH); + gcc_assert (TARGET_SSE); + gcc_assert (!ix86_using_red_zone ()); + + if (crtl->calls_eh_return) { - /* This is the pointer argument. */ - gcc_assert (TYPE_MODE (type) == Pmode); - /* It is at -WORD(AP) in the current frame in interrupt and - exception handlers. */ - arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD); + gcc_assert (!reload_completed); + m->call_ms2sysv = false; + warn_once_call_ms2sysv_xlogues ("__builtin_eh_return"); + } + + else if (ix86_static_chain_on_stack) + { + gcc_assert (!reload_completed); + m->call_ms2sysv = false; + warn_once_call_ms2sysv_xlogues ("static call chains"); } + + /* Finally, compute which registers the stub will manage. */ else { - gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION - && TREE_CODE (type) == INTEGER_TYPE - && TYPE_MODE (type) == word_mode); - /* The error code is the word-mode integer argument at - -2 * WORD(AP) in the current frame of the exception - handler. */ - arg = gen_rtx_MEM (word_mode, - plus_constant (Pmode, - arg_pointer_rtx, - -2 * UNITS_PER_WORD)); + unsigned count = xlogue_layout::count_stub_managed_regs (); + m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS; + m->call_ms2sysv_pad_in = 0; } - return arg; } - if (mode == BLKmode) - bytes = int_size_in_bytes (type); - else - bytes = GET_MODE_SIZE (mode); - words = CEIL (bytes, UNITS_PER_WORD); + frame->nregs = ix86_nsaved_regs (); + frame->nsseregs = ix86_nsaved_sseregs (); - /* To simplify the code below, represent vector types with a vector mode - even if MMX/SSE are not active. */ - if (type && TREE_CODE (type) == VECTOR_TYPE) - mode = type_natural_mode (type, cum, false); + /* 64-bit MS ABI seem to require stack alignment to be always 16, + except for function prologues, leaf functions and when the defult + incoming stack boundary is overriden at command line or via + force_align_arg_pointer attribute. - if (TARGET_64BIT) + Darwin's ABI specifies 128b alignment for both 32 and 64 bit variants + at call sites, including profile function calls. + */ + if (((TARGET_64BIT_MS_ABI || TARGET_MACHO) + && crtl->preferred_stack_boundary < 128) + && (!crtl->is_leaf || cfun->calls_alloca != 0 + || ix86_current_function_calls_tls_descriptor + || (TARGET_MACHO && crtl->profile) + || ix86_incoming_stack_boundary < 128)) { - enum calling_abi call_abi = cum ? 
cum->call_abi : ix86_abi; - - if (call_abi == MS_ABI) - arg = function_arg_ms_64 (cum, mode, omode, named, bytes); - else - arg = function_arg_64 (cum, mode, omode, type, named); + crtl->preferred_stack_boundary = 128; + crtl->stack_alignment_needed = 128; } - else - arg = function_arg_32 (cum, mode, omode, type, bytes, words); - /* Track if there are outgoing arguments on stack. */ - if (arg == NULL_RTX && cum->caller) - cfun->machine->outgoing_args_on_stack = true; - - return arg; -} - -/* A C expression that indicates when an argument must be passed by - reference. If nonzero for an argument, a copy of that argument is - made in memory and a pointer to the argument is passed instead of - the argument itself. The pointer is passed in whatever way is - appropriate for passing a pointer to that type. */ + stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT; + preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT; -static bool -ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode, - const_tree type, bool) -{ - CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); + gcc_assert (!size || stack_alignment_needed); + gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT); + gcc_assert (preferred_alignment <= stack_alignment_needed); - if (TARGET_64BIT) + /* The only ABI saving SSE regs should be 64-bit ms_abi. */ + gcc_assert (TARGET_64BIT || !frame->nsseregs); + if (TARGET_64BIT && m->call_ms2sysv) { - enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi; - - /* See Windows x64 Software Convention. */ - if (call_abi == MS_ABI) - { - HOST_WIDE_INT msize = GET_MODE_SIZE (mode); + gcc_assert (stack_alignment_needed >= 16); + gcc_assert (!frame->nsseregs); + } - if (type) - { - /* Arrays are passed by reference. */ - if (TREE_CODE (type) == ARRAY_TYPE) - return true; + /* For SEH we have to limit the amount of code movement into the prologue. + At present we do this via a BLOCKAGE, at which point there's very little + scheduling that can be done, which means that there's very little point + in doing anything except PUSHs. */ + if (TARGET_SEH) + m->use_fast_prologue_epilogue = false; + else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun))) + { + int count = frame->nregs; + struct cgraph_node *node = cgraph_node::get (current_function_decl); - if (RECORD_OR_UNION_TYPE_P (type)) - { - /* Structs/unions of sizes other than 8, 16, 32, or 64 bits - are passed by reference. */ - msize = int_size_in_bytes (type); - } - } + /* The fast prologue uses move instead of push to save registers. This + is significantly longer, but also executes faster as modern hardware + can execute the moves in parallel, but can't do that for push/pop. - /* __m128 is passed by reference. */ - return msize != 1 && msize != 2 && msize != 4 && msize != 8; - } - else if (type && int_size_in_bytes (type) == -1) - return true; + Be careful about choosing what prologue to emit: When function takes + many instructions to execute we may use slow version as well as in + case function is known to be outside hot spot (this is known with + feedback only). Weight the size of function by number of registers + to save as it is cheap to use one or two push instructions but very + slow to use many of them. 
*/ + if (count) + count = (count - 1) * FAST_PROLOGUE_INSN_COUNT; + if (node->frequency < NODE_FREQUENCY_NORMAL + || (flag_branch_probabilities + && node->frequency < NODE_FREQUENCY_HOT)) + m->use_fast_prologue_epilogue = false; + else + m->use_fast_prologue_epilogue + = !expensive_function_p (count); } - return false; -} + frame->save_regs_using_mov + = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue + /* If static stack checking is enabled and done with probes, + the registers need to be saved before allocating the frame. */ + && flag_stack_check != STATIC_BUILTIN_STACK_CHECK); -/* Return true when TYPE should be 128bit aligned for 32bit argument - passing ABI. XXX: This function is obsolete and is only used for - checking psABI compatibility with previous versions of GCC. */ + /* Skip return address and error code in exception handler. */ + offset = INCOMING_FRAME_SP_OFFSET; -static bool -ix86_compat_aligned_value_p (const_tree type) -{ - machine_mode mode = TYPE_MODE (type); - if (((TARGET_SSE && SSE_REG_MODE_P (mode)) - || mode == TDmode - || mode == TFmode - || mode == TCmode) - && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128)) - return true; - if (TYPE_ALIGN (type) < 128) - return false; + /* Skip pushed static chain. */ + if (ix86_static_chain_on_stack) + offset += UNITS_PER_WORD; - if (AGGREGATE_TYPE_P (type)) - { - /* Walk the aggregates recursively. */ - switch (TREE_CODE (type)) - { - case RECORD_TYPE: - case UNION_TYPE: - case QUAL_UNION_TYPE: - { - tree field; + /* Skip saved base pointer. */ + if (frame_pointer_needed) + offset += UNITS_PER_WORD; + frame->hfp_save_offset = offset; - /* Walk all the structure fields. */ - for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) - { - if (TREE_CODE (field) == FIELD_DECL - && ix86_compat_aligned_value_p (TREE_TYPE (field))) - return true; - } - break; - } + /* The traditional frame pointer location is at the top of the frame. */ + frame->hard_frame_pointer_offset = offset; - case ARRAY_TYPE: - /* Just for use if some languages passes arrays by value. */ - if (ix86_compat_aligned_value_p (TREE_TYPE (type))) - return true; - break; + /* Register save area */ + offset += frame->nregs * UNITS_PER_WORD; + frame->reg_save_offset = offset; - default: - gcc_unreachable (); - } - } - return false; -} + /* On SEH target, registers are pushed just before the frame pointer + location. */ + if (TARGET_SEH) + frame->hard_frame_pointer_offset = offset; -/* Return the alignment boundary for MODE and TYPE with alignment ALIGN. - XXX: This function is obsolete and is only used for checking psABI - compatibility with previous versions of GCC. */ + /* Calculate the size of the va-arg area (not including padding, if any). */ + frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size; -static unsigned int -ix86_compat_function_arg_boundary (machine_mode mode, - const_tree type, unsigned int align) -{ - /* In 32bit, only _Decimal128 and __float128 are aligned to their - natural boundaries. */ - if (!TARGET_64BIT && mode != TDmode && mode != TFmode) + /* Also adjust stack_realign_offset for the largest alignment of + stack slot actually used. */ + if (stack_realign_fp + || (cfun->machine->max_used_stack_alignment != 0 + && (offset % cfun->machine->max_used_stack_alignment) != 0)) { - /* i386 ABI defines all arguments to be 4 byte aligned. We have to - make an exception for SSE modes since these require 128bit - alignment. - - The handling here differs from field_alignment. 
ICC aligns MMX - arguments to 4 byte boundaries, while structure fields are aligned - to 8 byte boundaries. */ - if (!type) - { - if (!(TARGET_SSE && SSE_REG_MODE_P (mode))) - align = PARM_BOUNDARY; - } - else - { - if (!ix86_compat_aligned_value_p (type)) - align = PARM_BOUNDARY; - } - } - if (align > BIGGEST_ALIGNMENT) - align = BIGGEST_ALIGNMENT; - return align; -} - -/* Return true when TYPE should be 128bit aligned for 32bit argument - passing ABI. */ - -static bool -ix86_contains_aligned_value_p (const_tree type) -{ - machine_mode mode = TYPE_MODE (type); - - if (mode == XFmode || mode == XCmode) - return false; - - if (TYPE_ALIGN (type) < 128) - return false; + /* We may need a 16-byte aligned stack for the remainder of the + register save area, but the stack frame for the local function + may require a greater alignment if using AVX/2/512. In order + to avoid wasting space, we first calculate the space needed for + the rest of the register saves, add that to the stack pointer, + and then realign the stack to the boundary of the start of the + frame for the local function. */ + HOST_WIDE_INT space_needed = 0; + HOST_WIDE_INT sse_reg_space_needed = 0; - if (AGGREGATE_TYPE_P (type)) - { - /* Walk the aggregates recursively. */ - switch (TREE_CODE (type)) + if (TARGET_64BIT) { - case RECORD_TYPE: - case UNION_TYPE: - case QUAL_UNION_TYPE: - { - tree field; + if (m->call_ms2sysv) + { + m->call_ms2sysv_pad_in = 0; + space_needed = xlogue_layout::get_instance ().get_stack_space_used (); + } - /* Walk all the structure fields. */ - for (field = TYPE_FIELDS (type); - field; - field = DECL_CHAIN (field)) - { - if (TREE_CODE (field) == FIELD_DECL - && ix86_contains_aligned_value_p (TREE_TYPE (field))) - return true; - } - break; - } + else if (frame->nsseregs) + /* The only ABI that has saved SSE registers (Win64) also has a + 16-byte aligned default stack. However, many programs violate + the ABI, and Wine64 forces stack realignment to compensate. */ + space_needed = frame->nsseregs * 16; - case ARRAY_TYPE: - /* Just for use if some languages passes arrays by value. */ - if (ix86_contains_aligned_value_p (TREE_TYPE (type))) - return true; - break; + sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16); - default: - gcc_unreachable (); + /* 64-bit frame->va_arg_size should always be a multiple of 16, but + rounding to be pedantic. */ + space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16); } - } - else - return TYPE_ALIGN (type) >= 128; + else + space_needed = frame->va_arg_size; - return false; -} + /* Record the allocation size required prior to the realignment AND. */ + frame->stack_realign_allocate = space_needed; -/* Gives the alignment boundary, in bits, of an argument with the - specified mode and type. */ + /* The re-aligned stack starts at frame->stack_realign_offset. Values + before this point are not directly comparable with values below + this point. Use sp_valid_at to determine if the stack pointer is + valid for a given offset, fp_valid_at for the frame pointer, or + choose_baseaddr to have a base register chosen for you. -static unsigned int -ix86_function_arg_boundary (machine_mode mode, const_tree type) -{ - unsigned int align; - if (type) - { - /* Since the main variant type is used for call, we convert it to - the main variant type. 
*/ - type = TYPE_MAIN_VARIANT (type); - align = TYPE_ALIGN (type); - if (TYPE_EMPTY_P (type)) - return PARM_BOUNDARY; + Note that the result of (frame->stack_realign_offset + & (stack_alignment_needed - 1)) may not equal zero. */ + offset = ROUND_UP (offset + space_needed, stack_alignment_needed); + frame->stack_realign_offset = offset - space_needed; + frame->sse_reg_save_offset = frame->stack_realign_offset + + sse_reg_space_needed; } - else - align = GET_MODE_ALIGNMENT (mode); - if (align < PARM_BOUNDARY) - align = PARM_BOUNDARY; else { - static bool warned; - unsigned int saved_align = align; + frame->stack_realign_offset = offset; - if (!TARGET_64BIT) + if (TARGET_64BIT && m->call_ms2sysv) { - /* i386 ABI defines XFmode arguments to be 4 byte aligned. */ - if (!type) - { - if (mode == XFmode || mode == XCmode) - align = PARM_BOUNDARY; - } - else if (!ix86_contains_aligned_value_p (type)) - align = PARM_BOUNDARY; - - if (align < 128) - align = PARM_BOUNDARY; + m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD); + offset += xlogue_layout::get_instance ().get_stack_space_used (); } - if (warn_psabi - && !warned - && align != ix86_compat_function_arg_boundary (mode, type, - saved_align)) + /* Align and set SSE register save area. */ + else if (frame->nsseregs) { - warned = true; - inform (input_location, - "the ABI for passing parameters with %d-byte" - " alignment has changed in GCC 4.6", - align / BITS_PER_UNIT); + /* If the incoming stack boundary is at least 16 bytes, or DRAP is + required and the DRAP re-alignment boundary is at least 16 bytes, + then we want the SSE register save area properly aligned. */ + if (ix86_incoming_stack_boundary >= 128 + || (stack_realign_drap && stack_alignment_needed >= 16)) + offset = ROUND_UP (offset, 16); + offset += frame->nsseregs * 16; } + frame->sse_reg_save_offset = offset; + offset += frame->va_arg_size; } - return align; -} - -/* Return true if N is a possible register number of function value. */ - -static bool -ix86_function_value_regno_p (const unsigned int regno) -{ - switch (regno) - { - case AX_REG: - return true; - case DX_REG: - return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI); - case DI_REG: - case SI_REG: - return TARGET_64BIT && ix86_cfun_abi () != MS_ABI; + /* Align start of frame for local function. When a function call + is removed, it may become a leaf function. But if argument may + be passed on stack, we need to align the stack when there is no + tail call. */ + if (m->call_ms2sysv + || frame->va_arg_size != 0 + || size != 0 + || !crtl->is_leaf + || (!crtl->tail_call_emit + && cfun->machine->outgoing_args_on_stack) + || cfun->calls_alloca + || ix86_current_function_calls_tls_descriptor) + offset = ROUND_UP (offset, stack_alignment_needed); - /* Complex values are returned in %st(0)/%st(1) pair. */ - case ST0_REG: - case ST1_REG: - /* TODO: The function should depend on current function ABI but - builtins.c would need updating then. Therefore we use the - default ABI. */ - if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI) - return false; - return TARGET_FLOAT_RETURNS_IN_80387; + /* Frame pointer points here. */ + frame->frame_pointer_offset = offset; - /* Complex values are returned in %xmm0/%xmm1 pair. */ - case XMM0_REG: - case XMM1_REG: - return TARGET_SSE; + offset += size; - case MM0_REG: - if (TARGET_MACHO || TARGET_64BIT) - return false; - return TARGET_MMX; + /* Add outgoing arguments area. Can be skipped if we eliminated + all the function calls as dead code. 
+ Skipping is however impossible when function calls alloca. Alloca + expander assumes that last crtl->outgoing_args_size + of stack frame are unused. */ + if (ACCUMULATE_OUTGOING_ARGS + && (!crtl->is_leaf || cfun->calls_alloca + || ix86_current_function_calls_tls_descriptor)) + { + offset += crtl->outgoing_args_size; + frame->outgoing_arguments_size = crtl->outgoing_args_size; } + else + frame->outgoing_arguments_size = 0; - return false; -} + /* Align stack boundary. Only needed if we're calling another function + or using alloca. */ + if (!crtl->is_leaf || cfun->calls_alloca + || ix86_current_function_calls_tls_descriptor) + offset = ROUND_UP (offset, preferred_alignment); -/* Define how to find the value returned by a function. - VALTYPE is the data type of the value (as a tree). - If the precise function being called is known, FUNC is its FUNCTION_DECL; - otherwise, FUNC is 0. */ + /* We've reached end of stack frame. */ + frame->stack_pointer_offset = offset; -static rtx -function_value_32 (machine_mode orig_mode, machine_mode mode, - const_tree fntype, const_tree fn) -{ - unsigned int regno; + /* Size prologue needs to allocate. */ + to_allocate = offset - frame->sse_reg_save_offset; - /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where - we normally prevent this case when mmx is not available. However - some ABIs may require the result to be returned like DImode. */ - if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8) - regno = FIRST_MMX_REG; + if ((!to_allocate && frame->nregs <= 1) + || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)) + /* If stack clash probing needs a loop, then it needs a + scratch register. But the returned register is only guaranteed + to be safe to use after register saves are complete. So if + stack clash protections are enabled and the allocated frame is + larger than the probe interval, then use pushes to save + callee saved registers. */ + || (flag_stack_clash_protection && to_allocate > get_probe_interval ())) + frame->save_regs_using_mov = false; - /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where - we prevent this case when sse is not available. However some ABIs - may require the result to be returned like integer TImode. */ - else if (mode == TImode - || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16)) - regno = FIRST_SSE_REG; - - /* 32-byte vector modes in %ymm0. */ - else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32) - regno = FIRST_SSE_REG; - - /* 64-byte vector modes in %zmm0. */ - else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64) - regno = FIRST_SSE_REG; - - /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */ - else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387) - regno = FIRST_FLOAT_REG; + if (ix86_using_red_zone () + && crtl->sp_is_unchanging + && crtl->is_leaf + && !ix86_pc_thunk_call_expanded + && !ix86_current_function_calls_tls_descriptor) + { + frame->red_zone_size = to_allocate; + if (frame->save_regs_using_mov) + frame->red_zone_size += frame->nregs * UNITS_PER_WORD; + if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE) + frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE; + } else - /* Most things go in %eax. */ - regno = AX_REG; + frame->red_zone_size = 0; + frame->stack_pointer_offset -= frame->red_zone_size; - /* Override FP return register with %xmm0 for local functions when - SSE math is enabled or for functions with sseregparm attribute. 
*/ - if ((fn || fntype) && (mode == SFmode || mode == DFmode)) + /* The SEH frame pointer location is near the bottom of the frame. + This is enforced by the fact that the difference between the + stack pointer and the frame pointer is limited to 240 bytes in + the unwind data structure. */ + if (TARGET_SEH) { - int sse_level = ix86_function_sseregparm (fntype, fn, false); - if (sse_level == -1) + HOST_WIDE_INT diff; + + /* If we can leave the frame pointer where it is, do so. Also, returns + the establisher frame for __builtin_frame_address (0). */ + diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset; + if (diff <= SEH_MAX_FRAME_SIZE + && (diff > 240 || (diff & 15) != 0) + && !crtl->accesses_prior_frames) { - error ("calling %qD with SSE calling convention without " - "SSE/SSE2 enabled", fn); - sorry ("this is a GCC bug that can be worked around by adding " - "attribute used to function called"); + /* Ideally we'd determine what portion of the local stack frame + (within the constraint of the lowest 240) is most heavily used. + But without that complication, simply bias the frame pointer + by 128 bytes so as to maximize the amount of the local stack + frame that is addressable with 8-bit offsets. */ + frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128; } - else if ((sse_level >= 1 && mode == SFmode) - || (sse_level == 2 && mode == DFmode)) - regno = FIRST_SSE_REG; } - - /* OImode shouldn't be used directly. */ - gcc_assert (mode != OImode); - - return gen_rtx_REG (orig_mode, regno); } -static rtx -function_value_64 (machine_mode orig_mode, machine_mode mode, - const_tree valtype) -{ - rtx ret; - - /* Handle libcalls, which don't provide a type node. */ - if (valtype == NULL) - { - unsigned int regno; +/* This is semi-inlined memory_address_length, but simplified + since we know that we're always dealing with reg+offset, and + to avoid having to create and discard all that rtl. */ - switch (mode) - { - case E_SFmode: - case E_SCmode: - case E_DFmode: - case E_DCmode: - case E_TFmode: - case E_SDmode: - case E_DDmode: - case E_TDmode: - regno = FIRST_SSE_REG; - break; - case E_XFmode: - case E_XCmode: - regno = FIRST_FLOAT_REG; - break; - case E_TCmode: - return NULL; - default: - regno = AX_REG; - } +static inline int +choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset) +{ + int len = 4; - return gen_rtx_REG (mode, regno); - } - else if (POINTER_TYPE_P (valtype)) + if (offset == 0) { - /* Pointers are always returned in word_mode. */ - mode = word_mode; + /* EBP and R13 cannot be encoded without an offset. */ + len = (regno == BP_REG || regno == R13_REG); } + else if (IN_RANGE (offset, -128, 127)) + len = 1; - ret = construct_container (mode, orig_mode, valtype, 1, - X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX, - x86_64_int_return_registers, 0); - - /* For zero sized structures, construct_container returns NULL, but we - need to keep rest of compiler happy by returning meaningful value. */ - if (!ret) - ret = gen_rtx_REG (orig_mode, AX_REG); + /* ESP and R12 must be encoded with a SIB byte. */ + if (regno == SP_REG || regno == R12_REG) + len++; - return ret; + return len; } -static rtx -function_value_ms_32 (machine_mode orig_mode, machine_mode mode, - const_tree fntype, const_tree fn, const_tree valtype) -{ - unsigned int regno; - - /* Floating point return values in %st(0) - (unless -mno-fp-ret-in-387 or aggregate type of up to 8 bytes). 
*/ - if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387 - && (GET_MODE_SIZE (mode) > 8 - || valtype == NULL_TREE || !AGGREGATE_TYPE_P (valtype))) - { - regno = FIRST_FLOAT_REG; - return gen_rtx_REG (orig_mode, regno); - } - else - return function_value_32(orig_mode, mode, fntype,fn); -} +/* Determine if the stack pointer is valid for accessing the CFA_OFFSET in + the frame save area. The register is saved at CFA - CFA_OFFSET. */ -static rtx -function_value_ms_64 (machine_mode orig_mode, machine_mode mode, - const_tree valtype) +static bool +sp_valid_at (HOST_WIDE_INT cfa_offset) { - unsigned int regno = AX_REG; - - if (TARGET_SSE) + const struct machine_frame_state &fs = cfun->machine->fs; + if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset) { - switch (GET_MODE_SIZE (mode)) - { - case 16: - if (valtype != NULL_TREE - && !VECTOR_INTEGER_TYPE_P (valtype) - && !VECTOR_INTEGER_TYPE_P (valtype) - && !INTEGRAL_TYPE_P (valtype) - && !VECTOR_FLOAT_TYPE_P (valtype)) - break; - if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode)) - && !COMPLEX_MODE_P (mode)) - regno = FIRST_SSE_REG; - break; - case 8: - case 4: - if (valtype != NULL_TREE && AGGREGATE_TYPE_P (valtype)) - break; - if (mode == SFmode || mode == DFmode) - regno = FIRST_SSE_REG; - break; - default: - break; - } + /* Validate that the cfa_offset isn't in a "no-man's land". */ + gcc_assert (cfa_offset <= fs.sp_realigned_fp_last); + return false; } - return gen_rtx_REG (orig_mode, regno); + return fs.sp_valid; } -static rtx -ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl, - machine_mode orig_mode, machine_mode mode) -{ - const_tree fn, fntype; +/* Determine if the frame pointer is valid for accessing the CFA_OFFSET in + the frame save area. The register is saved at CFA - CFA_OFFSET. */ - fn = NULL_TREE; - if (fntype_or_decl && DECL_P (fntype_or_decl)) - fn = fntype_or_decl; - fntype = fn ? TREE_TYPE (fn) : fntype_or_decl; - - if (ix86_function_type_abi (fntype) == MS_ABI) +static inline bool +fp_valid_at (HOST_WIDE_INT cfa_offset) +{ + const struct machine_frame_state &fs = cfun->machine->fs; + if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last) { - if (TARGET_64BIT) - return function_value_ms_64 (orig_mode, mode, valtype); - else - return function_value_ms_32 (orig_mode, mode, fntype, fn, valtype); + /* Validate that the cfa_offset isn't in a "no-man's land". */ + gcc_assert (cfa_offset >= fs.sp_realigned_offset); + return false; } - else if (TARGET_64BIT) - return function_value_64 (orig_mode, mode, valtype); - else - return function_value_32 (orig_mode, mode, fntype, fn); + return fs.fp_valid; } -static rtx -ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool) -{ - machine_mode mode, orig_mode; +/* Choose a base register based upon alignment requested, speed and/or + size. */ - orig_mode = TYPE_MODE (valtype); - mode = type_natural_mode (valtype, NULL, true); - return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode); -} +static void +choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg, + HOST_WIDE_INT &base_offset, + unsigned int align_reqested, unsigned int *align) +{ + const struct machine_function *m = cfun->machine; + unsigned int hfp_align; + unsigned int drap_align; + unsigned int sp_align; + bool hfp_ok = fp_valid_at (cfa_offset); + bool drap_ok = m->fs.drap_valid; + bool sp_ok = sp_valid_at (cfa_offset); -/* Pointer function arguments and return values are promoted to - word_mode for normal functions. 
*/ + hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY; -static machine_mode -ix86_promote_function_mode (const_tree type, machine_mode mode, - int *punsignedp, const_tree fntype, - int for_return) -{ - if (cfun->machine->func_type == TYPE_NORMAL - && type != NULL_TREE - && POINTER_TYPE_P (type)) + /* Filter out any registers that don't meet the requested alignment + criteria. */ + if (align_reqested) { - *punsignedp = POINTERS_EXTEND_UNSIGNED; - return word_mode; - } - return default_promote_function_mode (type, mode, punsignedp, fntype, - for_return); -} - -/* Return true if a structure, union or array with MODE containing FIELD - should be accessed using BLKmode. */ - -static bool -ix86_member_type_forces_blk (const_tree field, machine_mode mode) -{ - /* Union with XFmode must be in BLKmode. */ - return (mode == XFmode - && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE - || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE)); -} - -rtx -ix86_libcall_value (machine_mode mode) -{ - return ix86_function_value_1 (NULL, NULL, mode, mode); -} - -/* Return true iff type is returned in memory. */ + if (m->fs.realigned) + hfp_align = drap_align = sp_align = crtl->stack_alignment_needed; + /* SEH unwind code does do not currently support REG_CFA_EXPRESSION + notes (which we would need to use a realigned stack pointer), + so disable on SEH targets. */ + else if (m->fs.sp_realigned) + sp_align = crtl->stack_alignment_needed; -static bool -ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED) -{ -#ifdef SUBTARGET_RETURN_IN_MEMORY - return SUBTARGET_RETURN_IN_MEMORY (type, fntype); -#else - const machine_mode mode = type_natural_mode (type, NULL, true); - HOST_WIDE_INT size; + hfp_ok = hfp_ok && hfp_align >= align_reqested; + drap_ok = drap_ok && drap_align >= align_reqested; + sp_ok = sp_ok && sp_align >= align_reqested; + } - if (TARGET_64BIT) + if (m->use_fast_prologue_epilogue) { - if (ix86_function_type_abi (fntype) == MS_ABI) - { - size = int_size_in_bytes (type); - - /* __m128 is returned in xmm0. */ - if ((!type || VECTOR_INTEGER_TYPE_P (type) - || INTEGRAL_TYPE_P (type) - || VECTOR_FLOAT_TYPE_P (type)) - && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode)) - && !COMPLEX_MODE_P (mode) - && (GET_MODE_SIZE (mode) == 16 || size == 16)) - return false; + /* Choose the base register most likely to allow the most scheduling + opportunities. Generally FP is valid throughout the function, + while DRAP must be reloaded within the epilogue. But choose either + over the SP due to increased encoding size. */ - /* Otherwise, the size must be exactly in [1248]. */ - return size != 1 && size != 2 && size != 4 && size != 8; + if (hfp_ok) + { + base_reg = hard_frame_pointer_rtx; + base_offset = m->fs.fp_offset - cfa_offset; } - else + else if (drap_ok) { - int needed_intregs, needed_sseregs; - - return examine_argument (mode, type, 1, - &needed_intregs, &needed_sseregs); + base_reg = crtl->drap_reg; + base_offset = 0 - cfa_offset; + } + else if (sp_ok) + { + base_reg = stack_pointer_rtx; + base_offset = m->fs.sp_offset - cfa_offset; } } else { - size = int_size_in_bytes (type); + HOST_WIDE_INT toffset; + int len = 16, tlen; - /* Intel MCU psABI returns scalars and aggregates no larger than 8 - bytes in registers. */ - if (TARGET_IAMCU) - return VECTOR_MODE_P (mode) || size < 0 || size > 8; + /* Choose the base register with the smallest address encoding. + With a tie, choose FP > DRAP > SP. 
*/ + if (sp_ok) + { + base_reg = stack_pointer_rtx; + base_offset = m->fs.sp_offset - cfa_offset; + len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset); + } + if (drap_ok) + { + toffset = 0 - cfa_offset; + tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset); + if (tlen <= len) + { + base_reg = crtl->drap_reg; + base_offset = toffset; + len = tlen; + } + } + if (hfp_ok) + { + toffset = m->fs.fp_offset - cfa_offset; + tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset); + if (tlen <= len) + { + base_reg = hard_frame_pointer_rtx; + base_offset = toffset; + len = tlen; + } + } + } - if (mode == BLKmode) - return true; + /* Set the align return value. */ + if (align) + { + if (base_reg == stack_pointer_rtx) + *align = sp_align; + else if (base_reg == crtl->drap_reg) + *align = drap_align; + else if (base_reg == hard_frame_pointer_rtx) + *align = hfp_align; + } +} - if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8) - return false; +/* Return an RTX that points to CFA_OFFSET within the stack frame and + the alignment of address. If ALIGN is non-null, it should point to + an alignment value (in bits) that is preferred or zero and will + recieve the alignment of the base register that was selected, + irrespective of rather or not CFA_OFFSET is a multiple of that + alignment value. If it is possible for the base register offset to be + non-immediate then SCRATCH_REGNO should specify a scratch register to + use. - if (VECTOR_MODE_P (mode) || mode == TImode) - { - /* User-created vectors small enough to fit in EAX. */ - if (size < 8) - return false; + The valid base registers are taken from CFUN->MACHINE->FS. */ - /* Unless ABI prescibes otherwise, - MMX/3dNow values are returned in MM0 if available. */ - - if (size == 8) - return TARGET_VECT8_RETURNS || !TARGET_MMX; +static rtx +choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align, + unsigned int scratch_regno = INVALID_REGNUM) +{ + rtx base_reg = NULL; + HOST_WIDE_INT base_offset = 0; - /* SSE values are returned in XMM0 if available. */ - if (size == 16) - return !TARGET_SSE; + /* If a specific alignment is requested, try to get a base register + with that alignment first. */ + if (align && *align) + choose_basereg (cfa_offset, base_reg, base_offset, *align, align); - /* AVX values are returned in YMM0 if available. */ - if (size == 32) - return !TARGET_AVX; + if (!base_reg) + choose_basereg (cfa_offset, base_reg, base_offset, 0, align); - /* AVX512F values are returned in ZMM0 if available. */ - if (size == 64) - return !TARGET_AVX512F; - } + gcc_assert (base_reg != NULL); - if (mode == XFmode) - return false; + rtx base_offset_rtx = GEN_INT (base_offset); - if (size > 12) - return true; + if (!x86_64_immediate_operand (base_offset_rtx, Pmode)) + { + gcc_assert (scratch_regno != INVALID_REGNUM); - /* OImode shouldn't be used directly. */ - gcc_assert (mode != OImode); + rtx scratch_reg = gen_rtx_REG (Pmode, scratch_regno); + emit_move_insn (scratch_reg, base_offset_rtx); - return false; + return gen_rtx_PLUS (Pmode, base_reg, scratch_reg); } -#endif + + return plus_constant (Pmode, base_reg, base_offset); } - -/* Create the va_list data type. */ +/* Emit code to save registers in the prologue. 
*/ -static tree -ix86_build_builtin_va_list_64 (void) +static void +ix86_emit_save_regs (void) { - tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl; - - record = lang_hooks.types.make_type (RECORD_TYPE); - type_decl = build_decl (BUILTINS_LOCATION, - TYPE_DECL, get_identifier ("__va_list_tag"), record); + unsigned int regno; + rtx_insn *insn; - f_gpr = build_decl (BUILTINS_LOCATION, - FIELD_DECL, get_identifier ("gp_offset"), - unsigned_type_node); - f_fpr = build_decl (BUILTINS_LOCATION, - FIELD_DECL, get_identifier ("fp_offset"), - unsigned_type_node); - f_ovf = build_decl (BUILTINS_LOCATION, - FIELD_DECL, get_identifier ("overflow_arg_area"), - ptr_type_node); - f_sav = build_decl (BUILTINS_LOCATION, - FIELD_DECL, get_identifier ("reg_save_area"), - ptr_type_node); + for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; ) + if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) + { + insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno))); + RTX_FRAME_RELATED_P (insn) = 1; + } +} - va_list_gpr_counter_field = f_gpr; - va_list_fpr_counter_field = f_fpr; +/* Emit a single register save at CFA - CFA_OFFSET. */ - DECL_FIELD_CONTEXT (f_gpr) = record; - DECL_FIELD_CONTEXT (f_fpr) = record; - DECL_FIELD_CONTEXT (f_ovf) = record; - DECL_FIELD_CONTEXT (f_sav) = record; +static void +ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno, + HOST_WIDE_INT cfa_offset) +{ + struct machine_function *m = cfun->machine; + rtx reg = gen_rtx_REG (mode, regno); + rtx mem, addr, base, insn; + unsigned int align = GET_MODE_ALIGNMENT (mode); - TYPE_STUB_DECL (record) = type_decl; - TYPE_NAME (record) = type_decl; - TYPE_FIELDS (record) = f_gpr; - DECL_CHAIN (f_gpr) = f_fpr; - DECL_CHAIN (f_fpr) = f_ovf; - DECL_CHAIN (f_ovf) = f_sav; + addr = choose_baseaddr (cfa_offset, &align); + mem = gen_frame_mem (mode, addr); - layout_type (record); + /* The location aligment depends upon the base register. */ + align = MIN (GET_MODE_ALIGNMENT (mode), align); + gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1))); + set_mem_align (mem, align); - TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"), - NULL_TREE, TYPE_ATTRIBUTES (record)); + insn = emit_insn (gen_rtx_SET (mem, reg)); + RTX_FRAME_RELATED_P (insn) = 1; - /* The correct type is an array type of one element. */ - return build_array_type (record, build_index_type (size_zero_node)); -} - -/* Setup the builtin va_list data type and for 64-bit the additional - calling convention specific va_list data types. */ + base = addr; + if (GET_CODE (base) == PLUS) + base = XEXP (base, 0); + gcc_checking_assert (REG_P (base)); -static tree -ix86_build_builtin_va_list (void) -{ - if (TARGET_64BIT) + /* When saving registers into a re-aligned local stack frame, avoid + any tricky guessing by dwarf2out. */ + if (m->fs.realigned) { - /* Initialize ABI specific va_list builtin types. - - In lto1, we can encounter two va_list types: - - one as a result of the type-merge across TUs, and - - the one constructed here. - These two types will not have the same TYPE_MAIN_VARIANT, and therefore - a type identity check in canonical_va_list_type based on - TYPE_MAIN_VARIANT (which we used to have) will not work. - Instead, we tag each va_list_type_node with its unique attribute, and - look for the attribute in the type identity check in - canonical_va_list_type. 
- - Tagging sysv_va_list_type_node directly with the attribute is - problematic since it's a array of one record, which will degrade into a - pointer to record when used as parameter (see build_va_arg comments for - an example), dropping the attribute in the process. So we tag the - record instead. */ + gcc_checking_assert (stack_realign_drap); - /* For SYSV_ABI we use an array of one record. */ - sysv_va_list_type_node = ix86_build_builtin_va_list_64 (); - - /* For MS_ABI we use plain pointer to argument area. */ - tree char_ptr_type = build_pointer_type (char_type_node); - tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE, - TYPE_ATTRIBUTES (char_ptr_type)); - ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr); + if (regno == REGNO (crtl->drap_reg)) + { + /* A bit of a hack. We force the DRAP register to be saved in + the re-aligned stack frame, which provides us with a copy + of the CFA that will last past the prologue. Install it. */ + gcc_checking_assert (cfun->machine->fs.fp_valid); + addr = plus_constant (Pmode, hard_frame_pointer_rtx, + cfun->machine->fs.fp_offset - cfa_offset); + mem = gen_rtx_MEM (mode, addr); + add_reg_note (insn, REG_CFA_DEF_CFA, mem); + } + else + { + /* The frame pointer is a stable reference within the + aligned frame. Use it. */ + gcc_checking_assert (cfun->machine->fs.fp_valid); + addr = plus_constant (Pmode, hard_frame_pointer_rtx, + cfun->machine->fs.fp_offset - cfa_offset); + mem = gen_rtx_MEM (mode, addr); + add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg)); + } + } - return ((ix86_abi == MS_ABI) - ? ms_va_list_type_node - : sysv_va_list_type_node); + else if (base == stack_pointer_rtx && m->fs.sp_realigned + && cfa_offset >= m->fs.sp_realigned_offset) + { + gcc_checking_assert (stack_realign_fp); + add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg)); } - else + + /* The memory may not be relative to the current CFA register, + which means that we may need to generate a new pattern for + use by the unwind info. */ + else if (base != m->fs.cfa_reg) { - /* For i386 we use plain pointer to argument area. */ - return build_pointer_type (char_type_node); + addr = plus_constant (Pmode, m->fs.cfa_reg, + m->fs.cfa_offset - cfa_offset); + mem = gen_rtx_MEM (mode, addr); + add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg)); } } -/* Worker function for TARGET_SETUP_INCOMING_VARARGS. */ - +/* Emit code to save registers using MOV insns. + First register is stored at CFA - CFA_OFFSET. */ static void -setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum) +ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset) { - rtx save_area, mem; - alias_set_type set; - int i, max; + unsigned int regno; - /* GPR size of varargs save area. */ - if (cfun->va_list_gpr_size) - ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD; - else - ix86_varargs_gpr_size = 0; + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) + { + ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset); + cfa_offset -= UNITS_PER_WORD; + } +} - /* FPR size of varargs save area. We don't need it if we don't pass - anything in SSE registers. */ - if (TARGET_SSE && cfun->va_list_fpr_size) - ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16; - else - ix86_varargs_fpr_size = 0; +/* Emit code to save SSE registers using MOV insns. + First register is stored at CFA - CFA_OFFSET. 
*/ +static void +ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset) +{ + unsigned int regno; - if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size) - return; + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true)) + { + ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset); + cfa_offset -= GET_MODE_SIZE (V4SFmode); + } +} - save_area = frame_pointer_rtx; - set = get_varargs_alias_set (); +static GTY(()) rtx queued_cfa_restores; - max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD; - if (max > X86_64_REGPARM_MAX) - max = X86_64_REGPARM_MAX; +/* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack + manipulation insn. The value is on the stack at CFA - CFA_OFFSET. + Don't add the note if the previously saved value will be left untouched + within stack red-zone till return, as unwinders can find the same value + in the register and on the stack. */ - for (i = cum->regno; i < max; i++) - { - mem = gen_rtx_MEM (word_mode, - plus_constant (Pmode, save_area, i * UNITS_PER_WORD)); - MEM_NOTRAP_P (mem) = 1; - set_mem_alias_set (mem, set); - emit_move_insn (mem, - gen_rtx_REG (word_mode, - x86_64_int_parameter_registers[i])); - } +static void +ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset) +{ + if (!crtl->shrink_wrapped + && cfa_offset <= cfun->machine->fs.red_zone_offset) + return; - if (ix86_varargs_fpr_size) + if (insn) { - machine_mode smode; - rtx_code_label *label; - rtx test; + add_reg_note (insn, REG_CFA_RESTORE, reg); + RTX_FRAME_RELATED_P (insn) = 1; + } + else + queued_cfa_restores + = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores); +} - /* Now emit code to save SSE registers. The AX parameter contains number - of SSE parameter registers used to call this function, though all we - actually check here is the zero/non-zero status. */ +/* Add queued REG_CFA_RESTORE notes if any to INSN. */ - label = gen_label_rtx (); - test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx); - emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1), - label)); +static void +ix86_add_queued_cfa_restore_notes (rtx insn) +{ + rtx last; + if (!queued_cfa_restores) + return; + for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1)) + ; + XEXP (last, 1) = REG_NOTES (insn); + REG_NOTES (insn) = queued_cfa_restores; + queued_cfa_restores = NULL_RTX; + RTX_FRAME_RELATED_P (insn) = 1; +} - /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if - we used movdqa (i.e. TImode) instead? Perhaps even better would - be if we could determine the real mode of the data, via a hook - into pass_stdarg. Ignore all that for now. */ - smode = V4SFmode; - if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode)) - crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode); +/* Expand prologue or epilogue stack adjustment. + The pattern exist to put a dependency on all ebp-based memory accesses. + STYLE should be negative if instructions should be marked as frame related, + zero if %r11 register is live and cannot be freely used and positive + otherwise. 
*/ - max = cum->sse_regno + cfun->va_list_fpr_size / 16; - if (max > X86_64_SSE_REGPARM_MAX) - max = X86_64_SSE_REGPARM_MAX; +static rtx +pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset, + int style, bool set_cfa) +{ + struct machine_function *m = cfun->machine; + rtx insn; + bool add_frame_related_expr = false; - for (i = cum->sse_regno; i < max; ++i) + if (Pmode == SImode) + insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset); + else if (x86_64_immediate_operand (offset, DImode)) + insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset); + else + { + rtx tmp; + /* r11 is used by indirect sibcall return as well, set before the + epilogue and used after the epilogue. */ + if (style) + tmp = gen_rtx_REG (DImode, R11_REG); + else { - mem = plus_constant (Pmode, save_area, - i * 16 + ix86_varargs_gpr_size); - mem = gen_rtx_MEM (smode, mem); - MEM_NOTRAP_P (mem) = 1; - set_mem_alias_set (mem, set); - set_mem_align (mem, GET_MODE_ALIGNMENT (smode)); - - emit_move_insn (mem, gen_rtx_REG (smode, GET_SSE_REGNO (i))); + gcc_assert (src != hard_frame_pointer_rtx + && dest != hard_frame_pointer_rtx); + tmp = hard_frame_pointer_rtx; } + insn = emit_insn (gen_rtx_SET (tmp, offset)); + if (style < 0) + add_frame_related_expr = true; - emit_label (label); + insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp); } -} - -static void -setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum) -{ - alias_set_type set = get_varargs_alias_set (); - int i; - /* Reset to zero, as there might be a sysv vaarg used - before. */ - ix86_varargs_gpr_size = 0; - ix86_varargs_fpr_size = 0; + insn = emit_insn (insn); + if (style >= 0) + ix86_add_queued_cfa_restore_notes (insn); - for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++) + if (set_cfa) { - rtx reg, mem; + rtx r; - mem = gen_rtx_MEM (Pmode, - plus_constant (Pmode, virtual_incoming_args_rtx, - i * UNITS_PER_WORD)); - MEM_NOTRAP_P (mem) = 1; - set_mem_alias_set (mem, set); + gcc_assert (m->fs.cfa_reg == src); + m->fs.cfa_offset += INTVAL (offset); + m->fs.cfa_reg = dest; - reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]); - emit_move_insn (mem, reg); + r = gen_rtx_PLUS (Pmode, src, offset); + r = gen_rtx_SET (dest, r); + add_reg_note (insn, REG_CFA_ADJUST_CFA, r); + RTX_FRAME_RELATED_P (insn) = 1; + } + else if (style < 0) + { + RTX_FRAME_RELATED_P (insn) = 1; + if (add_frame_related_expr) + { + rtx r = gen_rtx_PLUS (Pmode, src, offset); + r = gen_rtx_SET (dest, r); + add_reg_note (insn, REG_FRAME_RELATED_EXPR, r); + } } -} -static void -ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode, - tree type, int *, int no_rtl) -{ - CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); - CUMULATIVE_ARGS next_cum; - tree fntype; + if (dest == stack_pointer_rtx) + { + HOST_WIDE_INT ooffset = m->fs.sp_offset; + bool valid = m->fs.sp_valid; + bool realigned = m->fs.sp_realigned; - /* This argument doesn't appear to be used anymore. Which is good, - because the old code here didn't suppress rtl generation. */ - gcc_assert (!no_rtl); + if (src == hard_frame_pointer_rtx) + { + valid = m->fs.fp_valid; + realigned = false; + ooffset = m->fs.fp_offset; + } + else if (src == crtl->drap_reg) + { + valid = m->fs.drap_valid; + realigned = false; + ooffset = 0; + } + else + { + /* Else there are two possibilities: SP itself, which we set + up as the default above. Or EH_RETURN_STACKADJ_RTX, which is + taken care of this by hand along the eh_return path. 
*/ + gcc_checking_assert (src == stack_pointer_rtx + || offset == const0_rtx); + } - if (!TARGET_64BIT) - return; + m->fs.sp_offset = ooffset - INTVAL (offset); + m->fs.sp_valid = valid; + m->fs.sp_realigned = realigned; + } + return insn; +} - fntype = TREE_TYPE (current_function_decl); +/* Find an available register to be used as dynamic realign argument + pointer regsiter. Such a register will be written in prologue and + used in begin of body, so it must not be + 1. parameter passing register. + 2. GOT pointer. + We reuse static-chain register if it is available. Otherwise, we + use DI for i386 and R13 for x86-64. We chose R13 since it has + shorter encoding. - /* For varargs, we do not want to skip the dummy va_dcl argument. - For stdargs, we do want to skip the last named argument. */ - next_cum = *cum; - if (stdarg_p (fntype)) - ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type, - true); + Return: the regno of chosen register. */ - if (cum->call_abi == MS_ABI) - setup_incoming_varargs_ms_64 (&next_cum); +static unsigned int +find_drap_reg (void) +{ + tree decl = cfun->decl; + + /* Always use callee-saved register if there are no caller-saved + registers. */ + if (TARGET_64BIT) + { + /* Use R13 for nested function or function need static chain. + Since function with tail call may use any caller-saved + registers in epilogue, DRAP must not use caller-saved + register in such case. */ + if (DECL_STATIC_CHAIN (decl) + || cfun->machine->no_caller_saved_registers + || crtl->tail_call_emit) + return R13_REG; + + return R10_REG; + } else - setup_incoming_varargs_64 (&next_cum); + { + /* Use DI for nested function or function need static chain. + Since function with tail call may use any caller-saved + registers in epilogue, DRAP must not use caller-saved + register in such case. */ + if (DECL_STATIC_CHAIN (decl) + || cfun->machine->no_caller_saved_registers + || crtl->tail_call_emit) + return DI_REG; + + /* Reuse static chain register if it isn't used for parameter + passing. */ + if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2) + { + unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl)); + if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0) + return CX_REG; + } + return DI_REG; + } } -static void -ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v, - machine_mode mode, - tree type, - int *pretend_size ATTRIBUTE_UNUSED, - int no_rtl) +/* Return minimum incoming stack alignment. */ + +static unsigned int +ix86_minimum_incoming_stack_boundary (bool sibcall) { - CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); - CUMULATIVE_ARGS next_cum; - tree fntype; - int max; + unsigned int incoming_stack_boundary; - gcc_assert (!no_rtl); + /* Stack of interrupt handler is aligned to 128 bits in 64bit mode. */ + if (cfun->machine->func_type != TYPE_NORMAL) + incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY; + /* Prefer the one specified at command line. */ + else if (ix86_user_incoming_stack_boundary) + incoming_stack_boundary = ix86_user_incoming_stack_boundary; + /* In 32bit, use MIN_STACK_BOUNDARY for incoming stack boundary + if -mstackrealign is used, it isn't used for sibcall check and + estimated stack alignment is 128bit. */ + else if (!sibcall + && ix86_force_align_arg_pointer + && crtl->stack_alignment_estimated == 128) + incoming_stack_boundary = MIN_STACK_BOUNDARY; + else + incoming_stack_boundary = ix86_default_incoming_stack_boundary; - /* Do nothing if we use plain pointer to argument area. 
*/ - if (!TARGET_64BIT || cum->call_abi == MS_ABI) - return; + /* Incoming stack alignment can be changed on individual functions + via force_align_arg_pointer attribute. We use the smallest + incoming stack boundary. */ + if (incoming_stack_boundary > MIN_STACK_BOUNDARY + && lookup_attribute ("force_align_arg_pointer", + TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl)))) + incoming_stack_boundary = MIN_STACK_BOUNDARY; - fntype = TREE_TYPE (current_function_decl); + /* The incoming stack frame has to be aligned at least at + parm_stack_boundary. */ + if (incoming_stack_boundary < crtl->parm_stack_boundary) + incoming_stack_boundary = crtl->parm_stack_boundary; - /* For varargs, we do not want to skip the dummy va_dcl argument. - For stdargs, we do want to skip the last named argument. */ - next_cum = *cum; - if (stdarg_p (fntype)) - ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type, - true); + /* Stack at entrance of main is aligned by runtime. We use the + smallest incoming stack boundary. */ + if (incoming_stack_boundary > MAIN_STACK_BOUNDARY + && DECL_NAME (current_function_decl) + && MAIN_NAME_P (DECL_NAME (current_function_decl)) + && DECL_FILE_SCOPE_P (current_function_decl)) + incoming_stack_boundary = MAIN_STACK_BOUNDARY; - max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD; - if (max > X86_64_REGPARM_MAX) - max = X86_64_REGPARM_MAX; + return incoming_stack_boundary; } +/* Update incoming stack boundary and estimated stack alignment. */ -/* Checks if TYPE is of kind va_list char *. */ - -static bool -is_va_list_char_pointer (tree type) +static void +ix86_update_stack_boundary (void) { - tree canonic; + ix86_incoming_stack_boundary + = ix86_minimum_incoming_stack_boundary (false); - /* For 32-bit it is always true. */ - if (!TARGET_64BIT) - return true; - canonic = ix86_canonical_va_list_type (type); - return (canonic == ms_va_list_type_node - || (ix86_abi == MS_ABI && canonic == va_list_type_node)); + /* x86_64 vararg needs 16byte stack alignment for register save area. */ + if (TARGET_64BIT + && cfun->stdarg + && crtl->stack_alignment_estimated < 128) + crtl->stack_alignment_estimated = 128; + + /* __tls_get_addr needs to be called with 16-byte aligned stack. */ + if (ix86_tls_descriptor_calls_expanded_in_cfun + && crtl->preferred_stack_boundary < 128) + crtl->preferred_stack_boundary = 128; } -/* Implement va_start. */ +/* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is + needed or an rtx for DRAP otherwise. */ -static void -ix86_va_start (tree valist, rtx nextarg) +static rtx +ix86_get_drap_rtx (void) { - HOST_WIDE_INT words, n_gpr, n_fpr; - tree f_gpr, f_fpr, f_ovf, f_sav; - tree gpr, fpr, ovf, sav, t; - tree type; - rtx ovf_rtx; + /* We must use DRAP if there are outgoing arguments on stack and + ACCUMULATE_OUTGOING_ARGS is false. */ + if (ix86_force_drap + || (cfun->machine->outgoing_args_on_stack + && !ACCUMULATE_OUTGOING_ARGS)) + crtl->need_drap = true; - if (flag_split_stack - && cfun->machine->split_stack_varargs_pointer == NULL_RTX) + if (stack_realign_drap) { - unsigned int scratch_regno; - - /* When we are splitting the stack, we can't refer to the stack - arguments using internal_arg_pointer, because they may be on - the old stack. The split stack prologue will arrange to - leave a pointer to the old stack arguments in a scratch - register, which we here copy to a pseudo-register. The split - stack prologue can't set the pseudo-register directly because - it (the prologue) runs before any registers have been saved. 
*/ + /* Assign DRAP to vDRAP and returns vDRAP */ + unsigned int regno = find_drap_reg (); + rtx drap_vreg; + rtx arg_ptr; + rtx_insn *seq, *insn; - scratch_regno = split_stack_prologue_scratch_regno (); - if (scratch_regno != INVALID_REGNUM) - { - rtx reg; - rtx_insn *seq; + arg_ptr = gen_rtx_REG (Pmode, regno); + crtl->drap_reg = arg_ptr; - reg = gen_reg_rtx (Pmode); - cfun->machine->split_stack_varargs_pointer = reg; - - start_sequence (); - emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno)); - seq = get_insns (); - end_sequence (); + start_sequence (); + drap_vreg = copy_to_reg (arg_ptr); + seq = get_insns (); + end_sequence (); - push_topmost_sequence (); - emit_insn_after (seq, entry_of_function ()); - pop_topmost_sequence (); + insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ())); + if (!optimize) + { + add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg); + RTX_FRAME_RELATED_P (insn) = 1; } + return drap_vreg; } + else + return NULL; +} - /* Only 64bit target needs something special. */ - if (is_va_list_char_pointer (TREE_TYPE (valist))) - { - if (cfun->machine->split_stack_varargs_pointer == NULL_RTX) - std_expand_builtin_va_start (valist, nextarg); - else - { - rtx va_r, next; +/* Handle the TARGET_INTERNAL_ARG_POINTER hook. */ - va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE); - next = expand_binop (ptr_mode, add_optab, - cfun->machine->split_stack_varargs_pointer, - crtl->args.arg_offset_rtx, - NULL_RTX, 0, OPTAB_LIB_WIDEN); - convert_move (va_r, next, 0); - } - return; - } +static rtx +ix86_internal_arg_pointer (void) +{ + return virtual_incoming_args_rtx; +} - f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node)); - f_fpr = DECL_CHAIN (f_gpr); - f_ovf = DECL_CHAIN (f_fpr); - f_sav = DECL_CHAIN (f_ovf); +struct scratch_reg { + rtx reg; + bool saved; +}; - valist = build_simple_mem_ref (valist); - TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node); - /* The following should be folded into the MEM_REF offset. */ - gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist), - f_gpr, NULL_TREE); - fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist), - f_fpr, NULL_TREE); - ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist), - f_ovf, NULL_TREE); - sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist), - f_sav, NULL_TREE); +/* Return a short-lived scratch register for use on function entry. + In 32-bit mode, it is valid only after the registers are saved + in the prologue. This register must be released by means of + release_scratch_register_on_entry once it is dead. */ - /* Count number of gp and fp argument registers used. */ - words = crtl->args.info.words; - n_gpr = crtl->args.info.regno; - n_fpr = crtl->args.info.sse_regno; +static void +get_scratch_register_on_entry (struct scratch_reg *sr) +{ + int regno; - if (cfun->va_list_gpr_size) + sr->saved = false; + + if (TARGET_64BIT) { - type = TREE_TYPE (gpr); - t = build2 (MODIFY_EXPR, type, - gpr, build_int_cst (type, n_gpr * 8)); - TREE_SIDE_EFFECTS (t) = 1; - expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); + /* We always use R11 in 64-bit mode. 
*/ + regno = R11_REG; } + else + { + tree decl = current_function_decl, fntype = TREE_TYPE (decl); + bool fastcall_p + = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE; + bool thiscall_p + = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE; + bool static_chain_p = DECL_STATIC_CHAIN (decl); + int regparm = ix86_function_regparm (fntype, decl); + int drap_regno + = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM; - if (TARGET_SSE && cfun->va_list_fpr_size) + /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax + for the static chain register. */ + if ((regparm < 1 || (fastcall_p && !static_chain_p)) + && drap_regno != AX_REG) + regno = AX_REG; + /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx + for the static chain register. */ + else if (thiscall_p && !static_chain_p && drap_regno != AX_REG) + regno = AX_REG; + else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG) + regno = DX_REG; + /* ecx is the static chain register. */ + else if (regparm < 3 && !fastcall_p && !thiscall_p + && !static_chain_p + && drap_regno != CX_REG) + regno = CX_REG; + else if (ix86_save_reg (BX_REG, true, false)) + regno = BX_REG; + /* esi is the static chain register. */ + else if (!(regparm == 3 && static_chain_p) + && ix86_save_reg (SI_REG, true, false)) + regno = SI_REG; + else if (ix86_save_reg (DI_REG, true, false)) + regno = DI_REG; + else + { + regno = (drap_regno == AX_REG ? DX_REG : AX_REG); + sr->saved = true; + } + } + + sr->reg = gen_rtx_REG (Pmode, regno); + if (sr->saved) { - type = TREE_TYPE (fpr); - t = build2 (MODIFY_EXPR, type, fpr, - build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX)); - TREE_SIDE_EFFECTS (t) = 1; - expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); + rtx_insn *insn = emit_insn (gen_push (sr->reg)); + RTX_FRAME_RELATED_P (insn) = 1; } +} - /* Find the overflow area. */ - type = TREE_TYPE (ovf); - if (cfun->machine->split_stack_varargs_pointer == NULL_RTX) - ovf_rtx = crtl->args.internal_arg_pointer; - else - ovf_rtx = cfun->machine->split_stack_varargs_pointer; - t = make_tree (type, ovf_rtx); - if (words != 0) - t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD); +/* Release a scratch register obtained from the preceding function. - t = build2 (MODIFY_EXPR, type, ovf, t); - TREE_SIDE_EFFECTS (t) = 1; - expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); + If RELEASE_VIA_POP is true, we just pop the register off the stack + to release it. This is what non-Linux systems use with -fstack-check. - if (ix86_varargs_gpr_size || ix86_varargs_fpr_size) + Otherwise we use OFFSET to locate the saved register and the + allocated stack space becomes part of the local frame and is + deallocated by the epilogue. */ + +static void +release_scratch_register_on_entry (struct scratch_reg *sr, HOST_WIDE_INT offset, + bool release_via_pop) +{ + if (sr->saved) { - /* Find the register save area. - Prologue of the function save it right above stack frame. */ - type = TREE_TYPE (sav); - t = make_tree (type, frame_pointer_rtx); - if (!ix86_varargs_gpr_size) - t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX); + if (release_via_pop) + { + struct machine_function *m = cfun->machine; + rtx x, insn = emit_insn (gen_pop (sr->reg)); - t = build2 (MODIFY_EXPR, type, sav, t); - TREE_SIDE_EFFECTS (t) = 1; - expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); + /* The RX FRAME_RELATED_P mechanism doesn't know about pop. 
*/ + RTX_FRAME_RELATED_P (insn) = 1; + x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD)); + x = gen_rtx_SET (stack_pointer_rtx, x); + add_reg_note (insn, REG_FRAME_RELATED_EXPR, x); + m->fs.sp_offset -= UNITS_PER_WORD; + } + else + { + rtx x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (offset)); + x = gen_rtx_SET (sr->reg, gen_rtx_MEM (word_mode, x)); + emit_insn (x); + } } } -/* Implement va_arg. */ +/* Emit code to adjust the stack pointer by SIZE bytes while probing it. -static tree -ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, - gimple_seq *post_p) -{ - static const int intreg[6] = { 0, 1, 2, 3, 4, 5 }; - tree f_gpr, f_fpr, f_ovf, f_sav; - tree gpr, fpr, ovf, sav, t; - int size, rsize; - tree lab_false, lab_over = NULL_TREE; - tree addr, t2; - rtx container; - int indirect_p = 0; - tree ptrtype; - machine_mode nat_mode; - unsigned int arg_boundary; + This differs from the next routine in that it tries hard to prevent + attacks that jump the stack guard. Thus it is never allowed to allocate + more than PROBE_INTERVAL bytes of stack space without a suitable + probe. - /* Only 64bit target needs something special. */ - if (is_va_list_char_pointer (TREE_TYPE (valist))) - return std_gimplify_va_arg_expr (valist, type, pre_p, post_p); + INT_REGISTERS_SAVED is true if integer registers have already been + pushed on the stack. */ - f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node)); - f_fpr = DECL_CHAIN (f_gpr); - f_ovf = DECL_CHAIN (f_fpr); - f_sav = DECL_CHAIN (f_ovf); +static void +ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size, + const bool int_registers_saved) +{ + struct machine_function *m = cfun->machine; - gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), - valist, f_gpr, NULL_TREE); + /* If this function does not statically allocate stack space, then + no probes are needed. */ + if (!size) + { + /* However, the allocation of space via pushes for register + saves could be viewed as allocating space, but without the + need to probe. */ + if (m->frame.nregs || m->frame.nsseregs || frame_pointer_needed) + dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true); + else + dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false); + return; + } - fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE); - ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE); - sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE); + /* If we are a noreturn function, then we have to consider the + possibility that we're called via a jump rather than a call. - indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false); - if (indirect_p) - type = build_pointer_type (type); - size = arg_int_size_in_bytes (type); - rsize = CEIL (size, UNITS_PER_WORD); + Thus we don't have the implicit probe generated by saving the + return address into the stack at the call. Thus, the stack + pointer could be anywhere in the guard page. The safe thing + to do is emit a probe now. - nat_mode = type_natural_mode (type, NULL, false); - switch (nat_mode) + The probe can be avoided if we have already emitted any callee + register saves into the stack or have a frame pointer (which will + have been saved as well). Those saves will function as implicit + probes. + + ?!? This should be revamped to work like aarch64 and s390 where + we track the offset from the most recent probe. Normally that + offset would be zero. 
For a noreturn function we would reset + it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT). Then + we just probe when we cross PROBE_INTERVAL. */ + if (TREE_THIS_VOLATILE (cfun->decl) + && !(m->frame.nregs || m->frame.nsseregs || frame_pointer_needed)) { - case E_V8SFmode: - case E_V8SImode: - case E_V32QImode: - case E_V16HImode: - case E_V4DFmode: - case E_V4DImode: - case E_V16SFmode: - case E_V16SImode: - case E_V64QImode: - case E_V32HImode: - case E_V8DFmode: - case E_V8DImode: - /* Unnamed 256 and 512bit vector mode parameters are passed on stack. */ - if (!TARGET_64BIT_MS_ABI) + /* We can safely use any register here since we're just going to push + its value and immediately pop it back. But we do try and avoid + argument passing registers so as not to introduce dependencies in + the pipeline. For 32 bit we use %esi and for 64 bit we use %rax. */ + rtx dummy_reg = gen_rtx_REG (word_mode, TARGET_64BIT ? AX_REG : SI_REG); + rtx_insn *insn_push = emit_insn (gen_push (dummy_reg)); + rtx_insn *insn_pop = emit_insn (gen_pop (dummy_reg)); + m->fs.sp_offset -= UNITS_PER_WORD; + if (m->fs.cfa_reg == stack_pointer_rtx) { - container = NULL; - break; + m->fs.cfa_offset -= UNITS_PER_WORD; + rtx x = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD); + x = gen_rtx_SET (stack_pointer_rtx, x); + add_reg_note (insn_push, REG_CFA_ADJUST_CFA, x); + RTX_FRAME_RELATED_P (insn_push) = 1; + x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); + x = gen_rtx_SET (stack_pointer_rtx, x); + add_reg_note (insn_pop, REG_CFA_ADJUST_CFA, x); + RTX_FRAME_RELATED_P (insn_pop) = 1; } - /* FALLTHRU */ - - default: - container = construct_container (nat_mode, TYPE_MODE (type), - type, 0, X86_64_REGPARM_MAX, - X86_64_SSE_REGPARM_MAX, intreg, - 0); - break; + emit_insn (gen_blockage ()); } - /* Pull the value out of the saved registers. */ - - addr = create_tmp_var (ptr_type_node, "addr"); + /* If we allocate less than the size of the guard statically, + then no probing is necessary, but we do need to allocate + the stack. */ + if (size < (1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE))) + { + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (-size), -1, + m->fs.cfa_reg == stack_pointer_rtx); + dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true); + return; + } - if (container) + /* We're allocating a large enough stack frame that we need to + emit probes. Either emit them inline or in a loop depending + on the size. */ + HOST_WIDE_INT probe_interval = get_probe_interval (); + if (size <= 4 * probe_interval) { - int needed_intregs, needed_sseregs; - bool need_temp; - tree int_addr, sse_addr; + HOST_WIDE_INT i; + for (i = probe_interval; i <= size; i += probe_interval) + { + /* Allocate PROBE_INTERVAL bytes. */ + rtx insn + = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (-probe_interval), -1, + m->fs.cfa_reg == stack_pointer_rtx); + add_reg_note (insn, REG_STACK_CHECK, const0_rtx); - lab_false = create_artificial_label (UNKNOWN_LOCATION); - lab_over = create_artificial_label (UNKNOWN_LOCATION); + /* And probe at *sp. */ + emit_stack_probe (stack_pointer_rtx); + emit_insn (gen_blockage ()); + } - examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs); + /* We need to allocate space for the residual, but we do not need + to probe the residual. 
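(To make the residual arithmetic concrete, here is a small worked example — purely illustrative, not part of the patch — using an assumed 4 KiB probe interval and a hypothetical 10000-byte frame. It mirrors the loop above and the residual adjustment that follows.)

    /* Standalone sketch: check that the probed intervals plus the
       unprobed residual add up to the frame size.  */
    #include <assert.h>

    int
    main (void)
    {
      const long probe_interval = 4096;   /* assumed guard/probe interval */
      const long size = 10000;            /* hypothetical frame size */
      long i, allocated = 0;

      for (i = probe_interval; i <= size; i += probe_interval)
        allocated += probe_interval;      /* sub $4096, %sp; or $0, (%sp) */

      long residual = i - probe_interval - size;  /* here: -1808 */
      allocated -= residual;              /* one final unprobed adjustment */

      assert (allocated == size);
      return 0;
    }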
*/ + HOST_WIDE_INT residual = (i - probe_interval - size); + if (residual) + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (residual), -1, + m->fs.cfa_reg == stack_pointer_rtx); + dump_stack_clash_frame_info (PROBE_INLINE, residual != 0); + } + else + { + /* We expect the GP registers to be saved when probes are used + as the probing sequences might need a scratch register and + the routine to allocate one assumes the integer registers + have already been saved. */ + gcc_assert (int_registers_saved); - need_temp = (!REG_P (container) - && ((needed_intregs && TYPE_ALIGN (type) > 64) - || TYPE_ALIGN (type) > 128)); + struct scratch_reg sr; + get_scratch_register_on_entry (&sr); - /* In case we are passing structure, verify that it is consecutive block - on the register save area. If not we need to do moves. */ - if (!need_temp && !REG_P (container)) - { - /* Verify that all registers are strictly consecutive */ - if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0)))) - { - int i; + /* If we needed to save a register, then account for any space + that was pushed (we are not going to pop the register when + we do the restore). */ + if (sr.saved) + size -= UNITS_PER_WORD; - for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++) - { - rtx slot = XVECEXP (container, 0, i); - if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i - || INTVAL (XEXP (slot, 1)) != i * 16) - need_temp = true; - } - } - else - { - int i; + /* Step 1: round SIZE down to a multiple of the interval. */ + HOST_WIDE_INT rounded_size = size & -probe_interval; - for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++) - { - rtx slot = XVECEXP (container, 0, i); - if (REGNO (XEXP (slot, 0)) != (unsigned int) i - || INTVAL (XEXP (slot, 1)) != i * 8) - need_temp = true; - } - } - } - if (!need_temp) - { - int_addr = addr; - sse_addr = addr; - } + /* Step 2: compute final value of the loop counter. Use lea if + possible. */ + rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size); + rtx insn; + if (address_no_seg_operand (addr, Pmode)) + insn = emit_insn (gen_rtx_SET (sr.reg, addr)); else { - int_addr = create_tmp_var (ptr_type_node, "int_addr"); - sse_addr = create_tmp_var (ptr_type_node, "sse_addr"); - } - - /* First ensure that we fit completely in registers. */ - if (needed_intregs) - { - t = build_int_cst (TREE_TYPE (gpr), - (X86_64_REGPARM_MAX - needed_intregs + 1) * 8); - t = build2 (GE_EXPR, boolean_type_node, gpr, t); - t2 = build1 (GOTO_EXPR, void_type_node, lab_false); - t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE); - gimplify_and_add (t, pre_p); + emit_move_insn (sr.reg, GEN_INT (-rounded_size)); + insn = emit_insn (gen_rtx_SET (sr.reg, + gen_rtx_PLUS (Pmode, sr.reg, + stack_pointer_rtx))); } - if (needed_sseregs) + if (m->fs.cfa_reg == stack_pointer_rtx) { - t = build_int_cst (TREE_TYPE (fpr), - (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16 - + X86_64_REGPARM_MAX * 8); - t = build2 (GE_EXPR, boolean_type_node, fpr, t); - t2 = build1 (GOTO_EXPR, void_type_node, lab_false); - t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE); - gimplify_and_add (t, pre_p); + add_reg_note (insn, REG_CFA_DEF_CFA, + plus_constant (Pmode, sr.reg, + m->fs.cfa_offset + rounded_size)); + RTX_FRAME_RELATED_P (insn) = 1; } - /* Compute index to start of area used for integer regs. 
*/ - if (needed_intregs) - { - /* int_addr = gpr + sav; */ - t = fold_build_pointer_plus (sav, gpr); - gimplify_assign (int_addr, t, pre_p); - } - if (needed_sseregs) + /* Step 3: the loop. */ + rtx size_rtx = GEN_INT (rounded_size); + insn = emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, + size_rtx)); + if (m->fs.cfa_reg == stack_pointer_rtx) { - /* sse_addr = fpr + sav; */ - t = fold_build_pointer_plus (sav, fpr); - gimplify_assign (sse_addr, t, pre_p); + m->fs.cfa_offset += rounded_size; + add_reg_note (insn, REG_CFA_DEF_CFA, + plus_constant (Pmode, stack_pointer_rtx, + m->fs.cfa_offset)); + RTX_FRAME_RELATED_P (insn) = 1; } - if (need_temp) - { - int i, prev_size = 0; - tree temp = create_tmp_var (type, "va_arg_tmp"); + m->fs.sp_offset += rounded_size; + emit_insn (gen_blockage ()); - /* addr = &temp; */ - t = build1 (ADDR_EXPR, build_pointer_type (type), temp); - gimplify_assign (addr, t, pre_p); + /* Step 4: adjust SP if we cannot assert at compile-time that SIZE + is equal to ROUNDED_SIZE. */ - for (i = 0; i < XVECLEN (container, 0); i++) - { - rtx slot = XVECEXP (container, 0, i); - rtx reg = XEXP (slot, 0); - machine_mode mode = GET_MODE (reg); - tree piece_type; - tree addr_type; - tree daddr_type; - tree src_addr, src; - int src_offset; - tree dest_addr, dest; - int cur_size = GET_MODE_SIZE (mode); + if (size != rounded_size) + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (rounded_size - size), -1, + m->fs.cfa_reg == stack_pointer_rtx); + dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size); - gcc_assert (prev_size <= INTVAL (XEXP (slot, 1))); - prev_size = INTVAL (XEXP (slot, 1)); - if (prev_size + cur_size > size) - { - cur_size = size - prev_size; - unsigned int nbits = cur_size * BITS_PER_UNIT; - if (!int_mode_for_size (nbits, 1).exists (&mode)) - mode = QImode; - } - piece_type = lang_hooks.types.type_for_mode (mode, 1); - if (mode == GET_MODE (reg)) - addr_type = build_pointer_type (piece_type); - else - addr_type = build_pointer_type_for_mode (piece_type, ptr_mode, - true); - daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode, - true); + /* This does not deallocate the space reserved for the scratch + register. That will be deallocated in the epilogue. */ + release_scratch_register_on_entry (&sr, size, false); + } - if (SSE_REGNO_P (REGNO (reg))) - { - src_addr = sse_addr; - src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16; - } - else - { - src_addr = int_addr; - src_offset = REGNO (reg) * 8; - } - src_addr = fold_convert (addr_type, src_addr); - src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset); + /* Make sure nothing is scheduled before we are done. */ + emit_insn (gen_blockage ()); +} - dest_addr = fold_convert (daddr_type, addr); - dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size); - if (cur_size == GET_MODE_SIZE (mode)) - { - src = build_va_arg_indirect_ref (src_addr); - dest = build_va_arg_indirect_ref (dest_addr); +/* Emit code to adjust the stack pointer by SIZE bytes while probing it. - gimplify_assign (dest, src, pre_p); - } - else - { - tree copy - = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY), - 3, dest_addr, src_addr, - size_int (cur_size)); - gimplify_and_add (copy, pre_p); - } - prev_size += cur_size; - } - } + INT_REGISTERS_SAVED is true if integer registers have already been + pushed on the stack. 
*/ - if (needed_intregs) - { - t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr, - build_int_cst (TREE_TYPE (gpr), needed_intregs * 8)); - gimplify_assign (gpr, t, pre_p); - } +static void +ix86_adjust_stack_and_probe (HOST_WIDE_INT size, + const bool int_registers_saved) +{ + /* We skip the probe for the first interval + a small dope of 4 words and + probe that many bytes past the specified size to maintain a protection + area at the botton of the stack. */ + const int dope = 4 * UNITS_PER_WORD; + rtx size_rtx = GEN_INT (size), last; - if (needed_sseregs) - { - t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr, - build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16)); - gimplify_assign (unshare_expr (fpr), t, pre_p); - } + /* See if we have a constant small number of probes to generate. If so, + that's the easy case. The run-time loop is made up of 9 insns in the + generic case while the compile-time loop is made up of 3+2*(n-1) insns + for n # of intervals. */ + if (size <= 4 * get_probe_interval ()) + { + HOST_WIDE_INT i, adjust; + bool first_probe = true; - gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over)); + /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for + values of N from 1 until it exceeds SIZE. If only one probe is + needed, this will not generate any code. Then adjust and probe + to PROBE_INTERVAL + SIZE. */ + for (i = get_probe_interval (); i < size; i += get_probe_interval ()) + { + if (first_probe) + { + adjust = 2 * get_probe_interval () + dope; + first_probe = false; + } + else + adjust = get_probe_interval (); - gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false)); - } + emit_insn (gen_rtx_SET (stack_pointer_rtx, + plus_constant (Pmode, stack_pointer_rtx, + -adjust))); + emit_stack_probe (stack_pointer_rtx); + } - /* ... otherwise out of the overflow area. */ + if (first_probe) + adjust = size + get_probe_interval () + dope; + else + adjust = size + get_probe_interval () - i; - /* When we align parameter on stack for caller, if the parameter - alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be - aligned at MAX_SUPPORTED_STACK_ALIGNMENT. We will match callee - here with caller. */ - arg_boundary = ix86_function_arg_boundary (VOIDmode, type); - if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT) - arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT; + emit_insn (gen_rtx_SET (stack_pointer_rtx, + plus_constant (Pmode, stack_pointer_rtx, + -adjust))); + emit_stack_probe (stack_pointer_rtx); - /* Care for on-stack alignment if needed. */ - if (arg_boundary <= 64 || size == 0) - t = ovf; - else - { - HOST_WIDE_INT align = arg_boundary / 8; - t = fold_build_pointer_plus_hwi (ovf, align - 1); - t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, - build_int_cst (TREE_TYPE (t), -align)); + /* Adjust back to account for the additional first interval. */ + last = emit_insn (gen_rtx_SET (stack_pointer_rtx, + plus_constant (Pmode, stack_pointer_rtx, + (get_probe_interval () + + dope)))); } - gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue); - gimplify_assign (addr, t, pre_p); + /* Otherwise, do the same as above, but in a loop. Note that we must be + extra careful with variables wrapping around because we might be at + the very top (or the very bottom) of the address space and we have + to be able to handle this case properly; in particular, we use an + equality test for the loop condition. 
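(Before the loop variant below, the constant-size path just above can be sanity-checked numerically. A minimal standalone sketch — illustrative only, assuming a 4 KiB probe interval, a 32-byte dope on 64-bit, and a hypothetical 10000-byte frame — reproduces its adjustments and confirms the net change equals SIZE.)

    /* Model of the small-size case above; hypothetical numbers,
       not part of the patch.  */
    #include <assert.h>

    int
    main (void)
    {
      const long probe_interval = 4096;   /* assumed */
      const long dope = 4 * 8;            /* 4 words of 8 bytes on 64-bit */
      const long size = 10000;            /* hypothetical frame size */
      long i, net = 0, adjust;
      int first_probe = 1;

      for (i = probe_interval; i < size; i += probe_interval)
        {
          adjust = first_probe ? 2 * probe_interval + dope : probe_interval;
          first_probe = 0;
          net += adjust;                  /* sub + probe */
        }

      adjust = first_probe ? size + probe_interval + dope
                           : size + probe_interval - i;
      net += adjust;                      /* final sub + probe */

      net -= probe_interval + dope;       /* adjust back the extra interval */
      assert (net == size);
      return 0;
    }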
*/ + else + { + /* We expect the GP registers to be saved when probes are used + as the probing sequences might need a scratch register and + the routine to allocate one assumes the integer registers + have already been saved. */ + gcc_assert (int_registers_saved); - t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD); - gimplify_assign (unshare_expr (ovf), t, pre_p); + HOST_WIDE_INT rounded_size; + struct scratch_reg sr; - if (container) - gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over)); + get_scratch_register_on_entry (&sr); - ptrtype = build_pointer_type_for_mode (type, ptr_mode, true); - addr = fold_convert (ptrtype, addr); + /* If we needed to save a register, then account for any space + that was pushed (we are not going to pop the register when + we do the restore). */ + if (sr.saved) + size -= UNITS_PER_WORD; - if (indirect_p) - addr = build_va_arg_indirect_ref (addr); - return build_va_arg_indirect_ref (addr); -} - -/* Return true if OPNUM's MEM should be matched - in movabs* patterns. */ + /* Step 1: round SIZE to the previous multiple of the interval. */ -bool -ix86_check_movabs (rtx insn, int opnum) -{ - rtx set, mem; + rounded_size = ROUND_DOWN (size, get_probe_interval ()); - set = PATTERN (insn); - if (GET_CODE (set) == PARALLEL) - set = XVECEXP (set, 0, 0); - gcc_assert (GET_CODE (set) == SET); - mem = XEXP (set, opnum); - while (SUBREG_P (mem)) - mem = SUBREG_REG (mem); - gcc_assert (MEM_P (mem)); - return volatile_ok || !MEM_VOLATILE_P (mem); -} -/* Return false if INSN contains a MEM with a non-default address space. */ -bool -ix86_check_no_addr_space (rtx insn) -{ - subrtx_var_iterator::array_type array; - FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL) - { - rtx x = *iter; - if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x))) - return false; - } - return true; -} - -/* Initialize the table of extra 80387 mathematical constants. */ + /* Step 2: compute initial and final value of the loop counter. */ -static void -init_ext_80387_constants (void) -{ - static const char * cst[5] = - { - "0.3010299956639811952256464283594894482", /* 0: fldlg2 */ - "0.6931471805599453094286904741849753009", /* 1: fldln2 */ - "1.4426950408889634073876517827983434472", /* 2: fldl2e */ - "3.3219280948873623478083405569094566090", /* 3: fldl2t */ - "3.1415926535897932385128089594061862044", /* 4: fldpi */ - }; - int i; + /* SP = SP_0 + PROBE_INTERVAL. */ + emit_insn (gen_rtx_SET (stack_pointer_rtx, + plus_constant (Pmode, stack_pointer_rtx, + - (get_probe_interval () + dope)))); - for (i = 0; i < 5; i++) - { - real_from_string (&ext_80387_constants_table[i], cst[i]); - /* Ensure each constant is rounded to XFmode precision. */ - real_convert (&ext_80387_constants_table[i], - XFmode, &ext_80387_constants_table[i]); - } + /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */ + if (rounded_size <= (HOST_WIDE_INT_1 << 31)) + emit_insn (gen_rtx_SET (sr.reg, + plus_constant (Pmode, stack_pointer_rtx, + -rounded_size))); + else + { + emit_move_insn (sr.reg, GEN_INT (-rounded_size)); + emit_insn (gen_rtx_SET (sr.reg, + gen_rtx_PLUS (Pmode, sr.reg, + stack_pointer_rtx))); + } - ext_80387_constants_init = 1; -} -/* Return non-zero if the constant is something that - can be loaded with a special instruction. 
*/ + /* Step 3: the loop -int -standard_80387_constant_p (rtx x) -{ - machine_mode mode = GET_MODE (x); + do + { + SP = SP + PROBE_INTERVAL + probe at SP + } + while (SP != LAST_ADDR) - const REAL_VALUE_TYPE *r; + adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for + values of N from 1 until it is equal to ROUNDED_SIZE. */ - if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode))) - return -1; + emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx)); - if (x == CONST0_RTX (mode)) - return 1; - if (x == CONST1_RTX (mode)) - return 2; - r = CONST_DOUBLE_REAL_VALUE (x); + /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot + assert at compile-time that SIZE is equal to ROUNDED_SIZE. */ - /* For XFmode constants, try to find a special 80387 instruction when - optimizing for size or on those CPUs that benefit from them. */ - if (mode == XFmode - && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS)) - { - int i; + if (size != rounded_size) + { + emit_insn (gen_rtx_SET (stack_pointer_rtx, + plus_constant (Pmode, stack_pointer_rtx, + rounded_size - size))); + emit_stack_probe (stack_pointer_rtx); + } - if (! ext_80387_constants_init) - init_ext_80387_constants (); + /* Adjust back to account for the additional first interval. */ + last = emit_insn (gen_rtx_SET (stack_pointer_rtx, + plus_constant (Pmode, stack_pointer_rtx, + (get_probe_interval () + + dope)))); - for (i = 0; i < 5; i++) - if (real_identical (r, &ext_80387_constants_table[i])) - return i + 3; + /* This does not deallocate the space reserved for the scratch + register. That will be deallocated in the epilogue. */ + release_scratch_register_on_entry (&sr, size, false); } - /* Load of the constant -0.0 or -1.0 will be split as - fldz;fchs or fld1;fchs sequence. */ - if (real_isnegzero (r)) - return 8; - if (real_identical (r, &dconstm1)) - return 9; + /* Even if the stack pointer isn't the CFA register, we need to correctly + describe the adjustments made to it, in particular differentiate the + frame-related ones from the frame-unrelated ones. */ + if (size > 0) + { + rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2)); + XVECEXP (expr, 0, 0) + = gen_rtx_SET (stack_pointer_rtx, + plus_constant (Pmode, stack_pointer_rtx, -size)); + XVECEXP (expr, 0, 1) + = gen_rtx_SET (stack_pointer_rtx, + plus_constant (Pmode, stack_pointer_rtx, + get_probe_interval () + dope + size)); + add_reg_note (last, REG_FRAME_RELATED_EXPR, expr); + RTX_FRAME_RELATED_P (last) = 1; - return 0; + cfun->machine->fs.sp_offset += size; + } + + /* Make sure nothing is scheduled before we are done. */ + emit_insn (gen_blockage ()); } -/* Return the opcode of the special instruction to be used to load - the constant X. */ +/* Adjust the stack pointer up to REG while probing it. */ const char * -standard_80387_constant_opcode (rtx x) +output_adjust_stack_and_probe (rtx reg) { - switch (standard_80387_constant_p (x)) - { - case 1: - return "fldz"; - case 2: - return "fld1"; - case 3: - return "fldlg2"; - case 4: - return "fldln2"; - case 5: - return "fldl2e"; - case 6: - return "fldl2t"; - case 7: - return "fldpi"; - case 8: - case 9: - return "#"; - default: - gcc_unreachable (); - } + static int labelno = 0; + char loop_lab[32]; + rtx xops[2]; + + ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++); + + /* Loop. */ + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab); + + /* SP = SP + PROBE_INTERVAL. 
*/ + xops[0] = stack_pointer_rtx; + xops[1] = GEN_INT (get_probe_interval ()); + output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops); + + /* Probe at SP. */ + xops[1] = const0_rtx; + output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops); + + /* Test if SP == LAST_ADDR. */ + xops[0] = stack_pointer_rtx; + xops[1] = reg; + output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops); + + /* Branch. */ + fputs ("\tjne\t", asm_out_file); + assemble_name_raw (asm_out_file, loop_lab); + fputc ('\n', asm_out_file); + + return ""; } -/* Return the CONST_DOUBLE representing the 80387 constant that is - loaded by the specified special instruction. The argument IDX - matches the return value from standard_80387_constant_p. */ +/* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE, + inclusive. These are offsets from the current stack pointer. -rtx -standard_80387_constant_rtx (int idx) + INT_REGISTERS_SAVED is true if integer registers have already been + pushed on the stack. */ + +static void +ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size, + const bool int_registers_saved) { - int i; + /* See if we have a constant small number of probes to generate. If so, + that's the easy case. The run-time loop is made up of 6 insns in the + generic case while the compile-time loop is made up of n insns for n # + of intervals. */ + if (size <= 6 * get_probe_interval ()) + { + HOST_WIDE_INT i; - if (! ext_80387_constants_init) - init_ext_80387_constants (); + /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until + it exceeds SIZE. If only one probe is needed, this will not + generate any code. Then probe at FIRST + SIZE. */ + for (i = get_probe_interval (); i < size; i += get_probe_interval ()) + emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, + -(first + i))); - switch (idx) + emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, + -(first + size))); + } + + /* Otherwise, do the same as above, but in a loop. Note that we must be + extra careful with variables wrapping around because we might be at + the very top (or the very bottom) of the address space and we have + to be able to handle this case properly; in particular, we use an + equality test for the loop condition. */ + else { - case 3: - case 4: - case 5: - case 6: - case 7: - i = idx - 3; - break; + /* We expect the GP registers to be saved when probes are used + as the probing sequences might need a scratch register and + the routine to allocate one assumes the integer registers + have already been saved. */ + gcc_assert (int_registers_saved); - default: - gcc_unreachable (); - } + HOST_WIDE_INT rounded_size, last; + struct scratch_reg sr; - return const_double_from_real_value (ext_80387_constants_table[i], - XFmode); -} + get_scratch_register_on_entry (&sr); -/* Return 1 if X is all bits 0 and 2 if X is all bits 1 - in supported SSE/AVX vector mode. */ -int -standard_sse_constant_p (rtx x, machine_mode pred_mode) -{ - machine_mode mode; + /* Step 1: round SIZE to the previous multiple of the interval. */ - if (!TARGET_SSE) - return 0; + rounded_size = ROUND_DOWN (size, get_probe_interval ()); - mode = GET_MODE (x); - if (x == const0_rtx || const0_operand (x, mode)) - return 1; + /* Step 2: compute initial and final value of the loop counter. */ - if (x == constm1_rtx || vector_all_ones_operand (x, mode)) - { - /* VOIDmode integer constant, get mode from the predicate. */ - if (mode == VOIDmode) - mode = pred_mode; + /* TEST_OFFSET = FIRST. 
*/ + emit_move_insn (sr.reg, GEN_INT (-first)); - switch (GET_MODE_SIZE (mode)) - { - case 64: - if (TARGET_AVX512F) - return 2; - break; - case 32: - if (TARGET_AVX2) - return 2; - break; - case 16: - if (TARGET_SSE2) - return 2; - break; - case 0: - /* VOIDmode */ - gcc_unreachable (); - default: - break; - } + /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */ + last = first + rounded_size; + + + /* Step 3: the loop + + do + { + TEST_ADDR = TEST_ADDR + PROBE_INTERVAL + probe at TEST_ADDR + } + while (TEST_ADDR != LAST_ADDR) + + probes at FIRST + N * PROBE_INTERVAL for values of N from 1 + until it is equal to ROUNDED_SIZE. */ + + emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last))); + + + /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time + that SIZE is equal to ROUNDED_SIZE. */ + + if (size != rounded_size) + emit_stack_probe (plus_constant (Pmode, + gen_rtx_PLUS (Pmode, + stack_pointer_rtx, + sr.reg), + rounded_size - size)); + + release_scratch_register_on_entry (&sr, size, true); } - return 0; + /* Make sure nothing is scheduled before we are done. */ + emit_insn (gen_blockage ()); } -/* Return the opcode of the special instruction to be used to load - the constant operands[1] into operands[0]. */ +/* Probe a range of stack addresses from REG to END, inclusive. These are + offsets from the current stack pointer. */ const char * -standard_sse_constant_opcode (rtx_insn *insn, rtx *operands) +output_probe_stack_range (rtx reg, rtx end) { - machine_mode mode; - rtx x = operands[1]; + static int labelno = 0; + char loop_lab[32]; + rtx xops[3]; - gcc_assert (TARGET_SSE); + ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++); - mode = GET_MODE (x); - - if (x == const0_rtx || const0_operand (x, mode)) - { - switch (get_attr_mode (insn)) - { - case MODE_TI: - if (!EXT_REX_SSE_REG_P (operands[0])) - return "%vpxor\t%0, %d0"; - /* FALLTHRU */ - case MODE_XI: - case MODE_OI: - if (EXT_REX_SSE_REG_P (operands[0])) - return (TARGET_AVX512VL - ? "vpxord\t%x0, %x0, %x0" - : "vpxord\t%g0, %g0, %g0"); - return "vpxor\t%x0, %x0, %x0"; - - case MODE_V2DF: - if (!EXT_REX_SSE_REG_P (operands[0])) - return "%vxorpd\t%0, %d0"; - /* FALLTHRU */ - case MODE_V8DF: - case MODE_V4DF: - if (!EXT_REX_SSE_REG_P (operands[0])) - return "vxorpd\t%x0, %x0, %x0"; - else if (TARGET_AVX512DQ) - return (TARGET_AVX512VL - ? "vxorpd\t%x0, %x0, %x0" - : "vxorpd\t%g0, %g0, %g0"); - else - return (TARGET_AVX512VL - ? "vpxorq\t%x0, %x0, %x0" - : "vpxorq\t%g0, %g0, %g0"); + /* Loop. */ + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab); - case MODE_V4SF: - if (!EXT_REX_SSE_REG_P (operands[0])) - return "%vxorps\t%0, %d0"; - /* FALLTHRU */ - case MODE_V16SF: - case MODE_V8SF: - if (!EXT_REX_SSE_REG_P (operands[0])) - return "vxorps\t%x0, %x0, %x0"; - else if (TARGET_AVX512DQ) - return (TARGET_AVX512VL - ? "vxorps\t%x0, %x0, %x0" - : "vxorps\t%g0, %g0, %g0"); - else - return (TARGET_AVX512VL - ? "vpxord\t%x0, %x0, %x0" - : "vpxord\t%g0, %g0, %g0"); + /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */ + xops[0] = reg; + xops[1] = GEN_INT (get_probe_interval ()); + output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops); - default: - gcc_unreachable (); - } - } - else if (x == constm1_rtx || vector_all_ones_operand (x, mode)) - { - enum attr_mode insn_mode = get_attr_mode (insn); - - switch (insn_mode) - { - case MODE_XI: - case MODE_V8DF: - case MODE_V16SF: - gcc_assert (TARGET_AVX512F); - return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}"; + /* Probe at TEST_ADDR. 
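Expressed as ordinary C, the sub/or/cmp/jne loop emitted here behaves as below. This is only a behavioural sketch: the real probe is an or of zero into a word at the probed address, the interval comes from get_probe_interval (), and the buffer in main merely stands in for the stack pages being committed.

// Run-time behaviour of the emitted probe loop, modelled in C.
// test_addr plays the role of the scratch register, last_addr the loop bound.
#include <stdio.h>

#define PROBE_INTERVAL 4096L

static void probe_loop_model (char *start, char *last_addr)
{
  char *test_addr = start;
  do
    {
      test_addr -= PROBE_INTERVAL;          // move one interval further down the stack
      *(volatile char *) test_addr |= 0;    // the or-with-zero probe touches the page
    }
  while (test_addr != last_addr);           // equality, not <, to tolerate wrap-around
}

int main (void)
{
  static char stack_image[4 * 4096];        // pretend stack region for the demo
  probe_loop_model (stack_image + sizeof stack_image, stack_image);
  puts ("probed 4 pages");
  return 0;
}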
*/ + xops[0] = stack_pointer_rtx; + xops[1] = reg; + xops[2] = const0_rtx; + output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops); - case MODE_OI: - case MODE_V4DF: - case MODE_V8SF: - gcc_assert (TARGET_AVX2); - /* FALLTHRU */ - case MODE_TI: - case MODE_V2DF: - case MODE_V4SF: - gcc_assert (TARGET_SSE2); - if (!EXT_REX_SSE_REG_P (operands[0])) - return (TARGET_AVX - ? "vpcmpeqd\t%0, %0, %0" - : "pcmpeqd\t%0, %0"); - else if (TARGET_AVX512VL) - return "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"; - else - return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}"; + /* Test if TEST_ADDR == LAST_ADDR. */ + xops[0] = reg; + xops[1] = end; + output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops); - default: - gcc_unreachable (); - } - } + /* Branch. */ + fputs ("\tjne\t", asm_out_file); + assemble_name_raw (asm_out_file, loop_lab); + fputc ('\n', asm_out_file); - gcc_unreachable (); + return ""; } -/* Returns true if INSN can be transformed from a memory load - to a supported FP constant load. */ +/* Return true if stack frame is required. Update STACK_ALIGNMENT + to the largest alignment, in bits, of stack slot used if stack + frame is required and CHECK_STACK_SLOT is true. */ -bool -ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst) +static bool +ix86_find_max_used_stack_alignment (unsigned int &stack_alignment, + bool check_stack_slot) { - rtx src = find_constant_src (insn); - - gcc_assert (REG_P (dst)); - - if (src == NULL - || (SSE_REGNO_P (REGNO (dst)) - && standard_sse_constant_p (src, GET_MODE (dst)) != 1) - || (STACK_REGNO_P (REGNO (dst)) - && standard_80387_constant_p (src) < 1)) - return false; - - return true; -} + HARD_REG_SET set_up_by_prologue, prologue_used; + basic_block bb; -/* Returns true if OP contains a symbol reference */ + CLEAR_HARD_REG_SET (prologue_used); + CLEAR_HARD_REG_SET (set_up_by_prologue); + add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM); + add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM); + add_to_hard_reg_set (&set_up_by_prologue, Pmode, + HARD_FRAME_POINTER_REGNUM); -bool -symbolic_reference_mentioned_p (rtx op) -{ - const char *fmt; - int i; + /* The preferred stack alignment is the minimum stack alignment. */ + if (stack_alignment > crtl->preferred_stack_boundary) + stack_alignment = crtl->preferred_stack_boundary; - if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF) - return true; + bool require_stack_frame = false; - fmt = GET_RTX_FORMAT (GET_CODE (op)); - for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--) + FOR_EACH_BB_FN (bb, cfun) { - if (fmt[i] == 'E') - { - int j; - - for (j = XVECLEN (op, i) - 1; j >= 0; j--) - if (symbolic_reference_mentioned_p (XVECEXP (op, i, j))) - return true; - } + rtx_insn *insn; + FOR_BB_INSNS (bb, insn) + if (NONDEBUG_INSN_P (insn) + && requires_stack_frame_p (insn, prologue_used, + set_up_by_prologue)) + { + require_stack_frame = true; - else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i))) - return true; + if (check_stack_slot) + { + /* Find the maximum stack alignment. 
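The scan just below keeps the largest MEM_ALIGN seen on any stack- or frame-pointer relative access in insns that need a stack frame, after first clamping the running value to the preferred boundary. The same bookkeeping in standalone form, with illustrative records instead of insns and rtl:

// Standalone sketch of the max-used-alignment scan (illustrative data only).
#include <stdio.h>
#include <stdbool.h>

struct stack_access { bool needs_frame; unsigned mem_align_bits; };

static bool find_max_used_alignment (const struct stack_access *acc, int n,
                                     unsigned preferred_boundary,
                                     unsigned *stack_alignment)
{
  bool require_stack_frame = false;

  // Start from at most the preferred boundary, then raise it per access.
  if (*stack_alignment > preferred_boundary)
    *stack_alignment = preferred_boundary;

  for (int i = 0; i < n; i++)
    if (acc[i].needs_frame)
      {
        require_stack_frame = true;
        if (acc[i].mem_align_bits > *stack_alignment)
          *stack_alignment = acc[i].mem_align_bits;
      }
  return require_stack_frame;
}

int main (void)
{
  const struct stack_access accs[] = { { true, 64 }, { true, 256 }, { false, 0 } };
  unsigned align = 512;
  bool frame = find_max_used_alignment (accs, 3, 128, &align);
  printf ("frame needed: %d, alignment: %u bits\n", (int) frame, align);
  return 0;
}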
*/ + subrtx_iterator::array_type array; + FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL) + if (MEM_P (*iter) + && (reg_mentioned_p (stack_pointer_rtx, + *iter) + || reg_mentioned_p (frame_pointer_rtx, + *iter))) + { + unsigned int alignment = MEM_ALIGN (*iter); + if (alignment > stack_alignment) + stack_alignment = alignment; + } + } + } } - return false; + return require_stack_frame; } -/* Return true if it is appropriate to emit `ret' instructions in the - body of a function. Do this only if the epilogue is simple, needing a - couple of insns. Prior to reloading, we can't tell how many registers - must be saved, so return false then. Return false if there is no frame - marker to de-allocate. */ - -bool -ix86_can_use_return_insn_p (void) -{ - if (ix86_function_naked (current_function_decl)) - return false; - - /* Don't use `ret' instruction in interrupt handler. */ - if (! reload_completed - || frame_pointer_needed - || cfun->machine->func_type != TYPE_NORMAL) - return 0; - - /* Don't allow more than 32k pop, since that's all we can do - with one instruction. */ - if (crtl->args.pops_args && crtl->args.size >= 32768) - return 0; - - struct ix86_frame &frame = cfun->machine->frame; - return (frame.stack_pointer_offset == UNITS_PER_WORD - && (frame.nregs + frame.nsseregs) == 0); -} - -/* Value should be nonzero if functions must have frame pointers. - Zero means the frame pointer need not be set up (and parms may - be accessed via the stack pointer) in functions that seem suitable. */ +/* Finalize stack_realign_needed and frame_pointer_needed flags, which + will guide prologue/epilogue to be generated in correct form. */ -static bool -ix86_frame_pointer_required (void) +static void +ix86_finalize_stack_frame_flags (void) { - /* If we accessed previous frames, then the generated code expects - to be able to access the saved ebp value in our frame. */ - if (cfun->machine->accesses_prev_frame) - return true; - - /* Several x86 os'es need a frame pointer for other reasons, - usually pertaining to setjmp. */ - if (SUBTARGET_FRAME_POINTER_REQUIRED) - return true; + /* Check if stack realign is really needed after reload, and + stores result in cfun */ + unsigned int incoming_stack_boundary + = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary + ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary); + unsigned int stack_alignment + = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor + ? crtl->max_used_stack_slot_alignment + : crtl->stack_alignment_needed); + unsigned int stack_realign + = (incoming_stack_boundary < stack_alignment); + bool recompute_frame_layout_p = false; - /* For older 32-bit runtimes setjmp requires valid frame-pointer. */ - if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp) - return true; + if (crtl->stack_realign_finalized) + { + /* After stack_realign_needed is finalized, we can't no longer + change it. */ + gcc_assert (crtl->stack_realign_needed == stack_realign); + return; + } - /* Win64 SEH, very large frames need a frame-pointer as maximum stack - allocation is 4GB. */ - if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE) - return true; - - /* SSE saves require frame-pointer when stack is misaligned. */ - if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128) - return true; - - /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER - turns off the frame pointer by default. Turn it back on now if - we've not got a leaf function. 
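The three values computed at the top of ix86_finalize_stack_frame_flags decide whether realignment is considered at all; restated as a small standalone function, with boundaries in bits as in the source and illustrative names and example values:

// Sketch of the realignment decision made above.
#include <stdio.h>
#include <stdbool.h>

static bool needs_stack_realign (unsigned parm_boundary,
                                 unsigned incoming_boundary,
                                 bool leaf_no_tls_descriptor,
                                 unsigned max_used_slot_alignment,
                                 unsigned alignment_needed)
{
  // The incoming boundary is the stricter of what the ABI guarantees for
  // this function and what its parameters require.
  unsigned incoming = parm_boundary > incoming_boundary
                      ? parm_boundary : incoming_boundary;
  // Leaf functions (that do not call the TLS descriptor) only care about
  // the alignment of the stack slots they actually use.
  unsigned needed = leaf_no_tls_descriptor
                    ? max_used_slot_alignment : alignment_needed;
  return incoming < needed;   // realign only when the guarantee is too weak
}

int main (void)
{
  // e.g. 128-bit incoming boundary but a 256-bit (AVX) spill slot in use.
  printf ("realign: %d\n", (int) needs_stack_realign (128, 128, true, 256, 256));
  return 0;
}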
*/ - if (TARGET_OMIT_LEAF_FRAME_POINTER - && (!crtl->is_leaf - || ix86_current_function_calls_tls_descriptor)) - return true; - - if (crtl->profile && !flag_fentry) - return true; - - return false; -} + /* If the only reason for frame_pointer_needed is that we conservatively + assumed stack realignment might be needed or -fno-omit-frame-pointer + is used, but in the end nothing that needed the stack alignment had + been spilled nor stack access, clear frame_pointer_needed and say we + don't need stack realignment. */ + if ((stack_realign || (!flag_omit_frame_pointer && optimize)) + && frame_pointer_needed + && crtl->is_leaf + && crtl->sp_is_unchanging + && !ix86_current_function_calls_tls_descriptor + && !crtl->accesses_prior_frames + && !cfun->calls_alloca + && !crtl->calls_eh_return + /* See ira_setup_eliminable_regset for the rationale. */ + && !(STACK_CHECK_MOVING_SP + && flag_stack_check + && flag_exceptions + && cfun->can_throw_non_call_exceptions) + && !ix86_frame_pointer_required () + && get_frame_size () == 0 + && ix86_nsaved_sseregs () == 0 + && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0) + { + if (ix86_find_max_used_stack_alignment (stack_alignment, + stack_realign)) + { + /* Stack frame is required. If stack alignment needed is less + than incoming stack boundary, don't realign stack. */ + stack_realign = incoming_stack_boundary < stack_alignment; + if (!stack_realign) + { + crtl->max_used_stack_slot_alignment + = incoming_stack_boundary; + crtl->stack_alignment_needed + = incoming_stack_boundary; + /* Also update preferred_stack_boundary for leaf + functions. */ + crtl->preferred_stack_boundary + = incoming_stack_boundary; + } + } + else + { + /* If drap has been set, but it actually isn't live at the + start of the function, there is no reason to set it up. */ + if (crtl->drap_reg) + { + basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; + if (! REGNO_REG_SET_P (DF_LR_IN (bb), + REGNO (crtl->drap_reg))) + { + crtl->drap_reg = NULL_RTX; + crtl->need_drap = false; + } + } + else + cfun->machine->no_drap_save_restore = true; -/* Record that the current function accesses previous call frames. */ + frame_pointer_needed = false; + stack_realign = false; + crtl->max_used_stack_slot_alignment = incoming_stack_boundary; + crtl->stack_alignment_needed = incoming_stack_boundary; + crtl->stack_alignment_estimated = incoming_stack_boundary; + if (crtl->preferred_stack_boundary > incoming_stack_boundary) + crtl->preferred_stack_boundary = incoming_stack_boundary; + df_finish_pass (true); + df_scan_alloc (NULL); + df_scan_blocks (); + df_compute_regs_ever_live (true); + df_analyze (); -void -ix86_setup_frame_addresses (void) -{ - cfun->machine->accesses_prev_frame = 1; -} - -#ifndef USE_HIDDEN_LINKONCE -# if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0) -# define USE_HIDDEN_LINKONCE 1 -# else -# define USE_HIDDEN_LINKONCE 0 -# endif -#endif + if (flag_var_tracking) + { + /* Since frame pointer is no longer available, replace it with + stack pointer - UNITS_PER_WORD in debug insns. */ + df_ref ref, next; + for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM); + ref; ref = next) + { + next = DF_REF_NEXT_REG (ref); + if (!DF_REF_INSN_INFO (ref)) + continue; -/* Label count for call and return thunks. It is used to make unique - labels in call and return thunks. */ -static int indirectlabelno; + /* Make sure the next ref is for a different instruction, + so that we're not affected by the rescan. 
*/ + rtx_insn *insn = DF_REF_INSN (ref); + while (next && DF_REF_INSN (next) == insn) + next = DF_REF_NEXT_REG (next); -/* True if call thunk function is needed. */ -static bool indirect_thunk_needed = false; + if (DEBUG_INSN_P (insn)) + { + bool changed = false; + for (; ref != next; ref = DF_REF_NEXT_REG (ref)) + { + rtx *loc = DF_REF_LOC (ref); + if (*loc == hard_frame_pointer_rtx) + { + *loc = plus_constant (Pmode, + stack_pointer_rtx, + -UNITS_PER_WORD); + changed = true; + } + } + if (changed) + df_insn_rescan (insn); + } + } + } -/* Bit masks of integer registers, which contain branch target, used - by call thunk functions. */ -static int indirect_thunks_used; + recompute_frame_layout_p = true; + } + } + else if (crtl->max_used_stack_slot_alignment >= 128) + { + /* We don't need to realign stack. max_used_stack_alignment is + used to decide how stack frame should be aligned. This is + independent of any psABIs nor 32-bit vs 64-bit. It is always + safe to compute max_used_stack_alignment. We compute it only + if 128-bit aligned load/store may be generated on misaligned + stack slot which will lead to segfault. */ + if (ix86_find_max_used_stack_alignment (stack_alignment, true)) + cfun->machine->max_used_stack_alignment + = stack_alignment / BITS_PER_UNIT; + } -/* True if return thunk function is needed. */ -static bool indirect_return_needed = false; + if (crtl->stack_realign_needed != stack_realign) + recompute_frame_layout_p = true; + crtl->stack_realign_needed = stack_realign; + crtl->stack_realign_finalized = true; + if (recompute_frame_layout_p) + ix86_compute_frame_layout (); +} -/* True if return thunk function via CX is needed. */ -static bool indirect_return_via_cx; +/* Delete SET_GOT right after entry block if it is allocated to reg. */ -#ifndef INDIRECT_LABEL -# define INDIRECT_LABEL "LIND" -#endif +static void +ix86_elim_entry_set_got (rtx reg) +{ + basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; + rtx_insn *c_insn = BB_HEAD (bb); + if (!NONDEBUG_INSN_P (c_insn)) + c_insn = next_nonnote_nondebug_insn (c_insn); + if (c_insn && NONJUMP_INSN_P (c_insn)) + { + rtx pat = PATTERN (c_insn); + if (GET_CODE (pat) == PARALLEL) + { + rtx vec = XVECEXP (pat, 0, 0); + if (GET_CODE (vec) == SET + && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT + && REGNO (XEXP (vec, 0)) == REGNO (reg)) + delete_insn (c_insn); + } + } +} -/* Indicate what prefix is needed for an indirect branch. */ -enum indirect_thunk_prefix +static rtx +gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store) { - indirect_thunk_prefix_none, - indirect_thunk_prefix_nt -}; + rtx addr, mem; -/* Return the prefix needed for an indirect branch INSN. */ + if (offset) + addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset)); + mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg); + return gen_rtx_SET (store ? mem : reg, store ? reg : mem); +} -enum indirect_thunk_prefix -indirect_thunk_need_prefix (rtx_insn *insn) +static inline rtx +gen_frame_load (rtx reg, rtx frame_reg, int offset) { - enum indirect_thunk_prefix need_prefix; - if ((cfun->machine->indirect_branch_type - == indirect_branch_thunk_extern) - && ix86_notrack_prefixed_insn_p (insn)) - { - /* NOTRACK prefix is only used with external thunk so that it - can be properly updated to support CET at run-time. 
*/ - need_prefix = indirect_thunk_prefix_nt; - } - else - need_prefix = indirect_thunk_prefix_none; - return need_prefix; + return gen_frame_set (reg, frame_reg, offset, false); } -/* Fills in the label name that should be used for the indirect thunk. */ +static inline rtx +gen_frame_store (rtx reg, rtx frame_reg, int offset) +{ + return gen_frame_set (reg, frame_reg, offset, true); +} static void -indirect_thunk_name (char name[32], unsigned int regno, - enum indirect_thunk_prefix need_prefix, - bool ret_p) +ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame) { - if (regno != INVALID_REGNUM && regno != CX_REG && ret_p) - gcc_unreachable (); + struct machine_function *m = cfun->machine; + const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS + + m->call_ms2sysv_extra_regs; + rtvec v = rtvec_alloc (ncregs + 1); + unsigned int align, i, vi = 0; + rtx_insn *insn; + rtx sym, addr; + rtx rax = gen_rtx_REG (word_mode, AX_REG); + const struct xlogue_layout &xlogue = xlogue_layout::get_instance (); - if (USE_HIDDEN_LINKONCE) - { - const char *prefix; + /* AL should only be live with sysv_abi. */ + gcc_assert (!ix86_eax_live_at_start_p ()); + gcc_assert (m->fs.sp_offset >= frame.sse_reg_save_offset); - if (need_prefix == indirect_thunk_prefix_nt - && regno != INVALID_REGNUM) - { - /* NOTRACK prefix is only used with external thunk via - register so that NOTRACK prefix can be added to indirect - branch via register to support CET at run-time. */ - prefix = "_nt"; - } - else - prefix = ""; + /* Setup RAX as the stub's base pointer. We use stack_realign_offset rather + we've actually realigned the stack or not. */ + align = GET_MODE_ALIGNMENT (V4SFmode); + addr = choose_baseaddr (frame.stack_realign_offset + + xlogue.get_stub_ptr_offset (), &align, AX_REG); + gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode)); - const char *ret = ret_p ? "return" : "indirect"; + emit_insn (gen_rtx_SET (rax, addr)); - if (regno != INVALID_REGNUM) - { - const char *reg_prefix; - if (LEGACY_INT_REGNO_P (regno)) - reg_prefix = TARGET_64BIT ? "r" : "e"; - else - reg_prefix = ""; - sprintf (name, "__x86_%s_thunk%s_%s%s", - ret, prefix, reg_prefix, reg_names[regno]); - } - else - sprintf (name, "__x86_%s_thunk%s", ret, prefix); - } - else + /* Get the stub symbol. */ + sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP + : XLOGUE_STUB_SAVE); + RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym); + + for (i = 0; i < ncregs; ++i) { - if (regno != INVALID_REGNUM) - ASM_GENERATE_INTERNAL_LABEL (name, "LITR", regno); - else - { - if (ret_p) - ASM_GENERATE_INTERNAL_LABEL (name, "LRT", 0); - else - ASM_GENERATE_INTERNAL_LABEL (name, "LIT", 0); - } + const xlogue_layout::reginfo &r = xlogue.get_reginfo (i); + rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode), + r.regno); + RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset); } -} -/* Output a call and return thunk for indirect branch. If REGNO != -1, - the function address is in REGNO and the call and return thunk looks like: - - call L2 - L1: - pause - lfence - jmp L1 - L2: - mov %REG, (%sp) - ret + gcc_assert (vi == (unsigned)GET_NUM_ELEM (v)); - Otherwise, the function address is on the top of stack and the - call and return thunk looks like: + insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v)); + RTX_FRAME_RELATED_P (insn) = true; +} - call L2 - L1: - pause - lfence - jmp L1 - L2: - lea WORD_SIZE(%sp), %sp - ret - */ +/* Expand the prologue into a bunch of separate insns. 
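ix86_emit_outlined_ms2sysv_save emits no individual saves itself; it builds one PARALLEL holding a USE of the save stub symbol plus one frame store per clobbered register, each at a fixed negative offset from the stub pointer in %rax, in V4SFmode for SSE registers and word_mode for the rest. A rough picture of that layout follows; the register subset and the offsets are hypothetical, purely to show the shape, since the real data comes from xlogue_layout.

// Hypothetical illustration of the out-of-line save area layout.
#include <stdio.h>

struct reginfo { const char *name; int is_sse; unsigned offset; };

int main (void)
{
  // Made-up subset and offsets; compare xlogue_layout::get_reginfo.
  static const struct reginfo regs[] = {
    { "rsi",  0, 0x08 }, { "rdi",  0, 0x10 },
    { "xmm6", 1, 0x20 }, { "xmm7", 1, 0x30 },
  };
  puts ("use <save-stub symbol>");        // first element of the PARALLEL
  for (unsigned i = 0; i < sizeof regs / sizeof regs[0]; i++)
    printf ("store %-4s (%s) at [rax - %#x]\n", regs[i].name,
            regs[i].is_sse ? "V4SFmode" : "word_mode", regs[i].offset);
  return 0;
}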
*/ -static void -output_indirect_thunk (unsigned int regno) +void +ix86_expand_prologue (void) { - char indirectlabel1[32]; - char indirectlabel2[32]; + struct machine_function *m = cfun->machine; + rtx insn, t; + HOST_WIDE_INT allocate; + bool int_registers_saved; + bool sse_registers_saved; + bool save_stub_call_needed; + rtx static_chain = NULL_RTX; - ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, INDIRECT_LABEL, - indirectlabelno++); - ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, INDIRECT_LABEL, - indirectlabelno++); + if (ix86_function_naked (current_function_decl)) + return; - /* Call */ - fputs ("\tcall\t", asm_out_file); - assemble_name_raw (asm_out_file, indirectlabel2); - fputc ('\n', asm_out_file); + ix86_finalize_stack_frame_flags (); - ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1); + /* DRAP should not coexist with stack_realign_fp */ + gcc_assert (!(crtl->drap_reg && stack_realign_fp)); - /* AMD and Intel CPUs prefer each a different instruction as loop filler. - Usage of both pause + lfence is compromise solution. */ - fprintf (asm_out_file, "\tpause\n\tlfence\n"); + memset (&m->fs, 0, sizeof (m->fs)); - /* Jump. */ - fputs ("\tjmp\t", asm_out_file); - assemble_name_raw (asm_out_file, indirectlabel1); - fputc ('\n', asm_out_file); + /* Initialize CFA state for before the prologue. */ + m->fs.cfa_reg = stack_pointer_rtx; + m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET; - ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2); + /* Track SP offset to the CFA. We continue tracking this after we've + swapped the CFA register away from SP. In the case of re-alignment + this is fudged; we're interested to offsets within the local frame. */ + m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET; + m->fs.sp_valid = true; + m->fs.sp_realigned = false; - /* The above call insn pushed a word to stack. Adjust CFI info. */ - if (flag_asynchronous_unwind_tables && dwarf2out_do_frame ()) - { - if (! dwarf2out_do_cfi_asm ()) - { - dw_cfi_ref xcfi = ggc_cleared_alloc (); - xcfi->dw_cfi_opc = DW_CFA_advance_loc4; - xcfi->dw_cfi_oprnd1.dw_cfi_addr = ggc_strdup (indirectlabel2); - vec_safe_push (cfun->fde->dw_fde_cfi, xcfi); - } - dw_cfi_ref xcfi = ggc_cleared_alloc (); - xcfi->dw_cfi_opc = DW_CFA_def_cfa_offset; - xcfi->dw_cfi_oprnd1.dw_cfi_offset = 2 * UNITS_PER_WORD; - vec_safe_push (cfun->fde->dw_fde_cfi, xcfi); - dwarf2out_emit_cfi (xcfi); - } + const struct ix86_frame &frame = cfun->machine->frame; - if (regno != INVALID_REGNUM) - { - /* MOV. */ - rtx xops[2]; - xops[0] = gen_rtx_MEM (word_mode, stack_pointer_rtx); - xops[1] = gen_rtx_REG (word_mode, regno); - output_asm_insn ("mov\t{%1, %0|%0, %1}", xops); - } - else + if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl)) { - /* LEA. */ - rtx xops[2]; - xops[0] = stack_pointer_rtx; - xops[1] = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); - output_asm_insn ("lea\t{%E1, %0|%0, %E1}", xops); - } + /* We should have already generated an error for any use of + ms_hook on a nested function. */ + gcc_checking_assert (!ix86_static_chain_on_stack); - fputs ("\tret\n", asm_out_file); -} + /* Check if profiling is active and we shall use profiling before + prologue variant. If so sorry. */ + if (crtl->profile && flag_fentry != 0) + sorry ("ms_hook_prologue attribute isn%'t compatible " + "with %<-mfentry%> for 32-bit"); -/* Output a funtion with a call and return thunk for indirect branch. - If REGNO != INVALID_REGNUM, the function address is in REGNO. - Otherwise, the function address is on the top of stack. 
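Everything the prologue emits below has to keep the m->fs bookkeeping initialized here consistent: sp_offset tracks how far SP has moved from its incoming value, and cfa_offset tracks the CFA while SP is still the CFA register. A tiny standalone model of that state, assuming a 64-bit word; the struct and function names are illustrative.

// Minimal model of the offsets tracked in m->fs (units are bytes).
#include <stdio.h>

#define UNITS_PER_WORD 8
#define INCOMING_FRAME_SP_OFFSET UNITS_PER_WORD   // the call pushed a return address

struct frame_state { long cfa_offset, sp_offset, fp_offset; int fp_valid; };

static void model_push (struct frame_state *fs)
{
  // A push moves SP down one word; while SP is still the CFA register the
  // CFA distance grows together with sp_offset (compare gen_push).
  fs->cfa_offset += UNITS_PER_WORD;
  fs->sp_offset += UNITS_PER_WORD;
}

int main (void)
{
  struct frame_state fs = { INCOMING_FRAME_SP_OFFSET, INCOMING_FRAME_SP_OFFSET, 0, 0 };
  model_push (&fs);                 // push %rbp
  fs.fp_offset = fs.sp_offset;      // mov %rsp, %rbp: frame pointer becomes valid here
  fs.fp_valid = 1;
  printf ("cfa_offset=%ld sp_offset=%ld fp_offset=%ld fp_valid=%d\n",
          fs.cfa_offset, fs.sp_offset, fs.fp_offset, fs.fp_valid);
  return 0;
}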
Thunk is - used for function return if RET_P is true. */ + /* In ix86_asm_output_function_label we emitted: + 8b ff movl.s %edi,%edi + 55 push %ebp + 8b ec movl.s %esp,%ebp -static void -output_indirect_thunk_function (enum indirect_thunk_prefix need_prefix, - unsigned int regno, bool ret_p) -{ - char name[32]; - tree decl; + This matches the hookable function prologue in Win32 API + functions in Microsoft Windows XP Service Pack 2 and newer. + Wine uses this to enable Windows apps to hook the Win32 API + functions provided by Wine. - /* Create __x86_indirect_thunk. */ - indirect_thunk_name (name, regno, need_prefix, ret_p); - decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, - get_identifier (name), - build_function_type_list (void_type_node, NULL_TREE)); - DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL, - NULL_TREE, void_type_node); - TREE_PUBLIC (decl) = 1; - TREE_STATIC (decl) = 1; - DECL_IGNORED_P (decl) = 1; + What that means is that we've already set up the frame pointer. */ -#if TARGET_MACHO - if (TARGET_MACHO) - { - switch_to_section (darwin_sections[picbase_thunk_section]); - fputs ("\t.weak_definition\t", asm_out_file); - assemble_name (asm_out_file, name); - fputs ("\n\t.private_extern\t", asm_out_file); - assemble_name (asm_out_file, name); - putc ('\n', asm_out_file); - ASM_OUTPUT_LABEL (asm_out_file, name); - DECL_WEAK (decl) = 1; - } - else -#endif - if (USE_HIDDEN_LINKONCE) - { - cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl)); + if (frame_pointer_needed + && !(crtl->drap_reg && crtl->stack_realign_needed)) + { + rtx push, mov; - targetm.asm_out.unique_section (decl, 0); - switch_to_section (get_named_section (decl, NULL, 0)); + /* We've decided to use the frame pointer already set up. + Describe this to the unwinder by pretending that both + push and mov insns happen right here. - targetm.asm_out.globalize_label (asm_out_file, name); - fputs ("\t.hidden\t", asm_out_file); - assemble_name (asm_out_file, name); - putc ('\n', asm_out_file); - ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl); - } - else - { - switch_to_section (text_section); - ASM_OUTPUT_LABEL (asm_out_file, name); - } + Putting the unwind info here at the end of the ms_hook + is done so that we can make absolutely certain we get + the required byte sequence at the start of the function, + rather than relying on an assembler that can produce + the exact encoding required. - DECL_INITIAL (decl) = make_node (BLOCK); - current_function_decl = decl; - allocate_struct_function (decl, false); - init_function_start (decl); - /* We're about to hide the function body from callees of final_* by - emitting it directly; tell them we're a thunk, if they care. */ - cfun->is_thunk = true; - first_function_block_is_cold = false; - /* Make sure unwind info is emitted for the thunk if needed. */ - final_start_function (emit_barrier (), asm_out_file, 1); + However it does mean (in the unpatched case) that we have + a 1 insn window where the asynchronous unwind info is + incorrect. However, if we placed the unwind info at + its correct location we would have incorrect unwind info + in the patched case. Which is probably all moot since + I don't expect Wine generates dwarf2 unwind info for the + system libraries that use this feature. 
*/ - output_indirect_thunk (regno); + insn = emit_insn (gen_blockage ()); - final_end_function (); - init_insn_lengths (); - free_after_compilation (cfun); - set_cfun (NULL); - current_function_decl = NULL; -} + push = gen_push (hard_frame_pointer_rtx); + mov = gen_rtx_SET (hard_frame_pointer_rtx, + stack_pointer_rtx); + RTX_FRAME_RELATED_P (push) = 1; + RTX_FRAME_RELATED_P (mov) = 1; -static int pic_labels_used; + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_FRAME_RELATED_EXPR, + gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov))); -/* Fills in the label name that should be used for a pc thunk for - the given register. */ + /* Note that gen_push incremented m->fs.cfa_offset, even + though we didn't emit the push insn here. */ + m->fs.cfa_reg = hard_frame_pointer_rtx; + m->fs.fp_offset = m->fs.cfa_offset; + m->fs.fp_valid = true; + } + else + { + /* The frame pointer is not needed so pop %ebp again. + This leaves us with a pristine state. */ + emit_insn (gen_pop (hard_frame_pointer_rtx)); + } + } -static void -get_pc_thunk_name (char name[32], unsigned int regno) -{ - gcc_assert (!TARGET_64BIT); + /* The first insn of a function that accepts its static chain on the + stack is to push the register that would be filled in by a direct + call. This insn will be skipped by the trampoline. */ + else if (ix86_static_chain_on_stack) + { + static_chain = ix86_static_chain (cfun->decl, false); + insn = emit_insn (gen_push (static_chain)); + emit_insn (gen_blockage ()); - if (USE_HIDDEN_LINKONCE) - sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]); - else - ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno); -} + /* We don't want to interpret this push insn as a register save, + only as a stack adjustment. The real copy of the register as + a save will be done later, if needed. */ + t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD); + t = gen_rtx_SET (stack_pointer_rtx, t); + add_reg_note (insn, REG_CFA_ADJUST_CFA, t); + RTX_FRAME_RELATED_P (insn) = 1; + } + /* Emit prologue code to adjust stack alignment and setup DRAP, in case + of DRAP is needed and stack realignment is really needed after reload */ + if (stack_realign_drap) + { + int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT; -/* This function generates code for -fpic that loads %ebx with - the return address of the caller and then returns. */ + /* Can't use DRAP in interrupt function. */ + if (cfun->machine->func_type != TYPE_NORMAL) + sorry ("Dynamic Realign Argument Pointer (DRAP) not supported " + "in interrupt service routine. This may be worked " + "around by avoiding functions with aggregate return."); -static void -ix86_code_end (void) -{ - rtx xops[2]; - unsigned int regno; + /* Only need to push parameter pointer reg if it is caller saved. */ + if (!call_used_regs[REGNO (crtl->drap_reg)]) + { + /* Push arg pointer reg */ + insn = emit_insn (gen_push (crtl->drap_reg)); + RTX_FRAME_RELATED_P (insn) = 1; + } - if (indirect_return_needed) - output_indirect_thunk_function (indirect_thunk_prefix_none, - INVALID_REGNUM, true); - if (indirect_return_via_cx) - output_indirect_thunk_function (indirect_thunk_prefix_none, - CX_REG, true); - if (indirect_thunk_needed) - output_indirect_thunk_function (indirect_thunk_prefix_none, - INVALID_REGNUM, false); + /* Grab the argument pointer. 
*/ + t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset); + insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t)); + RTX_FRAME_RELATED_P (insn) = 1; + m->fs.cfa_reg = crtl->drap_reg; + m->fs.cfa_offset = 0; - for (regno = FIRST_REX_INT_REG; regno <= LAST_REX_INT_REG; regno++) - { - unsigned int i = regno - FIRST_REX_INT_REG + LAST_INT_REG + 1; - if ((indirect_thunks_used & (1 << i))) - output_indirect_thunk_function (indirect_thunk_prefix_none, - regno, false); - } + /* Align the stack. */ + insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx, + stack_pointer_rtx, + GEN_INT (-align_bytes))); + RTX_FRAME_RELATED_P (insn) = 1; - for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++) - { - char name[32]; - tree decl; + /* Replicate the return address on the stack so that return + address can be reached via (argp - 1) slot. This is needed + to implement macro RETURN_ADDR_RTX and intrinsic function + expand_builtin_return_addr etc. */ + t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD); + t = gen_frame_mem (word_mode, t); + insn = emit_insn (gen_push (t)); + RTX_FRAME_RELATED_P (insn) = 1; - if ((indirect_thunks_used & (1 << regno))) - output_indirect_thunk_function (indirect_thunk_prefix_none, - regno, false); + /* For the purposes of frame and register save area addressing, + we've started over with a new frame. */ + m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET; + m->fs.realigned = true; - if (!(pic_labels_used & (1 << regno))) - continue; + if (static_chain) + { + /* Replicate static chain on the stack so that static chain + can be reached via (argp - 2) slot. This is needed for + nested function with stack realignment. */ + insn = emit_insn (gen_push (static_chain)); + RTX_FRAME_RELATED_P (insn) = 1; + } + } - get_pc_thunk_name (name, regno); + int_registers_saved = (frame.nregs == 0); + sse_registers_saved = (frame.nsseregs == 0); + save_stub_call_needed = (m->call_ms2sysv); + gcc_assert (sse_registers_saved || !save_stub_call_needed); - decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, - get_identifier (name), - build_function_type_list (void_type_node, NULL_TREE)); - DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL, - NULL_TREE, void_type_node); - TREE_PUBLIC (decl) = 1; - TREE_STATIC (decl) = 1; - DECL_IGNORED_P (decl) = 1; + if (frame_pointer_needed && !m->fs.fp_valid) + { + /* Note: AT&T enter does NOT have reversed args. Enter is probably + slower on all targets. Also sdb didn't like it. */ + insn = emit_insn (gen_push (hard_frame_pointer_rtx)); + RTX_FRAME_RELATED_P (insn) = 1; -#if TARGET_MACHO - if (TARGET_MACHO) + /* Push registers now, before setting the frame pointer + on SEH target. 
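Both this DRAP path and the stack_realign_fp path later on realign by and-ing the stack pointer with the negated alignment, which rounds downwards, the right direction for a downward-growing stack. A one-line standalone illustration with made-up values:

// How the "and $-align_bytes, %sp" realignment rounds the stack pointer
// down to the requested boundary (illustrative values; align must be a
// power of two, as stack_alignment_needed always is).
#include <stdio.h>

int main (void)
{
  unsigned long long sp = 0x7fffffffd9c8ULL;       // example unaligned stack pointer
  unsigned long long align_bytes = 32;             // stack_alignment_needed / BITS_PER_UNIT
  unsigned long long aligned = sp & -align_bytes;  // same effect as the emitted and
  printf ("%#llx -> %#llx\n", sp, aligned);
  return 0;
}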
*/ + if (!int_registers_saved + && TARGET_SEH + && !frame.save_regs_using_mov) { - switch_to_section (darwin_sections[picbase_thunk_section]); - fputs ("\t.weak_definition\t", asm_out_file); - assemble_name (asm_out_file, name); - fputs ("\n\t.private_extern\t", asm_out_file); - assemble_name (asm_out_file, name); - putc ('\n', asm_out_file); - ASM_OUTPUT_LABEL (asm_out_file, name); - DECL_WEAK (decl) = 1; + ix86_emit_save_regs (); + int_registers_saved = true; + gcc_assert (m->fs.sp_offset == frame.reg_save_offset); } - else -#endif - if (USE_HIDDEN_LINKONCE) - { - cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl)); - targetm.asm_out.unique_section (decl, 0); - switch_to_section (get_named_section (decl, NULL, 0)); + if (m->fs.sp_offset == frame.hard_frame_pointer_offset) + { + insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx); + RTX_FRAME_RELATED_P (insn) = 1; - targetm.asm_out.globalize_label (asm_out_file, name); - fputs ("\t.hidden\t", asm_out_file); - assemble_name (asm_out_file, name); - putc ('\n', asm_out_file); - ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl); + if (m->fs.cfa_reg == stack_pointer_rtx) + m->fs.cfa_reg = hard_frame_pointer_rtx; + m->fs.fp_offset = m->fs.sp_offset; + m->fs.fp_valid = true; } - else + } + + if (!int_registers_saved) + { + /* If saving registers via PUSH, do so now. */ + if (!frame.save_regs_using_mov) { - switch_to_section (text_section); - ASM_OUTPUT_LABEL (asm_out_file, name); + ix86_emit_save_regs (); + int_registers_saved = true; + gcc_assert (m->fs.sp_offset == frame.reg_save_offset); } - DECL_INITIAL (decl) = make_node (BLOCK); - current_function_decl = decl; - allocate_struct_function (decl, false); - init_function_start (decl); - /* We're about to hide the function body from callees of final_* by - emitting it directly; tell them we're a thunk, if they care. */ - cfun->is_thunk = true; - first_function_block_is_cold = false; - /* Make sure unwind info is emitted for the thunk if needed. */ - final_start_function (emit_barrier (), asm_out_file, 1); - - /* Pad stack IP move with 4 instructions (two NOPs count - as one instruction). */ - if (TARGET_PAD_SHORT_FUNCTION) + /* When using red zone we may start register saving before allocating + the stack frame saving one cycle of the prologue. However, avoid + doing this if we have to probe the stack; at least on x86_64 the + stack probe can turn into a call that clobbers a red zone location. */ + else if (ix86_using_red_zone () + && (! TARGET_STACK_PROBE + || frame.stack_pointer_offset < CHECK_STACK_LIMIT)) { - int i = 8; - - while (i--) - fputs ("\tnop\n", asm_out_file); + ix86_emit_save_regs_using_mov (frame.reg_save_offset); + int_registers_saved = true; } - - xops[0] = gen_rtx_REG (Pmode, regno); - xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx); - output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops); - output_asm_insn ("%!ret", NULL); - final_end_function (); - init_insn_lengths (); - free_after_compilation (cfun); - set_cfun (NULL); - current_function_decl = NULL; } - if (flag_split_stack) - file_end_indicate_split_stack (); -} + if (stack_realign_fp) + { + int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT; + gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT); -/* Emit code for the SET_GOT patterns. */ + /* Record last valid frame pointer offset. 
*/ + m->fs.sp_realigned_fp_last = frame.reg_save_offset; -const char * -output_set_got (rtx dest, rtx label) -{ - rtx xops[3]; + /* The computation of the size of the re-aligned stack frame means + that we must allocate the size of the register save area before + performing the actual alignment. Otherwise we cannot guarantee + that there's enough storage above the realignment point. */ + allocate = frame.reg_save_offset - m->fs.sp_offset + + frame.stack_realign_allocate; + if (allocate) + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (-allocate), -1, false); - xops[0] = dest; + /* Align the stack. */ + insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx, + stack_pointer_rtx, + GEN_INT (-align_bytes))); + m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes); + m->fs.sp_realigned_offset = m->fs.sp_offset + - frame.stack_realign_allocate; + /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset. + Beyond this point, stack access should be done via choose_baseaddr or + by using sp_valid_at and fp_valid_at to determine the correct base + register. Henceforth, any CFA offset should be thought of as logical + and not physical. */ + gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last); + gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset); + m->fs.sp_realigned = true; - if (TARGET_VXWORKS_RTP && flag_pic) - { - /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */ - xops[2] = gen_rtx_MEM (Pmode, - gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE)); - output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops); + /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which + is needed to describe where a register is saved using a realigned + stack pointer, so we need to invalidate the stack pointer for that + target. */ + if (TARGET_SEH) + m->fs.sp_valid = false; - /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register. - Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as - an unadorned address. */ - xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX); - SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL; - output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops); - return ""; + /* If SP offset is non-immediate after allocation of the stack frame, + then emit SSE saves or stub call prior to allocating the rest of the + stack frame. This is less efficient for the out-of-line stub because + we can't combine allocations across the call barrier, but it's better + than using a scratch register. */ + else if (!x86_64_immediate_operand (GEN_INT (frame.stack_pointer_offset + - m->fs.sp_realigned_offset), + Pmode)) + { + if (!sse_registers_saved) + { + ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset); + sse_registers_saved = true; + } + else if (save_stub_call_needed) + { + ix86_emit_outlined_ms2sysv_save (frame); + save_stub_call_needed = false; + } + } } - xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME); + allocate = frame.stack_pointer_offset - m->fs.sp_offset; - if (flag_pic) + if (flag_stack_usage_info) { - char name[32]; - get_pc_thunk_name (name, REGNO (dest)); - pic_labels_used |= 1 << REGNO (dest); + /* We start to count from ARG_POINTER. */ + HOST_WIDE_INT stack_size = frame.stack_pointer_offset; - xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name)); - xops[2] = gen_rtx_MEM (QImode, xops[2]); - output_asm_insn ("%!call\t%X2", xops); + /* If it was realigned, take into account the fake frame. 
*/ + if (stack_realign_drap) + { + if (ix86_static_chain_on_stack) + stack_size += UNITS_PER_WORD; -#if TARGET_MACHO - /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here. - This is what will be referenced by the Mach-O PIC subsystem. */ - if (machopic_should_output_picbase_label () || !label) - ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME); + if (!call_used_regs[REGNO (crtl->drap_reg)]) + stack_size += UNITS_PER_WORD; - /* When we are restoring the pic base at the site of a nonlocal label, - and we decided to emit the pic base above, we will still output a - local label used for calculating the correction offset (even though - the offset will be 0 in that case). */ - if (label) - targetm.asm_out.internal_label (asm_out_file, "L", - CODE_LABEL_NUMBER (label)); -#endif - } - else - { - if (TARGET_MACHO) - /* We don't need a pic base, we're not producing pic. */ - gcc_unreachable (); + /* This over-estimates by 1 minimal-stack-alignment-unit but + mitigates that by counting in the new return address slot. */ + current_function_dynamic_stack_size + += crtl->stack_alignment_needed / BITS_PER_UNIT; + } - xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ()); - output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops); - targetm.asm_out.internal_label (asm_out_file, "L", - CODE_LABEL_NUMBER (XEXP (xops[2], 0))); + current_function_static_stack_size = stack_size; } - if (!TARGET_MACHO) - output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops); - - return ""; -} - -/* Generate an "push" pattern for input ARG. */ - -static rtx -gen_push (rtx arg) -{ - struct machine_function *m = cfun->machine; - - if (m->fs.cfa_reg == stack_pointer_rtx) - m->fs.cfa_offset += UNITS_PER_WORD; - m->fs.sp_offset += UNITS_PER_WORD; - - if (REG_P (arg) && GET_MODE (arg) != word_mode) - arg = gen_rtx_REG (word_mode, REGNO (arg)); - - return gen_rtx_SET (gen_rtx_MEM (word_mode, - gen_rtx_PRE_DEC (Pmode, - stack_pointer_rtx)), - arg); -} + /* On SEH target with very large frame size, allocate an area to save + SSE registers (as the very large allocation won't be described). */ + if (TARGET_SEH + && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE + && !sse_registers_saved) + { + HOST_WIDE_INT sse_size + = frame.sse_reg_save_offset - frame.reg_save_offset; -/* Generate an "pop" pattern for input ARG. */ + gcc_assert (int_registers_saved); -static rtx -gen_pop (rtx arg) -{ - if (REG_P (arg) && GET_MODE (arg) != word_mode) - arg = gen_rtx_REG (word_mode, REGNO (arg)); + /* No need to do stack checking as the area will be immediately + written. */ + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (-sse_size), -1, + m->fs.cfa_reg == stack_pointer_rtx); + allocate -= sse_size; + ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset); + sse_registers_saved = true; + } - return gen_rtx_SET (arg, - gen_rtx_MEM (word_mode, - gen_rtx_POST_INC (Pmode, - stack_pointer_rtx))); -} + /* The stack has already been decremented by the instruction calling us + so probe if the size is non-negative to preserve the protection area. 
*/ + if (allocate >= 0 + && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK + || flag_stack_clash_protection)) + { + if (flag_stack_clash_protection) + { + ix86_adjust_stack_and_probe_stack_clash (allocate, + int_registers_saved); + allocate = 0; + } + else if (STACK_CHECK_MOVING_SP) + { + if (!(crtl->is_leaf && !cfun->calls_alloca + && allocate <= get_probe_interval ())) + { + ix86_adjust_stack_and_probe (allocate, int_registers_saved); + allocate = 0; + } + } + else + { + HOST_WIDE_INT size = allocate; -/* Return >= 0 if there is an unused call-clobbered register available - for the entire function. */ + if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000)) + size = 0x80000000 - get_stack_check_protect () - 1; -static unsigned int -ix86_select_alt_pic_regnum (void) -{ - if (ix86_use_pseudo_pic_reg ()) - return INVALID_REGNUM; + if (TARGET_STACK_PROBE) + { + if (crtl->is_leaf && !cfun->calls_alloca) + { + if (size > get_probe_interval ()) + ix86_emit_probe_stack_range (0, size, int_registers_saved); + } + else + ix86_emit_probe_stack_range (0, + size + get_stack_check_protect (), + int_registers_saved); + } + else + { + if (crtl->is_leaf && !cfun->calls_alloca) + { + if (size > get_probe_interval () + && size > get_stack_check_protect ()) + ix86_emit_probe_stack_range (get_stack_check_protect (), + (size + - get_stack_check_protect ()), + int_registers_saved); + } + else + ix86_emit_probe_stack_range (get_stack_check_protect (), size, + int_registers_saved); + } + } + } - if (crtl->is_leaf - && !crtl->profile - && !ix86_current_function_calls_tls_descriptor) + if (allocate == 0) + ; + else if (!ix86_target_stack_probe () + || frame.stack_pointer_offset < CHECK_STACK_LIMIT) { - int i, drap; - /* Can't use the same register for both PIC and DRAP. */ - if (crtl->drap_reg) - drap = REGNO (crtl->drap_reg); - else - drap = -1; - for (i = 2; i >= 0; --i) - if (i != drap && !df_regs_ever_live_p (i)) - return i; + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (-allocate), -1, + m->fs.cfa_reg == stack_pointer_rtx); } + else + { + rtx eax = gen_rtx_REG (Pmode, AX_REG); + rtx r10 = NULL; + rtx (*adjust_stack_insn)(rtx, rtx, rtx); + const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx); + bool eax_live = ix86_eax_live_at_start_p (); + bool r10_live = false; - return INVALID_REGNUM; -} + if (TARGET_64BIT) + r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0); -/* Return true if REGNO is used by the epilogue. */ + if (eax_live) + { + insn = emit_insn (gen_push (eax)); + allocate -= UNITS_PER_WORD; + /* Note that SEH directives need to continue tracking the stack + pointer even after the frame pointer has been set up. */ + if (sp_is_cfa_reg || TARGET_SEH) + { + if (sp_is_cfa_reg) + m->fs.cfa_offset += UNITS_PER_WORD; + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_FRAME_RELATED_EXPR, + gen_rtx_SET (stack_pointer_rtx, + plus_constant (Pmode, stack_pointer_rtx, + -UNITS_PER_WORD))); + } + } -bool -ix86_epilogue_uses (int regno) -{ - /* If there are no caller-saved registers, we preserve all registers, - except for MMX and x87 registers which aren't supported when saving - and restoring registers. Don't explicitly save SP register since - it is always preserved. 
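The nest of conditions above selects one of three probing strategies. Here is a compact restatement as a standalone helper; it is a sketch that ignores the 64-bit size clamp and the leaf-function refinements of the static-range branch, and the parameter names are illustrative.

// Rough decision model for the probing strategy chosen above.
#include <stdio.h>
#include <stdbool.h>

static const char *pick_probing (long allocate, bool clash_protection,
                                 bool static_builtin_check, bool moving_sp,
                                 bool small_leaf, long probe_interval)
{
  if (allocate < 0 || !(static_builtin_check || clash_protection))
    return "no probing";
  if (clash_protection)
    return "ix86_adjust_stack_and_probe_stack_clash";
  if (moving_sp)
    return (small_leaf && allocate <= probe_interval)
           ? "no probing (small leaf frame)"
           : "ix86_adjust_stack_and_probe";
  return "ix86_emit_probe_stack_range";
}

int main (void)
{
  printf ("%s\n", pick_probing (1 << 20, true,  false, false, false, 4096));
  printf ("%s\n", pick_probing (2048,    false, true,  true,  true,  4096));
  return 0;
}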
*/ - return (epilogue_completed - && cfun->machine->no_caller_saved_registers - && !fixed_regs[regno] - && !STACK_REGNO_P (regno) - && !MMX_REGNO_P (regno)); -} + if (r10_live) + { + r10 = gen_rtx_REG (Pmode, R10_REG); + insn = emit_insn (gen_push (r10)); + allocate -= UNITS_PER_WORD; + if (sp_is_cfa_reg || TARGET_SEH) + { + if (sp_is_cfa_reg) + m->fs.cfa_offset += UNITS_PER_WORD; + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_FRAME_RELATED_EXPR, + gen_rtx_SET (stack_pointer_rtx, + plus_constant (Pmode, stack_pointer_rtx, + -UNITS_PER_WORD))); + } + } -/* Return nonzero if register REGNO can be used as a scratch register - in peephole2. */ + emit_move_insn (eax, GEN_INT (allocate)); + emit_insn (ix86_gen_allocate_stack_worker (eax, eax)); -static bool -ix86_hard_regno_scratch_ok (unsigned int regno) -{ - /* If there are no caller-saved registers, we can't use any register - as a scratch register after epilogue and use REGNO as scratch - register only if it has been used before to avoid saving and - restoring it. */ - return (!cfun->machine->no_caller_saved_registers - || (!epilogue_completed - && df_regs_ever_live_p (regno))); -} + /* Use the fact that AX still contains ALLOCATE. */ + adjust_stack_insn = (Pmode == DImode + ? gen_pro_epilogue_adjust_stack_di_sub + : gen_pro_epilogue_adjust_stack_si_sub); -/* Return TRUE if we need to save REGNO. */ + insn = emit_insn (adjust_stack_insn (stack_pointer_rtx, + stack_pointer_rtx, eax)); -static bool -ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined) -{ - /* If there are no caller-saved registers, we preserve all registers, - except for MMX and x87 registers which aren't supported when saving - and restoring registers. Don't explicitly save SP register since - it is always preserved. */ - if (cfun->machine->no_caller_saved_registers) - { - /* Don't preserve registers used for function return value. */ - rtx reg = crtl->return_rtx; - if (reg) + if (sp_is_cfa_reg || TARGET_SEH) { - unsigned int i = REGNO (reg); - unsigned int nregs = REG_NREGS (reg); - while (nregs-- > 0) - if ((i + nregs) == regno) - return false; + if (sp_is_cfa_reg) + m->fs.cfa_offset += allocate; + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_FRAME_RELATED_EXPR, + gen_rtx_SET (stack_pointer_rtx, + plus_constant (Pmode, stack_pointer_rtx, + -allocate))); } + m->fs.sp_offset += allocate; - return (df_regs_ever_live_p (regno) - && !fixed_regs[regno] - && !STACK_REGNO_P (regno) - && !MMX_REGNO_P (regno) - && (regno != HARD_FRAME_POINTER_REGNUM - || !frame_pointer_needed)); + /* Use stack_pointer_rtx for relative addressing so that code works for + realigned stack. But this means that we need a blockage to prevent + stores based on the frame pointer from being scheduled before. */ + if (r10_live && eax_live) + { + t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax); + emit_move_insn (gen_rtx_REG (word_mode, R10_REG), + gen_frame_mem (word_mode, t)); + t = plus_constant (Pmode, t, UNITS_PER_WORD); + emit_move_insn (gen_rtx_REG (word_mode, AX_REG), + gen_frame_mem (word_mode, t)); + emit_insn (gen_memory_blockage ()); + } + else if (eax_live || r10_live) + { + t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax); + emit_move_insn (gen_rtx_REG (word_mode, + (eax_live ? 
AX_REG : R10_REG)), + gen_frame_mem (word_mode, t)); + emit_insn (gen_memory_blockage ()); + } } + gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset); - if (regno == REAL_PIC_OFFSET_TABLE_REGNUM - && pic_offset_table_rtx) + /* If we havn't already set up the frame pointer, do so now. */ + if (frame_pointer_needed && !m->fs.fp_valid) { - if (ix86_use_pseudo_pic_reg ()) - { - /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to - _mcount in prologue. */ - if (!TARGET_64BIT && flag_pic && crtl->profile) - return true; - } - else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM) - || crtl->profile - || crtl->calls_eh_return - || crtl->uses_const_pool - || cfun->has_nonlocal_label) - return ix86_select_alt_pic_regnum () == INVALID_REGNUM; + insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx, + GEN_INT (frame.stack_pointer_offset + - frame.hard_frame_pointer_offset)); + insn = emit_insn (insn); + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL); + + if (m->fs.cfa_reg == stack_pointer_rtx) + m->fs.cfa_reg = hard_frame_pointer_rtx; + m->fs.fp_offset = frame.hard_frame_pointer_offset; + m->fs.fp_valid = true; } - if (crtl->calls_eh_return && maybe_eh_return) + if (!int_registers_saved) + ix86_emit_save_regs_using_mov (frame.reg_save_offset); + if (!sse_registers_saved) + ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset); + else if (save_stub_call_needed) + ix86_emit_outlined_ms2sysv_save (frame); + + /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT + in PROLOGUE. */ + if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry) { - unsigned i; - for (i = 0; ; i++) - { - unsigned test = EH_RETURN_DATA_REGNO (i); - if (test == INVALID_REGNUM) - break; - if (test == regno) - return true; - } + rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM); + insn = emit_insn (gen_set_got (pic)); + RTX_FRAME_RELATED_P (insn) = 1; + add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX); + emit_insn (gen_prologue_use (pic)); + /* Deleting already emmitted SET_GOT if exist and allocated to + REAL_PIC_OFFSET_TABLE_REGNUM. */ + ix86_elim_entry_set_got (pic); } - if (ignore_outlined && cfun->machine->call_ms2sysv) + if (crtl->drap_reg && !crtl->stack_realign_needed) { - unsigned count = cfun->machine->call_ms2sysv_extra_regs - + xlogue_layout::MIN_REGS; - if (xlogue_layout::is_stub_managed_reg (regno, count)) - return false; + /* vDRAP is setup but after reload it turns out stack realign + isn't necessary, here we will emit prologue to setup DRAP + without stack realign adjustment */ + t = choose_baseaddr (0, NULL); + emit_insn (gen_rtx_SET (crtl->drap_reg, t)); } - if (crtl->drap_reg - && regno == REGNO (crtl->drap_reg) - && !cfun->machine->no_drap_save_restore) - return true; + /* Prevent instructions from being scheduled into register save push + sequence when access to the redzone area is done through frame pointer. + The offset between the frame pointer and the stack pointer is calculated + relative to the value of the stack pointer at the end of the function + prologue, and moving instructions that access redzone area via frame + pointer inside push sequence violates this assumption. 
*/ + if (frame_pointer_needed && frame.red_zone_size) + emit_insn (gen_memory_blockage ()); - return (df_regs_ever_live_p (regno) - && !call_used_regs[regno] - && !fixed_regs[regno] - && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed)); + /* SEH requires that the prologue end within 256 bytes of the start of + the function. Prevent instruction schedules that would extend that. + Further, prevent alloca modifications to the stack pointer from being + combined with prologue modifications. */ + if (TARGET_SEH) + emit_insn (gen_prologue_use (stack_pointer_rtx)); } -/* Return number of saved general prupose registers. */ +/* Emit code to restore REG using a POP insn. */ -static int -ix86_nsaved_regs (void) +static void +ix86_emit_restore_reg_using_pop (rtx reg) { - int nregs = 0; - int regno; + struct machine_function *m = cfun->machine; + rtx_insn *insn = emit_insn (gen_pop (reg)); - for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) - nregs ++; - return nregs; -} + ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset); + m->fs.sp_offset -= UNITS_PER_WORD; -/* Return number of saved SSE registers. */ + if (m->fs.cfa_reg == crtl->drap_reg + && REGNO (reg) == REGNO (crtl->drap_reg)) + { + /* Previously we'd represented the CFA as an expression + like *(%ebp - 8). We've just popped that value from + the stack, which means we need to reset the CFA to + the drap register. This will remain until we restore + the stack pointer. */ + add_reg_note (insn, REG_CFA_DEF_CFA, reg); + RTX_FRAME_RELATED_P (insn) = 1; -static int -ix86_nsaved_sseregs (void) -{ - int nregs = 0; - int regno; + /* This means that the DRAP register is valid for addressing too. */ + m->fs.drap_valid = true; + return; + } - if (!TARGET_64BIT_MS_ABI) - return 0; - for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true)) - nregs ++; - return nregs; + if (m->fs.cfa_reg == stack_pointer_rtx) + { + rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); + x = gen_rtx_SET (stack_pointer_rtx, x); + add_reg_note (insn, REG_CFA_ADJUST_CFA, x); + RTX_FRAME_RELATED_P (insn) = 1; + + m->fs.cfa_offset -= UNITS_PER_WORD; + } + + /* When the frame pointer is the CFA, and we pop it, we are + swapping back to the stack pointer as the CFA. This happens + for stack frames that don't allocate other data, so we assume + the stack pointer is now pointing at the return address, i.e. + the function entry state, which makes the offset be 1 word. */ + if (reg == hard_frame_pointer_rtx) + { + m->fs.fp_valid = false; + if (m->fs.cfa_reg == hard_frame_pointer_rtx) + { + m->fs.cfa_reg = stack_pointer_rtx; + m->fs.cfa_offset -= UNITS_PER_WORD; + + add_reg_note (insn, REG_CFA_DEF_CFA, + gen_rtx_PLUS (Pmode, stack_pointer_rtx, + GEN_INT (m->fs.cfa_offset))); + RTX_FRAME_RELATED_P (insn) = 1; + } + } } -/* Given FROM and TO register numbers, say whether this elimination is - allowed. If stack alignment is needed, we can only replace argument - pointer with hard frame pointer, or replace frame pointer with stack - pointer. Otherwise, frame pointer elimination is automatically - handled and all other eliminations are valid. */ +/* Emit code to restore saved registers using POP insns. 
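ix86_emit_restore_reg_using_pop is the mirror image of the prologue push bookkeeping: each pop shrinks sp_offset by one word and, while SP is the CFA register, shrinks cfa_offset with it; popping the frame pointer also invalidates fp_valid. A standalone sketch that leaves out the DRAP and frame-pointer-as-CFA special cases handled above:

// Epilogue counterpart of the prologue push model: per-pop offset updates.
#include <stdio.h>
#include <stdbool.h>

#define UNITS_PER_WORD 8

struct frame_state { long cfa_offset, sp_offset; bool cfa_is_sp, fp_valid; };

static void model_pop (struct frame_state *fs, bool reg_is_frame_pointer)
{
  fs->sp_offset -= UNITS_PER_WORD;      // the pop raises SP by one word
  if (fs->cfa_is_sp)
    fs->cfa_offset -= UNITS_PER_WORD;   // keep the CFA note in sync
  if (reg_is_frame_pointer)
    fs->fp_valid = false;               // %rbp no longer addresses the frame
}

int main (void)
{
  struct frame_state fs = { 24, 24, true, true };   // e.g. two saves plus return address
  model_pop (&fs, false);               // pop a call-saved GPR
  model_pop (&fs, true);                // pop %rbp
  printf ("cfa_offset=%ld sp_offset=%ld fp_valid=%d\n",
          fs.cfa_offset, fs.sp_offset, (int) fs.fp_valid);
  return 0;
}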
*/ -static bool -ix86_can_eliminate (const int from, const int to) +static void +ix86_emit_restore_regs_using_pop (void) { - if (stack_realign_fp) - return ((from == ARG_POINTER_REGNUM - && to == HARD_FRAME_POINTER_REGNUM) - || (from == FRAME_POINTER_REGNUM - && to == STACK_POINTER_REGNUM)); - else - return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true; + unsigned int regno; + + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true)) + ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno)); } -/* Return the offset between two registers, one to be eliminated, and the other - its replacement, at the start of a routine. */ +/* Emit code and notes for the LEAVE instruction. If insn is non-null, + omits the emit and only attaches the notes. */ -HOST_WIDE_INT -ix86_initial_elimination_offset (int from, int to) +static void +ix86_emit_leave (rtx_insn *insn) { - struct ix86_frame &frame = cfun->machine->frame; + struct machine_function *m = cfun->machine; + if (!insn) + insn = emit_insn (ix86_gen_leave ()); - if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM) - return frame.hard_frame_pointer_offset; - else if (from == FRAME_POINTER_REGNUM - && to == HARD_FRAME_POINTER_REGNUM) - return frame.hard_frame_pointer_offset - frame.frame_pointer_offset; - else - { - gcc_assert (to == STACK_POINTER_REGNUM); + ix86_add_queued_cfa_restore_notes (insn); - if (from == ARG_POINTER_REGNUM) - return frame.stack_pointer_offset; + gcc_assert (m->fs.fp_valid); + m->fs.sp_valid = true; + m->fs.sp_realigned = false; + m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD; + m->fs.fp_valid = false; - gcc_assert (from == FRAME_POINTER_REGNUM); - return frame.stack_pointer_offset - frame.frame_pointer_offset; + if (m->fs.cfa_reg == hard_frame_pointer_rtx) + { + m->fs.cfa_reg = stack_pointer_rtx; + m->fs.cfa_offset = m->fs.sp_offset; + + add_reg_note (insn, REG_CFA_DEF_CFA, + plus_constant (Pmode, stack_pointer_rtx, + m->fs.sp_offset)); + RTX_FRAME_RELATED_P (insn) = 1; } + ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx, + m->fs.fp_offset); } -/* In a dynamically-aligned function, we can't know the offset from - stack pointer to frame pointer, so we must ensure that setjmp - eliminates fp against the hard fp (%ebp) rather than trying to - index from %esp up to the top of the frame across a gap that is - of unknown (at compile-time) size. */ -static rtx -ix86_builtin_setjmp_frame_value (void) +/* Emit code to restore saved registers using MOV insns. + First register is restored from CFA - CFA_OFFSET. */ +static void +ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset, + bool maybe_eh_return) { - return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx; + struct machine_function *m = cfun->machine; + unsigned int regno; + + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true)) + { + rtx reg = gen_rtx_REG (word_mode, regno); + rtx mem; + rtx_insn *insn; + + mem = choose_baseaddr (cfa_offset, NULL); + mem = gen_frame_mem (word_mode, mem); + insn = emit_move_insn (reg, mem); + + if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg)) + { + /* Previously we'd represented the CFA as an expression + like *(%ebp - 8). We've just popped that value from + the stack, which means we need to reset the CFA to + the drap register. This will remain until we restore + the stack pointer. 
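ix86_emit_leave updates the same tracked state for the combined mov+pop that leave performs: SP becomes valid again one word below where the frame pointer pointed, and the CFA moves back to SP if it had been described via the frame pointer. In the same illustrative style as the earlier offset sketches:

// What the leave bookkeeping does to the tracked offsets (sketch).
#include <stdio.h>
#include <stdbool.h>

#define UNITS_PER_WORD 8

struct frame_state { long cfa_offset, sp_offset, fp_offset; bool sp_valid, fp_valid, cfa_is_fp; };

static void model_leave (struct frame_state *fs)
{
  // leave is equivalent to: mov %rbp, %rsp; pop %rbp
  fs->sp_valid = true;
  fs->sp_offset = fs->fp_offset - UNITS_PER_WORD;   // SP lands just above the saved %rbp slot
  fs->fp_valid = false;
  if (fs->cfa_is_fp)
    {
      fs->cfa_is_fp = false;                        // the CFA is described via SP again
      fs->cfa_offset = fs->sp_offset;
    }
}

int main (void)
{
  struct frame_state fs = { 16, 48, 16, true, true, true };
  model_leave (&fs);
  printf ("sp_offset=%ld cfa_offset=%ld fp_valid=%d\n",
          fs.sp_offset, fs.cfa_offset, (int) fs.fp_valid);
  return 0;
}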
*/ + add_reg_note (insn, REG_CFA_DEF_CFA, reg); + RTX_FRAME_RELATED_P (insn) = 1; + + /* This means that the DRAP register is valid for addressing. */ + m->fs.drap_valid = true; + } + else + ix86_add_cfa_restore_note (NULL, reg, cfa_offset); + + cfa_offset -= UNITS_PER_WORD; + } } -/* Emits a warning for unsupported msabi to sysv pro/epilogues. */ -static void warn_once_call_ms2sysv_xlogues (const char *feature) +/* Emit code to restore saved registers using MOV insns. + First register is restored from CFA - CFA_OFFSET. */ +static void +ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset, + bool maybe_eh_return) { - static bool warned_once = false; - if (!warned_once) - { - warning (0, "%<-mcall-ms2sysv-xlogues%> is not compatible with %s", - feature); - warned_once = true; - } -} + unsigned int regno; -/* Return the probing interval for -fstack-clash-protection. */ + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true)) + { + rtx reg = gen_rtx_REG (V4SFmode, regno); + rtx mem; + unsigned int align = GET_MODE_ALIGNMENT (V4SFmode); -static HOST_WIDE_INT -get_probe_interval (void) -{ - if (flag_stack_clash_protection) - return (HOST_WIDE_INT_1U - << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL)); - else - return (HOST_WIDE_INT_1U << STACK_CHECK_PROBE_INTERVAL_EXP); -} + mem = choose_baseaddr (cfa_offset, &align); + mem = gen_rtx_MEM (V4SFmode, mem); -/* When using -fsplit-stack, the allocation routines set a field in - the TCB to the bottom of the stack plus this much space, measured - in bytes. */ + /* The location aligment depends upon the base register. */ + align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align); + gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1))); + set_mem_align (mem, align); + emit_insn (gen_rtx_SET (reg, mem)); -#define SPLIT_STACK_AVAILABLE 256 + ix86_add_cfa_restore_note (NULL, reg, cfa_offset); -/* Fill structure ix86_frame about frame of currently computed function. */ + cfa_offset -= GET_MODE_SIZE (V4SFmode); + } +} static void -ix86_compute_frame_layout (void) +ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame, + bool use_call, int style) { - struct ix86_frame *frame = &cfun->machine->frame; struct machine_function *m = cfun->machine; - unsigned HOST_WIDE_INT stack_alignment_needed; - HOST_WIDE_INT offset; - unsigned HOST_WIDE_INT preferred_alignment; - HOST_WIDE_INT size = get_frame_size (); - HOST_WIDE_INT to_allocate; - - /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit - * ms_abi functions that call a sysv function. We now need to prune away - * cases where it should be disabled. 
*/ - if (TARGET_64BIT && m->call_ms2sysv) - { - gcc_assert (TARGET_64BIT_MS_ABI); - gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES); - gcc_assert (!TARGET_SEH); - gcc_assert (TARGET_SSE); - gcc_assert (!ix86_using_red_zone ()); - - if (crtl->calls_eh_return) - { - gcc_assert (!reload_completed); - m->call_ms2sysv = false; - warn_once_call_ms2sysv_xlogues ("__builtin_eh_return"); - } + const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS + + m->call_ms2sysv_extra_regs; + rtvec v; + unsigned int elems_needed, align, i, vi = 0; + rtx_insn *insn; + rtx sym, tmp; + rtx rsi = gen_rtx_REG (word_mode, SI_REG); + rtx r10 = NULL_RTX; + const struct xlogue_layout &xlogue = xlogue_layout::get_instance (); + HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset (); + HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset; + rtx rsi_frame_load = NULL_RTX; + HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1; + enum xlogue_stub stub; - else if (ix86_static_chain_on_stack) - { - gcc_assert (!reload_completed); - m->call_ms2sysv = false; - warn_once_call_ms2sysv_xlogues ("static call chains"); - } + gcc_assert (!m->fs.fp_valid || frame_pointer_needed); - /* Finally, compute which registers the stub will manage. */ - else - { - unsigned count = xlogue_layout::count_stub_managed_regs (); - m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS; - m->call_ms2sysv_pad_in = 0; - } - } + /* If using a realigned stack, we should never start with padding. */ + gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ()); - frame->nregs = ix86_nsaved_regs (); - frame->nsseregs = ix86_nsaved_sseregs (); + /* Setup RSI as the stub's base pointer. */ + align = GET_MODE_ALIGNMENT (V4SFmode); + tmp = choose_baseaddr (rsi_offset, &align, SI_REG); + gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode)); - /* 64-bit MS ABI seem to require stack alignment to be always 16, - except for function prologues, leaf functions and when the defult - incoming stack boundary is overriden at command line or via - force_align_arg_pointer attribute. + emit_insn (gen_rtx_SET (rsi, tmp)); - Darwin's ABI specifies 128b alignment for both 32 and 64 bit variants - at call sites, including profile function calls. - */ - if (((TARGET_64BIT_MS_ABI || TARGET_MACHO) - && crtl->preferred_stack_boundary < 128) - && (!crtl->is_leaf || cfun->calls_alloca != 0 - || ix86_current_function_calls_tls_descriptor - || (TARGET_MACHO && crtl->profile) - || ix86_incoming_stack_boundary < 128)) + /* Get a symbol for the stub. */ + if (frame_pointer_needed) + stub = use_call ? XLOGUE_STUB_RESTORE_HFP + : XLOGUE_STUB_RESTORE_HFP_TAIL; + else + stub = use_call ? XLOGUE_STUB_RESTORE + : XLOGUE_STUB_RESTORE_TAIL; + sym = xlogue.get_stub_rtx (stub); + + elems_needed = ncregs; + if (use_call) + elems_needed += 1; + else + elems_needed += frame_pointer_needed ? 5 : 3; + v = rtvec_alloc (elems_needed); + + /* We call the epilogue stub when we need to pop incoming args or we are + doing a sibling call as the tail. Otherwise, we will emit a jmp to the + epilogue stub and it is the tail-call. 
*/ + if (use_call) + RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym); + else { - crtl->preferred_stack_boundary = 128; - crtl->stack_alignment_needed = 128; - } + RTVEC_ELT (v, vi++) = ret_rtx; + RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym); + if (frame_pointer_needed) + { + rtx rbp = gen_rtx_REG (DImode, BP_REG); + gcc_assert (m->fs.fp_valid); + gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx); - stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT; - preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT; + tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8)); + RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp); + RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp)); + tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode)); + RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp); + } + else + { + /* If no hard frame pointer, we set R10 to the SP restore value. */ + gcc_assert (!m->fs.fp_valid); + gcc_assert (m->fs.cfa_reg == stack_pointer_rtx); + gcc_assert (m->fs.sp_valid); - gcc_assert (!size || stack_alignment_needed); - gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT); - gcc_assert (preferred_alignment <= stack_alignment_needed); + r10 = gen_rtx_REG (DImode, R10_REG); + tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset)); + emit_insn (gen_rtx_SET (r10, tmp)); - /* The only ABI saving SSE regs should be 64-bit ms_abi. */ - gcc_assert (TARGET_64BIT || !frame->nsseregs); - if (TARGET_64BIT && m->call_ms2sysv) - { - gcc_assert (stack_alignment_needed >= 16); - gcc_assert (!frame->nsseregs); + RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10); + } } - /* For SEH we have to limit the amount of code movement into the prologue. - At present we do this via a BLOCKAGE, at which point there's very little - scheduling that can be done, which means that there's very little point - in doing anything except PUSHs. */ - if (TARGET_SEH) - m->use_fast_prologue_epilogue = false; - else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun))) + /* Generate frame load insns and restore notes. */ + for (i = 0; i < ncregs; ++i) { - int count = frame->nregs; - struct cgraph_node *node = cgraph_node::get (current_function_decl); + const xlogue_layout::reginfo &r = xlogue.get_reginfo (i); + machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode; + rtx reg, frame_load; - /* The fast prologue uses move instead of push to save registers. This - is significantly longer, but also executes faster as modern hardware - can execute the moves in parallel, but can't do that for push/pop. + reg = gen_rtx_REG (mode, r.regno); + frame_load = gen_frame_load (reg, rsi, r.offset); - Be careful about choosing what prologue to emit: When function takes - many instructions to execute we may use slow version as well as in - case function is known to be outside hot spot (this is known with - feedback only). Weight the size of function by number of registers - to save as it is cheap to use one or two push instructions but very - slow to use many of them. */ - if (count) - count = (count - 1) * FAST_PROLOGUE_INSN_COUNT; - if (node->frequency < NODE_FREQUENCY_NORMAL - || (flag_branch_probabilities - && node->frequency < NODE_FREQUENCY_HOT)) - m->use_fast_prologue_epilogue = false; + /* Save RSI frame load insn & note to add last. 
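/* Illustrative sketch, not part of the patch: the size of the PARALLEL
   assembled above.  A call to the restore stub only needs one extra element
   (the USE of the stub symbol); a tail jump also carries the return and the
   stack/frame-pointer restore pattern.  The helper name below is invented
   for illustration only.  */

#include <assert.h>
#include <stdbool.h>

static unsigned int
stub_parallel_elems (unsigned int ncregs, bool use_call,
                     bool frame_pointer_needed)
{
  if (use_call)
    return ncregs + 1;                              /* USE of the stub symbol */
  return ncregs + (frame_pointer_needed ? 5 : 3);   /* ret, USE, SP/FP restores */
}

int
main (void)
{
  assert (stub_parallel_elems (12, true, false) == 13);
  assert (stub_parallel_elems (12, false, true) == 17);
  assert (stub_parallel_elems (12, false, false) == 15);
  return 0;
}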
*/ + if (r.regno == SI_REG) + { + gcc_assert (!rsi_frame_load); + rsi_frame_load = frame_load; + rsi_restore_offset = r.offset; + } else - m->use_fast_prologue_epilogue - = !expensive_function_p (count); + { + RTVEC_ELT (v, vi++) = frame_load; + ix86_add_cfa_restore_note (NULL, reg, r.offset); + } } - frame->save_regs_using_mov - = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue - /* If static stack checking is enabled and done with probes, - the registers need to be saved before allocating the frame. */ - && flag_stack_check != STATIC_BUILTIN_STACK_CHECK); + /* Add RSI frame load & restore note at the end. */ + gcc_assert (rsi_frame_load); + gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1); + RTVEC_ELT (v, vi++) = rsi_frame_load; + ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG), + rsi_restore_offset); - /* Skip return address and error code in exception handler. */ - offset = INCOMING_FRAME_SP_OFFSET; + /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */ + if (!use_call && !frame_pointer_needed) + { + gcc_assert (m->fs.sp_valid); + gcc_assert (!m->fs.sp_realigned); - /* Skip pushed static chain. */ - if (ix86_static_chain_on_stack) - offset += UNITS_PER_WORD; + /* At this point, R10 should point to frame.stack_realign_offset. */ + if (m->fs.cfa_reg == stack_pointer_rtx) + m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset; + m->fs.sp_offset = frame.stack_realign_offset; + } - /* Skip saved base pointer. */ - if (frame_pointer_needed) - offset += UNITS_PER_WORD; - frame->hfp_save_offset = offset; + gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v)); + tmp = gen_rtx_PARALLEL (VOIDmode, v); + if (use_call) + insn = emit_insn (tmp); + else + { + insn = emit_jump_insn (tmp); + JUMP_LABEL (insn) = ret_rtx; - /* The traditional frame pointer location is at the top of the frame. */ - frame->hard_frame_pointer_offset = offset; + if (frame_pointer_needed) + ix86_emit_leave (insn); + else + { + /* Need CFA adjust note. */ + tmp = gen_rtx_SET (stack_pointer_rtx, r10); + add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp); + } + } - /* Register save area */ - offset += frame->nregs * UNITS_PER_WORD; - frame->reg_save_offset = offset; + RTX_FRAME_RELATED_P (insn) = true; + ix86_add_queued_cfa_restore_notes (insn); - /* On SEH target, registers are pushed just before the frame pointer - location. */ - if (TARGET_SEH) - frame->hard_frame_pointer_offset = offset; + /* If we're not doing a tail-call, we need to adjust the stack. */ + if (use_call && m->fs.sp_valid) + { + HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset; + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (dealloc), style, + m->fs.cfa_reg == stack_pointer_rtx); + } +} - /* Calculate the size of the va-arg area (not including padding, if any). */ - frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size; +/* Restore function stack, frame, and registers. */ - /* Also adjust stack_realign_offset for the largest alignment of - stack slot actually used. */ - if (stack_realign_fp - || (cfun->machine->max_used_stack_alignment != 0 - && (offset % cfun->machine->max_used_stack_alignment) != 0)) - { - /* We may need a 16-byte aligned stack for the remainder of the - register save area, but the stack frame for the local function - may require a greater alignment if using AVX/2/512. 
In order - to avoid wasting space, we first calculate the space needed for - the rest of the register saves, add that to the stack pointer, - and then realign the stack to the boundary of the start of the - frame for the local function. */ - HOST_WIDE_INT space_needed = 0; - HOST_WIDE_INT sse_reg_space_needed = 0; +void +ix86_expand_epilogue (int style) +{ + struct machine_function *m = cfun->machine; + struct machine_frame_state frame_state_save = m->fs; + bool restore_regs_via_mov; + bool using_drap; + bool restore_stub_is_tail = false; - if (TARGET_64BIT) - { - if (m->call_ms2sysv) - { - m->call_ms2sysv_pad_in = 0; - space_needed = xlogue_layout::get_instance ().get_stack_space_used (); - } + if (ix86_function_naked (current_function_decl)) + { + /* The program should not reach this point. */ + emit_insn (gen_ud2 ()); + return; + } - else if (frame->nsseregs) - /* The only ABI that has saved SSE registers (Win64) also has a - 16-byte aligned default stack. However, many programs violate - the ABI, and Wine64 forces stack realignment to compensate. */ - space_needed = frame->nsseregs * 16; + ix86_finalize_stack_frame_flags (); + const struct ix86_frame &frame = cfun->machine->frame; - sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16); + m->fs.sp_realigned = stack_realign_fp; + m->fs.sp_valid = stack_realign_fp + || !frame_pointer_needed + || crtl->sp_is_unchanging; + gcc_assert (!m->fs.sp_valid + || m->fs.sp_offset == frame.stack_pointer_offset); - /* 64-bit frame->va_arg_size should always be a multiple of 16, but - rounding to be pedantic. */ - space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16); - } - else - space_needed = frame->va_arg_size; + /* The FP must be valid if the frame pointer is present. */ + gcc_assert (frame_pointer_needed == m->fs.fp_valid); + gcc_assert (!m->fs.fp_valid + || m->fs.fp_offset == frame.hard_frame_pointer_offset); - /* Record the allocation size required prior to the realignment AND. */ - frame->stack_realign_allocate = space_needed; + /* We must have *some* valid pointer to the stack frame. */ + gcc_assert (m->fs.sp_valid || m->fs.fp_valid); - /* The re-aligned stack starts at frame->stack_realign_offset. Values - before this point are not directly comparable with values below - this point. Use sp_valid_at to determine if the stack pointer is - valid for a given offset, fp_valid_at for the frame pointer, or - choose_baseaddr to have a base register chosen for you. + /* The DRAP is never valid at this point. */ + gcc_assert (!m->fs.drap_valid); - Note that the result of (frame->stack_realign_offset - & (stack_alignment_needed - 1)) may not equal zero. */ - offset = ROUND_UP (offset + space_needed, stack_alignment_needed); - frame->stack_realign_offset = offset - space_needed; - frame->sse_reg_save_offset = frame->stack_realign_offset - + sse_reg_space_needed; - } - else - { - frame->stack_realign_offset = offset; + /* See the comment about red zone and frame + pointer usage in ix86_expand_prologue. */ + if (frame_pointer_needed && frame.red_zone_size) + emit_insn (gen_memory_blockage ()); - if (TARGET_64BIT && m->call_ms2sysv) - { - m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD); - offset += xlogue_layout::get_instance ().get_stack_space_used (); - } + using_drap = crtl->drap_reg && crtl->stack_realign_needed; + gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg); - /* Align and set SSE register save area. 
*/ - else if (frame->nsseregs) - { - /* If the incoming stack boundary is at least 16 bytes, or DRAP is - required and the DRAP re-alignment boundary is at least 16 bytes, - then we want the SSE register save area properly aligned. */ - if (ix86_incoming_stack_boundary >= 128 - || (stack_realign_drap && stack_alignment_needed >= 16)) - offset = ROUND_UP (offset, 16); - offset += frame->nsseregs * 16; - } - frame->sse_reg_save_offset = offset; - offset += frame->va_arg_size; + /* Determine the CFA offset of the end of the red-zone. */ + m->fs.red_zone_offset = 0; + if (ix86_using_red_zone () && crtl->args.pops_args < 65536) + { + /* The red-zone begins below return address and error code in + exception handler. */ + m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET; + + /* When the register save area is in the aligned portion of + the stack, determine the maximum runtime displacement that + matches up with the aligned frame. */ + if (stack_realign_drap) + m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT + + UNITS_PER_WORD); } - /* Align start of frame for local function. When a function call - is removed, it may become a leaf function. But if argument may - be passed on stack, we need to align the stack when there is no - tail call. */ - if (m->call_ms2sysv - || frame->va_arg_size != 0 - || size != 0 - || !crtl->is_leaf - || (!crtl->tail_call_emit - && cfun->machine->outgoing_args_on_stack) - || cfun->calls_alloca - || ix86_current_function_calls_tls_descriptor) - offset = ROUND_UP (offset, stack_alignment_needed); + HOST_WIDE_INT reg_save_offset = frame.reg_save_offset; - /* Frame pointer points here. */ - frame->frame_pointer_offset = offset; + /* Special care must be taken for the normal return case of a function + using eh_return: the eax and edx registers are marked as saved, but + not restored along this path. Adjust the save location to match. */ + if (crtl->calls_eh_return && style != 2) + reg_save_offset -= 2 * UNITS_PER_WORD; - offset += size; + /* EH_RETURN requires the use of moves to function properly. */ + if (crtl->calls_eh_return) + restore_regs_via_mov = true; + /* SEH requires the use of pops to identify the epilogue. */ + else if (TARGET_SEH) + restore_regs_via_mov = false; + /* If we're only restoring one register and sp cannot be used then + using a move instruction to restore the register since it's + less work than reloading sp and popping the register. */ + else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1) + restore_regs_via_mov = true; + else if (TARGET_EPILOGUE_USING_MOVE + && cfun->machine->use_fast_prologue_epilogue + && (frame.nregs > 1 + || m->fs.sp_offset != reg_save_offset)) + restore_regs_via_mov = true; + else if (frame_pointer_needed + && !frame.nregs + && m->fs.sp_offset != reg_save_offset) + restore_regs_via_mov = true; + else if (frame_pointer_needed + && TARGET_USE_LEAVE + && cfun->machine->use_fast_prologue_epilogue + && frame.nregs == 1) + restore_regs_via_mov = true; + else + restore_regs_via_mov = false; - /* Add outgoing arguments area. Can be skipped if we eliminated - all the function calls as dead code. - Skipping is however impossible when function calls alloca. Alloca - expander assumes that last crtl->outgoing_args_size - of stack frame are unused. 
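/* Illustrative sketch, not part of the patch: the rounding that the
   ROUND_UP calls in ix86_compute_frame_layout above perform when placing
   the register save areas, the va_arg area and the local frame.  For the
   power-of-two alignments used here, rounding up is a mask operation; GCC's
   own macro may be written differently, the definition below is only for
   illustration.  */

#include <assert.h>

#define ROUND_UP_P2(x, align) (((x) + (align) - 1) & ~((align) - 1))

int
main (void)
{
  assert (ROUND_UP_P2 (40, 16) == 48);   /* e.g. 40 bytes of saves, 16-byte frame */
  assert (ROUND_UP_P2 (48, 16) == 48);   /* already aligned: unchanged */
  assert (ROUND_UP_P2 (1, 8) == 8);
  return 0;
}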
*/ - if (ACCUMULATE_OUTGOING_ARGS - && (!crtl->is_leaf || cfun->calls_alloca - || ix86_current_function_calls_tls_descriptor)) + if (restore_regs_via_mov || frame.nsseregs) { - offset += crtl->outgoing_args_size; - frame->outgoing_arguments_size = crtl->outgoing_args_size; + /* Ensure that the entire register save area is addressable via + the stack pointer, if we will restore SSE regs via sp. */ + if (TARGET_64BIT + && m->fs.sp_offset > 0x7fffffff + && sp_valid_at (frame.stack_realign_offset + 1) + && (frame.nsseregs + frame.nregs) != 0) + { + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (m->fs.sp_offset + - frame.sse_reg_save_offset), + style, + m->fs.cfa_reg == stack_pointer_rtx); + } } - else - frame->outgoing_arguments_size = 0; - /* Align stack boundary. Only needed if we're calling another function - or using alloca. */ - if (!crtl->is_leaf || cfun->calls_alloca - || ix86_current_function_calls_tls_descriptor) - offset = ROUND_UP (offset, preferred_alignment); + /* If there are any SSE registers to restore, then we have to do it + via moves, since there's obviously no pop for SSE regs. */ + if (frame.nsseregs) + ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset, + style == 2); - /* We've reached end of stack frame. */ - frame->stack_pointer_offset = offset; + if (m->call_ms2sysv) + { + int pop_incoming_args = crtl->args.pops_args && crtl->args.size; - /* Size prologue needs to allocate. */ - to_allocate = offset - frame->sse_reg_save_offset; + /* We cannot use a tail-call for the stub if: + 1. We have to pop incoming args, + 2. We have additional int regs to restore, or + 3. A sibling call will be the tail-call, or + 4. We are emitting an eh_return_internal epilogue. - if ((!to_allocate && frame->nregs <= 1) - || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)) - /* If stack clash probing needs a loop, then it needs a - scratch register. But the returned register is only guaranteed - to be safe to use after register saves are complete. So if - stack clash protections are enabled and the allocated frame is - larger than the probe interval, then use pushes to save - callee saved registers. */ - || (flag_stack_clash_protection && to_allocate > get_probe_interval ())) - frame->save_regs_using_mov = false; + TODO: Item 4 has not yet tested! - if (ix86_using_red_zone () - && crtl->sp_is_unchanging - && crtl->is_leaf - && !ix86_pc_thunk_call_expanded - && !ix86_current_function_calls_tls_descriptor) - { - frame->red_zone_size = to_allocate; - if (frame->save_regs_using_mov) - frame->red_zone_size += frame->nregs * UNITS_PER_WORD; - if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE) - frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE; + If any of the above are true, we will call the stub rather than + jump to it. */ + restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1); + ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style); } - else - frame->red_zone_size = 0; - frame->stack_pointer_offset -= frame->red_zone_size; - /* The SEH frame pointer location is near the bottom of the frame. - This is enforced by the fact that the difference between the - stack pointer and the frame pointer is limited to 240 bytes in - the unwind data structure. */ - if (TARGET_SEH) + /* If using out-of-line stub that is a tail-call, then...*/ + if (m->call_ms2sysv && restore_stub_is_tail) { - HOST_WIDE_INT diff; - - /* If we can leave the frame pointer where it is, do so. 
Also, returns - the establisher frame for __builtin_frame_address (0). */ - diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset; - if (diff <= SEH_MAX_FRAME_SIZE - && (diff > 240 || (diff & 15) != 0) - && !crtl->accesses_prior_frames) - { - /* Ideally we'd determine what portion of the local stack frame - (within the constraint of the lowest 240) is most heavily used. - But without that complication, simply bias the frame pointer - by 128 bytes so as to maximize the amount of the local stack - frame that is addressable with 8-bit offsets. */ - frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128; - } + /* TODO: parinoid tests. (remove eventually) */ + gcc_assert (m->fs.sp_valid); + gcc_assert (!m->fs.sp_realigned); + gcc_assert (!m->fs.fp_valid); + gcc_assert (!m->fs.realigned); + gcc_assert (m->fs.sp_offset == UNITS_PER_WORD); + gcc_assert (!crtl->drap_reg); + gcc_assert (!frame.nregs); } -} + else if (restore_regs_via_mov) + { + rtx t; -/* This is semi-inlined memory_address_length, but simplified - since we know that we're always dealing with reg+offset, and - to avoid having to create and discard all that rtl. */ + if (frame.nregs) + ix86_emit_restore_regs_using_mov (reg_save_offset, style == 2); -static inline int -choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset) -{ - int len = 4; + /* eh_return epilogues need %ecx added to the stack pointer. */ + if (style == 2) + { + rtx sa = EH_RETURN_STACKADJ_RTX; + rtx_insn *insn; - if (offset == 0) - { - /* EBP and R13 cannot be encoded without an offset. */ - len = (regno == BP_REG || regno == R13_REG); - } - else if (IN_RANGE (offset, -128, 127)) - len = 1; + /* %ecx can't be used for both DRAP register and eh_return. */ + if (crtl->drap_reg) + gcc_assert (REGNO (crtl->drap_reg) != CX_REG); - /* ESP and R12 must be encoded with a SIB byte. */ - if (regno == SP_REG || regno == R12_REG) - len++; + /* regparm nested functions don't work with eh_return. */ + gcc_assert (!ix86_static_chain_on_stack); - return len; -} + if (frame_pointer_needed) + { + t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa); + t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD); + emit_insn (gen_rtx_SET (sa, t)); -/* Determine if the stack pointer is valid for accessing the CFA_OFFSET in - the frame save area. The register is saved at CFA - CFA_OFFSET. */ + t = gen_frame_mem (Pmode, hard_frame_pointer_rtx); + insn = emit_move_insn (hard_frame_pointer_rtx, t); -static bool -sp_valid_at (HOST_WIDE_INT cfa_offset) -{ - const struct machine_frame_state &fs = cfun->machine->fs; - if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset) - { - /* Validate that the cfa_offset isn't in a "no-man's land". */ - gcc_assert (cfa_offset <= fs.sp_realigned_fp_last); - return false; - } - return fs.sp_valid; -} - -/* Determine if the frame pointer is valid for accessing the CFA_OFFSET in - the frame save area. The register is saved at CFA - CFA_OFFSET. */ - -static inline bool -fp_valid_at (HOST_WIDE_INT cfa_offset) -{ - const struct machine_frame_state &fs = cfun->machine->fs; - if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last) - { - /* Validate that the cfa_offset isn't in a "no-man's land". */ - gcc_assert (cfa_offset >= fs.sp_realigned_offset); - return false; - } - return fs.fp_valid; -} - -/* Choose a base register based upon alignment requested, speed and/or - size. 
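/* Illustrative sketch, not part of the patch: the address-encoding cost
   that choose_baseaddr_len above computes.  A reg+offset operand needs 0, 1
   or 4 displacement bytes; %ebp/%r13 cannot be encoded without a
   displacement, and %esp/%r12 always cost an extra SIB byte.  The enum tags
   below are arbitrary stand-ins for the real register numbers, for
   illustration only.  */

#include <assert.h>

enum base_reg { BASE_BP, BASE_R13, BASE_SP, BASE_R12, BASE_AX };

static int
baseaddr_len (enum base_reg reg, long offset)
{
  int len = 4;                                   /* 32-bit displacement */

  if (offset == 0)
    len = (reg == BASE_BP || reg == BASE_R13);   /* these still need a disp8 of 0 */
  else if (offset >= -128 && offset <= 127)
    len = 1;                                     /* 8-bit displacement */

  if (reg == BASE_SP || reg == BASE_R12)
    len++;                                       /* SIB byte required */

  return len;
}

int
main (void)
{
  assert (baseaddr_len (BASE_AX, 0) == 0);
  assert (baseaddr_len (BASE_BP, 0) == 1);
  assert (baseaddr_len (BASE_SP, 8) == 2);
  assert (baseaddr_len (BASE_AX, 1000) == 4);
  return 0;
}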
*/ - -static void -choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg, - HOST_WIDE_INT &base_offset, - unsigned int align_reqested, unsigned int *align) -{ - const struct machine_function *m = cfun->machine; - unsigned int hfp_align; - unsigned int drap_align; - unsigned int sp_align; - bool hfp_ok = fp_valid_at (cfa_offset); - bool drap_ok = m->fs.drap_valid; - bool sp_ok = sp_valid_at (cfa_offset); - - hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY; - - /* Filter out any registers that don't meet the requested alignment - criteria. */ - if (align_reqested) - { - if (m->fs.realigned) - hfp_align = drap_align = sp_align = crtl->stack_alignment_needed; - /* SEH unwind code does do not currently support REG_CFA_EXPRESSION - notes (which we would need to use a realigned stack pointer), - so disable on SEH targets. */ - else if (m->fs.sp_realigned) - sp_align = crtl->stack_alignment_needed; + /* Note that we use SA as a temporary CFA, as the return + address is at the proper place relative to it. We + pretend this happens at the FP restore insn because + prior to this insn the FP would be stored at the wrong + offset relative to SA, and after this insn we have no + other reasonable register to use for the CFA. We don't + bother resetting the CFA to the SP for the duration of + the return insn, unless the control flow instrumentation + is done. In this case the SP is used later and we have + to reset CFA to SP. */ + add_reg_note (insn, REG_CFA_DEF_CFA, + plus_constant (Pmode, sa, UNITS_PER_WORD)); + ix86_add_queued_cfa_restore_notes (insn); + add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx); + RTX_FRAME_RELATED_P (insn) = 1; - hfp_ok = hfp_ok && hfp_align >= align_reqested; - drap_ok = drap_ok && drap_align >= align_reqested; - sp_ok = sp_ok && sp_align >= align_reqested; - } + m->fs.cfa_reg = sa; + m->fs.cfa_offset = UNITS_PER_WORD; + m->fs.fp_valid = false; - if (m->use_fast_prologue_epilogue) - { - /* Choose the base register most likely to allow the most scheduling - opportunities. Generally FP is valid throughout the function, - while DRAP must be reloaded within the epilogue. But choose either - over the SP due to increased encoding size. */ + pro_epilogue_adjust_stack (stack_pointer_rtx, sa, + const0_rtx, style, + flag_cf_protection); + } + else + { + t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa); + t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD); + insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t)); + ix86_add_queued_cfa_restore_notes (insn); - if (hfp_ok) - { - base_reg = hard_frame_pointer_rtx; - base_offset = m->fs.fp_offset - cfa_offset; - } - else if (drap_ok) - { - base_reg = crtl->drap_reg; - base_offset = 0 - cfa_offset; - } - else if (sp_ok) - { - base_reg = stack_pointer_rtx; - base_offset = m->fs.sp_offset - cfa_offset; + gcc_assert (m->fs.cfa_reg == stack_pointer_rtx); + if (m->fs.cfa_offset != UNITS_PER_WORD) + { + m->fs.cfa_offset = UNITS_PER_WORD; + add_reg_note (insn, REG_CFA_DEF_CFA, + plus_constant (Pmode, stack_pointer_rtx, + UNITS_PER_WORD)); + RTX_FRAME_RELATED_P (insn) = 1; + } + } + m->fs.sp_offset = UNITS_PER_WORD; + m->fs.sp_valid = true; + m->fs.sp_realigned = false; } } else { - HOST_WIDE_INT toffset; - int len = 16, tlen; - - /* Choose the base register with the smallest address encoding. - With a tie, choose FP > DRAP > SP. */ - if (sp_ok) + /* SEH requires that the function end with (1) a stack adjustment + if necessary, (2) a sequence of pops, and (3) a return or + jump instruction. 
Prevent insns from the function body from + being scheduled into this sequence. */ + if (TARGET_SEH) { - base_reg = stack_pointer_rtx; - base_offset = m->fs.sp_offset - cfa_offset; - len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset); + /* Prevent a catch region from being adjacent to the standard + epilogue sequence. Unfortunately neither crtl->uses_eh_lsda + nor several other flags that would be interesting to test are + set up yet. */ + if (flag_non_call_exceptions) + emit_insn (gen_nops (const1_rtx)); + else + emit_insn (gen_blockage ()); } - if (drap_ok) + + /* First step is to deallocate the stack frame so that we can + pop the registers. If the stack pointer was realigned, it needs + to be restored now. Also do it on SEH target for very large + frame as the emitted instructions aren't allowed by the ABI + in epilogues. */ + if (!m->fs.sp_valid || m->fs.sp_realigned + || (TARGET_SEH + && (m->fs.sp_offset - reg_save_offset + >= SEH_MAX_FRAME_SIZE))) { - toffset = 0 - cfa_offset; - tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset); - if (tlen <= len) - { - base_reg = crtl->drap_reg; - base_offset = toffset; - len = tlen; - } + pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx, + GEN_INT (m->fs.fp_offset + - reg_save_offset), + style, false); } - if (hfp_ok) + else if (m->fs.sp_offset != reg_save_offset) { - toffset = m->fs.fp_offset - cfa_offset; - tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset); - if (tlen <= len) - { - base_reg = hard_frame_pointer_rtx; - base_offset = toffset; - len = tlen; - } + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (m->fs.sp_offset + - reg_save_offset), + style, + m->fs.cfa_reg == stack_pointer_rtx); } - } - /* Set the align return value. */ - if (align) - { - if (base_reg == stack_pointer_rtx) - *align = sp_align; - else if (base_reg == crtl->drap_reg) - *align = drap_align; - else if (base_reg == hard_frame_pointer_rtx) - *align = hfp_align; - } -} + ix86_emit_restore_regs_using_pop (); + } -/* Return an RTX that points to CFA_OFFSET within the stack frame and - the alignment of address. If ALIGN is non-null, it should point to - an alignment value (in bits) that is preferred or zero and will - recieve the alignment of the base register that was selected, - irrespective of rather or not CFA_OFFSET is a multiple of that - alignment value. If it is possible for the base register offset to be - non-immediate then SCRATCH_REGNO should specify a scratch register to - use. + /* If we used a stack pointer and haven't already got rid of it, + then do so now. */ + if (m->fs.fp_valid) + { + /* If the stack pointer is valid and pointing at the frame + pointer store address, then we only need a pop. */ + if (sp_valid_at (frame.hfp_save_offset) + && m->fs.sp_offset == frame.hfp_save_offset) + ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx); + /* Leave results in shorter dependency chains on CPUs that are + able to grok it fast. */ + else if (TARGET_USE_LEAVE + || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun)) + || !cfun->machine->use_fast_prologue_epilogue) + ix86_emit_leave (NULL); + else + { + pro_epilogue_adjust_stack (stack_pointer_rtx, + hard_frame_pointer_rtx, + const0_rtx, style, !using_drap); + ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx); + } + } - The valid base registers are taken from CFUN->MACHINE->FS. 
*/ + if (using_drap) + { + int param_ptr_offset = UNITS_PER_WORD; + rtx_insn *insn; -static rtx -choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align, - unsigned int scratch_regno = INVALID_REGNUM) -{ - rtx base_reg = NULL; - HOST_WIDE_INT base_offset = 0; + gcc_assert (stack_realign_drap); - /* If a specific alignment is requested, try to get a base register - with that alignment first. */ - if (align && *align) - choose_basereg (cfa_offset, base_reg, base_offset, *align, align); + if (ix86_static_chain_on_stack) + param_ptr_offset += UNITS_PER_WORD; + if (!call_used_regs[REGNO (crtl->drap_reg)]) + param_ptr_offset += UNITS_PER_WORD; - if (!base_reg) - choose_basereg (cfa_offset, base_reg, base_offset, 0, align); + insn = emit_insn (gen_rtx_SET + (stack_pointer_rtx, + gen_rtx_PLUS (Pmode, + crtl->drap_reg, + GEN_INT (-param_ptr_offset)))); + m->fs.cfa_reg = stack_pointer_rtx; + m->fs.cfa_offset = param_ptr_offset; + m->fs.sp_offset = param_ptr_offset; + m->fs.realigned = false; - gcc_assert (base_reg != NULL); + add_reg_note (insn, REG_CFA_DEF_CFA, + gen_rtx_PLUS (Pmode, stack_pointer_rtx, + GEN_INT (param_ptr_offset))); + RTX_FRAME_RELATED_P (insn) = 1; - rtx base_offset_rtx = GEN_INT (base_offset); + if (!call_used_regs[REGNO (crtl->drap_reg)]) + ix86_emit_restore_reg_using_pop (crtl->drap_reg); + } - if (!x86_64_immediate_operand (base_offset_rtx, Pmode)) + /* At this point the stack pointer must be valid, and we must have + restored all of the registers. We may not have deallocated the + entire stack frame. We've delayed this until now because it may + be possible to merge the local stack deallocation with the + deallocation forced by ix86_static_chain_on_stack. */ + gcc_assert (m->fs.sp_valid); + gcc_assert (!m->fs.sp_realigned); + gcc_assert (!m->fs.fp_valid); + gcc_assert (!m->fs.realigned); + if (m->fs.sp_offset != UNITS_PER_WORD) { - gcc_assert (scratch_regno != INVALID_REGNUM); - - rtx scratch_reg = gen_rtx_REG (Pmode, scratch_regno); - emit_move_insn (scratch_reg, base_offset_rtx); - - return gen_rtx_PLUS (Pmode, base_reg, scratch_reg); + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (m->fs.sp_offset - UNITS_PER_WORD), + style, true); } + else + ix86_add_queued_cfa_restore_notes (get_last_insn ()); - return plus_constant (Pmode, base_reg, base_offset); -} - -/* Emit code to save registers in the prologue. */ - -static void -ix86_emit_save_regs (void) -{ - unsigned int regno; - rtx_insn *insn; - - for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; ) - if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) - { - insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno))); - RTX_FRAME_RELATED_P (insn) = 1; - } -} - -/* Emit a single register save at CFA - CFA_OFFSET. */ + /* Sibcall epilogues don't want a return instruction. */ + if (style == 0) + { + m->fs = frame_state_save; + return; + } -static void -ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno, - HOST_WIDE_INT cfa_offset) -{ - struct machine_function *m = cfun->machine; - rtx reg = gen_rtx_REG (mode, regno); - rtx mem, addr, base, insn; - unsigned int align = GET_MODE_ALIGNMENT (mode); + if (cfun->machine->func_type != TYPE_NORMAL) + emit_jump_insn (gen_interrupt_return ()); + else if (crtl->args.pops_args && crtl->args.size) + { + rtx popc = GEN_INT (crtl->args.pops_args); - addr = choose_baseaddr (cfa_offset, &align); - mem = gen_frame_mem (mode, addr); + /* i386 can only pop 64K bytes. 
If asked to pop more, pop return + address, do explicit add, and jump indirectly to the caller. */ - /* The location aligment depends upon the base register. */ - align = MIN (GET_MODE_ALIGNMENT (mode), align); - gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1))); - set_mem_align (mem, align); + if (crtl->args.pops_args >= 65536) + { + rtx ecx = gen_rtx_REG (SImode, CX_REG); + rtx_insn *insn; - insn = emit_insn (gen_rtx_SET (mem, reg)); - RTX_FRAME_RELATED_P (insn) = 1; + /* There is no "pascal" calling convention in any 64bit ABI. */ + gcc_assert (!TARGET_64BIT); - base = addr; - if (GET_CODE (base) == PLUS) - base = XEXP (base, 0); - gcc_checking_assert (REG_P (base)); + insn = emit_insn (gen_pop (ecx)); + m->fs.cfa_offset -= UNITS_PER_WORD; + m->fs.sp_offset -= UNITS_PER_WORD; - /* When saving registers into a re-aligned local stack frame, avoid - any tricky guessing by dwarf2out. */ - if (m->fs.realigned) - { - gcc_checking_assert (stack_realign_drap); + rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); + x = gen_rtx_SET (stack_pointer_rtx, x); + add_reg_note (insn, REG_CFA_ADJUST_CFA, x); + add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx)); + RTX_FRAME_RELATED_P (insn) = 1; - if (regno == REGNO (crtl->drap_reg)) - { - /* A bit of a hack. We force the DRAP register to be saved in - the re-aligned stack frame, which provides us with a copy - of the CFA that will last past the prologue. Install it. */ - gcc_checking_assert (cfun->machine->fs.fp_valid); - addr = plus_constant (Pmode, hard_frame_pointer_rtx, - cfun->machine->fs.fp_offset - cfa_offset); - mem = gen_rtx_MEM (mode, addr); - add_reg_note (insn, REG_CFA_DEF_CFA, mem); + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + popc, -1, true); + emit_jump_insn (gen_simple_return_indirect_internal (ecx)); } else - { - /* The frame pointer is a stable reference within the - aligned frame. Use it. */ - gcc_checking_assert (cfun->machine->fs.fp_valid); - addr = plus_constant (Pmode, hard_frame_pointer_rtx, - cfun->machine->fs.fp_offset - cfa_offset); - mem = gen_rtx_MEM (mode, addr); - add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg)); - } - } - - else if (base == stack_pointer_rtx && m->fs.sp_realigned - && cfa_offset >= m->fs.sp_realigned_offset) - { - gcc_checking_assert (stack_realign_fp); - add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg)); + emit_jump_insn (gen_simple_return_pop_internal (popc)); } - - /* The memory may not be relative to the current CFA register, - which means that we may need to generate a new pattern for - use by the unwind info. */ - else if (base != m->fs.cfa_reg) + else if (!m->call_ms2sysv || !restore_stub_is_tail) { - addr = plus_constant (Pmode, m->fs.cfa_reg, - m->fs.cfa_offset - cfa_offset); - mem = gen_rtx_MEM (mode, addr); - add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg)); - } -} + /* In case of return from EH a simple return cannot be used + as a return address will be compared with a shadow stack + return address. Use indirect jump instead. */ + if (style == 2 && flag_cf_protection) + { + /* Register used in indirect jump must be in word_mode. But + Pmode may not be the same as word_mode for x32. */ + rtx ecx = gen_rtx_REG (word_mode, CX_REG); + rtx_insn *insn; -/* Emit code to save registers using MOV insns. - First register is stored at CFA - CFA_OFFSET. 
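/* Illustrative sketch, not part of the patch: why the epilogue above treats
   crtl->args.pops_args >= 65536 specially.  The "ret imm16" form can pop at
   most 0xffff bytes of arguments, so a larger amount is handled by popping
   the return address into %ecx, adjusting the stack pointer explicitly and
   jumping back through %ecx.  The helper below is invented for illustration
   only.  */

#include <assert.h>
#include <stdbool.h>

static bool
fits_ret_imm16 (unsigned long pops_args)
{
  return pops_args < (1UL << 16);   /* ret takes a 16-bit immediate */
}

int
main (void)
{
  assert (fits_ret_imm16 (65535));     /* plain "ret $imm" works */
  assert (!fits_ret_imm16 (65536));    /* needs the pop/add/jmp sequence */
  return 0;
}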
*/ -static void -ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset) -{ - unsigned int regno; + insn = emit_insn (gen_pop (ecx)); + m->fs.cfa_offset -= UNITS_PER_WORD; + m->fs.sp_offset -= UNITS_PER_WORD; - for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true)) - { - ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset); - cfa_offset -= UNITS_PER_WORD; - } -} + rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); + x = gen_rtx_SET (stack_pointer_rtx, x); + add_reg_note (insn, REG_CFA_ADJUST_CFA, x); + add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx)); + RTX_FRAME_RELATED_P (insn) = 1; -/* Emit code to save SSE registers using MOV insns. - First register is stored at CFA - CFA_OFFSET. */ -static void -ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset) -{ - unsigned int regno; + emit_jump_insn (gen_simple_return_indirect_internal (ecx)); + } + else + emit_jump_insn (gen_simple_return_internal ()); + } - for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true)) - { - ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset); - cfa_offset -= GET_MODE_SIZE (V4SFmode); - } + /* Restore the state back to the state from the prologue, + so that it's correct for the next epilogue. */ + m->fs = frame_state_save; } -static GTY(()) rtx queued_cfa_restores; - -/* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack - manipulation insn. The value is on the stack at CFA - CFA_OFFSET. - Don't add the note if the previously saved value will be left untouched - within stack red-zone till return, as unwinders can find the same value - in the register and on the stack. */ +/* Reset from the function's potential modifications. */ static void -ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset) +ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED) { - if (!crtl->shrink_wrapped - && cfa_offset <= cfun->machine->fs.red_zone_offset) - return; + if (pic_offset_table_rtx + && !ix86_use_pseudo_pic_reg ()) + SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM); - if (insn) + if (TARGET_MACHO) { - add_reg_note (insn, REG_CFA_RESTORE, reg); - RTX_FRAME_RELATED_P (insn) = 1; - } - else - queued_cfa_restores - = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores); -} - -/* Add queued REG_CFA_RESTORE notes if any to INSN. */ + rtx_insn *insn = get_last_insn (); + rtx_insn *deleted_debug_label = NULL; -static void -ix86_add_queued_cfa_restore_notes (rtx insn) -{ - rtx last; - if (!queued_cfa_restores) - return; - for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1)) - ; - XEXP (last, 1) = REG_NOTES (insn); - REG_NOTES (insn) = queued_cfa_restores; - queued_cfa_restores = NULL_RTX; - RTX_FRAME_RELATED_P (insn) = 1; -} + /* Mach-O doesn't support labels at the end of objects, so if + it looks like we might want one, take special action. + First, collect any sequence of deleted debug labels. */ + while (insn + && NOTE_P (insn) + && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL) + { + /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL + notes only, instead set their CODE_LABEL_NUMBER to -1, + otherwise there would be code generation differences + in between -g and -g0. */ + if (NOTE_P (insn) && NOTE_KIND (insn) + == NOTE_INSN_DELETED_DEBUG_LABEL) + deleted_debug_label = insn; + insn = PREV_INSN (insn); + } -/* Expand prologue or epilogue stack adjustment. 
- The pattern exist to put a dependency on all ebp-based memory accesses. - STYLE should be negative if instructions should be marked as frame related, - zero if %r11 register is live and cannot be freely used and positive - otherwise. */ + /* If we have: + label: + barrier + then this needs to be detected, so skip past the barrier. */ -static rtx -pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset, - int style, bool set_cfa) -{ - struct machine_function *m = cfun->machine; - rtx insn; - bool add_frame_related_expr = false; + if (insn && BARRIER_P (insn)) + insn = PREV_INSN (insn); - if (Pmode == SImode) - insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset); - else if (x86_64_immediate_operand (offset, DImode)) - insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset); - else - { - rtx tmp; - /* r11 is used by indirect sibcall return as well, set before the - epilogue and used after the epilogue. */ - if (style) - tmp = gen_rtx_REG (DImode, R11_REG); - else + /* Up to now we've only seen notes or barriers. */ + if (insn) { - gcc_assert (src != hard_frame_pointer_rtx - && dest != hard_frame_pointer_rtx); - tmp = hard_frame_pointer_rtx; + if (LABEL_P (insn) + || (NOTE_P (insn) + && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)) + /* Trailing label. */ + fputs ("\tnop\n", file); + else if (cfun && ! cfun->is_thunk) + { + /* See if we have a completely empty function body, skipping + the special case of the picbase thunk emitted as asm. */ + while (insn && ! INSN_P (insn)) + insn = PREV_INSN (insn); + /* If we don't find any insns, we've got an empty function body; + I.e. completely empty - without a return or branch. This is + taken as the case where a function body has been removed + because it contains an inline __builtin_unreachable(). GCC + declares that reaching __builtin_unreachable() means UB so + we're not obliged to do anything special; however, we want + non-zero-sized function bodies. To meet this, and help the + user out, let's trap the case. */ + if (insn == NULL) + fputs ("\tud2\n", file); + } } - insn = emit_insn (gen_rtx_SET (tmp, offset)); - if (style < 0) - add_frame_related_expr = true; - - insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp); + else if (deleted_debug_label) + for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn)) + if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL) + CODE_LABEL_NUMBER (insn) = -1; } +} - insn = emit_insn (insn); - if (style >= 0) - ix86_add_queued_cfa_restore_notes (insn); +/* Return a scratch register to use in the split stack prologue. The + split stack prologue is used for -fsplit-stack. It is the first + instructions in the function, even before the regular prologue. + The scratch register can be any caller-saved register which is not + used for parameters or for the static chain. 
*/ - if (set_cfa) +static unsigned int +split_stack_prologue_scratch_regno (void) +{ + if (TARGET_64BIT) + return R11_REG; + else { - rtx r; + bool is_fastcall, is_thiscall; + int regparm; - gcc_assert (m->fs.cfa_reg == src); - m->fs.cfa_offset += INTVAL (offset); - m->fs.cfa_reg = dest; + is_fastcall = (lookup_attribute ("fastcall", + TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))) + != NULL); + is_thiscall = (lookup_attribute ("thiscall", + TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))) + != NULL); + regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl); - r = gen_rtx_PLUS (Pmode, src, offset); - r = gen_rtx_SET (dest, r); - add_reg_note (insn, REG_CFA_ADJUST_CFA, r); - RTX_FRAME_RELATED_P (insn) = 1; - } - else if (style < 0) - { - RTX_FRAME_RELATED_P (insn) = 1; - if (add_frame_related_expr) + if (is_fastcall) { - rtx r = gen_rtx_PLUS (Pmode, src, offset); - r = gen_rtx_SET (dest, r); - add_reg_note (insn, REG_FRAME_RELATED_EXPR, r); + if (DECL_STATIC_CHAIN (cfun->decl)) + { + sorry ("%<-fsplit-stack%> does not support fastcall with " + "nested function"); + return INVALID_REGNUM; + } + return AX_REG; } - } - - if (dest == stack_pointer_rtx) - { - HOST_WIDE_INT ooffset = m->fs.sp_offset; - bool valid = m->fs.sp_valid; - bool realigned = m->fs.sp_realigned; - - if (src == hard_frame_pointer_rtx) - { - valid = m->fs.fp_valid; - realigned = false; - ooffset = m->fs.fp_offset; + else if (is_thiscall) + { + if (!DECL_STATIC_CHAIN (cfun->decl)) + return DX_REG; + return AX_REG; } - else if (src == crtl->drap_reg) + else if (regparm < 3) { - valid = m->fs.drap_valid; - realigned = false; - ooffset = 0; + if (!DECL_STATIC_CHAIN (cfun->decl)) + return CX_REG; + else + { + if (regparm >= 2) + { + sorry ("%<-fsplit-stack%> does not support 2 register " + "parameters for a nested function"); + return INVALID_REGNUM; + } + return DX_REG; + } } else { - /* Else there are two possibilities: SP itself, which we set - up as the default above. Or EH_RETURN_STACKADJ_RTX, which is - taken care of this by hand along the eh_return path. */ - gcc_checking_assert (src == stack_pointer_rtx - || offset == const0_rtx); + /* FIXME: We could make this work by pushing a register + around the addition and comparison. */ + sorry ("%<-fsplit-stack%> does not support 3 register parameters"); + return INVALID_REGNUM; } - - m->fs.sp_offset = ooffset - INTVAL (offset); - m->fs.sp_valid = valid; - m->fs.sp_realigned = realigned; } - return insn; } -/* Find an available register to be used as dynamic realign argument - pointer regsiter. Such a register will be written in prologue and - used in begin of body, so it must not be - 1. parameter passing register. - 2. GOT pointer. - We reuse static-chain register if it is available. Otherwise, we - use DI for i386 and R13 for x86-64. We chose R13 since it has - shorter encoding. +/* A SYMBOL_REF for the function which allocates new stackspace for + -fsplit-stack. */ - Return: the regno of chosen register. */ +static GTY(()) rtx split_stack_fn; -static unsigned int -find_drap_reg (void) -{ - tree decl = cfun->decl; +/* A SYMBOL_REF for the more stack function when using the large + model. */ - /* Always use callee-saved register if there are no caller-saved - registers. */ - if (TARGET_64BIT) - { - /* Use R13 for nested function or function need static chain. - Since function with tail call may use any caller-saved - registers in epilogue, DRAP must not use caller-saved - register in such case. 
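/* Illustrative sketch, not part of the patch: the decision tree of
   split_stack_prologue_scratch_regno above.  The split-stack check runs
   before anything has been saved, so its scratch register must be
   caller-saved and must not already hold a parameter or the static chain;
   the unsupported combinations that the code reports with sorry() are
   collapsed to NONE here.  The enum and function below are invented for
   illustration only.  */

#include <assert.h>
#include <stdbool.h>

enum scratch { SCRATCH_NONE, SCRATCH_AX, SCRATCH_CX, SCRATCH_DX, SCRATCH_R11 };

static enum scratch
pick_split_stack_scratch (bool is_64bit, bool is_fastcall, bool is_thiscall,
                          int regparm, bool has_static_chain)
{
  if (is_64bit)
    return SCRATCH_R11;
  if (is_fastcall)                      /* fastcall: %ecx/%edx carry args */
    return has_static_chain ? SCRATCH_NONE : SCRATCH_AX;
  if (is_thiscall)                      /* thiscall: %ecx carries `this' */
    return has_static_chain ? SCRATCH_AX : SCRATCH_DX;
  if (regparm < 3)
    {
      if (!has_static_chain)
        return SCRATCH_CX;
      return regparm >= 2 ? SCRATCH_NONE : SCRATCH_DX;   /* %ecx = chain */
    }
  return SCRATCH_NONE;                  /* %eax/%ecx/%edx all taken */
}

int
main (void)
{
  assert (pick_split_stack_scratch (true, false, false, 0, false) == SCRATCH_R11);
  assert (pick_split_stack_scratch (false, false, false, 0, false) == SCRATCH_CX);
  assert (pick_split_stack_scratch (false, false, false, 1, true) == SCRATCH_DX);
  return 0;
}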
*/ - if (DECL_STATIC_CHAIN (decl) - || cfun->machine->no_caller_saved_registers - || crtl->tail_call_emit) - return R13_REG; +static GTY(()) rtx split_stack_fn_large; - return R10_REG; - } - else - { - /* Use DI for nested function or function need static chain. - Since function with tail call may use any caller-saved - registers in epilogue, DRAP must not use caller-saved - register in such case. */ - if (DECL_STATIC_CHAIN (decl) - || cfun->machine->no_caller_saved_registers - || crtl->tail_call_emit) - return DI_REG; +/* Return location of the stack guard value in the TLS block. */ - /* Reuse static chain register if it isn't used for parameter - passing. */ - if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2) - { - unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl)); - if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0) - return CX_REG; - } - return DI_REG; - } -} +rtx +ix86_split_stack_guard (void) +{ + int offset; + addr_space_t as = DEFAULT_TLS_SEG_REG; + rtx r; -/* Handle a "force_align_arg_pointer" attribute. */ + gcc_assert (flag_split_stack); -static tree -ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name, - tree, int, bool *no_add_attrs) -{ - if (TREE_CODE (*node) != FUNCTION_TYPE - && TREE_CODE (*node) != METHOD_TYPE - && TREE_CODE (*node) != FIELD_DECL - && TREE_CODE (*node) != TYPE_DECL) - { - warning (OPT_Wattributes, "%qE attribute only applies to functions", - name); - *no_add_attrs = true; - } +#ifdef TARGET_THREAD_SPLIT_STACK_OFFSET + offset = TARGET_THREAD_SPLIT_STACK_OFFSET; +#else + gcc_unreachable (); +#endif - return NULL_TREE; + r = GEN_INT (offset); + r = gen_const_mem (Pmode, r); + set_mem_addr_space (r, as); + + return r; } -/* Return minimum incoming stack alignment. */ +/* Handle -fsplit-stack. These are the first instructions in the + function, even before the regular prologue. */ -static unsigned int -ix86_minimum_incoming_stack_boundary (bool sibcall) +void +ix86_expand_split_stack_prologue (void) { - unsigned int incoming_stack_boundary; + HOST_WIDE_INT allocate; + unsigned HOST_WIDE_INT args_size; + rtx_code_label *label; + rtx limit, current, allocate_rtx, call_fusage; + rtx_insn *call_insn; + rtx scratch_reg = NULL_RTX; + rtx_code_label *varargs_label = NULL; + rtx fn; - /* Stack of interrupt handler is aligned to 128 bits in 64bit mode. */ - if (cfun->machine->func_type != TYPE_NORMAL) - incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY; - /* Prefer the one specified at command line. */ - else if (ix86_user_incoming_stack_boundary) - incoming_stack_boundary = ix86_user_incoming_stack_boundary; - /* In 32bit, use MIN_STACK_BOUNDARY for incoming stack boundary - if -mstackrealign is used, it isn't used for sibcall check and - estimated stack alignment is 128bit. */ - else if (!sibcall - && ix86_force_align_arg_pointer - && crtl->stack_alignment_estimated == 128) - incoming_stack_boundary = MIN_STACK_BOUNDARY; - else - incoming_stack_boundary = ix86_default_incoming_stack_boundary; - - /* Incoming stack alignment can be changed on individual functions - via force_align_arg_pointer attribute. We use the smallest - incoming stack boundary. */ - if (incoming_stack_boundary > MIN_STACK_BOUNDARY - && lookup_attribute (ix86_force_align_arg_pointer_string, - TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl)))) - incoming_stack_boundary = MIN_STACK_BOUNDARY; - - /* The incoming stack frame has to be aligned at least at - parm_stack_boundary. 
*/ - if (incoming_stack_boundary < crtl->parm_stack_boundary) - incoming_stack_boundary = crtl->parm_stack_boundary; + gcc_assert (flag_split_stack && reload_completed); - /* Stack at entrance of main is aligned by runtime. We use the - smallest incoming stack boundary. */ - if (incoming_stack_boundary > MAIN_STACK_BOUNDARY - && DECL_NAME (current_function_decl) - && MAIN_NAME_P (DECL_NAME (current_function_decl)) - && DECL_FILE_SCOPE_P (current_function_decl)) - incoming_stack_boundary = MAIN_STACK_BOUNDARY; + ix86_finalize_stack_frame_flags (); + struct ix86_frame &frame = cfun->machine->frame; + allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET; - return incoming_stack_boundary; -} + /* This is the label we will branch to if we have enough stack + space. We expect the basic block reordering pass to reverse this + branch if optimizing, so that we branch in the unlikely case. */ + label = gen_label_rtx (); -/* Update incoming stack boundary and estimated stack alignment. */ + /* We need to compare the stack pointer minus the frame size with + the stack boundary in the TCB. The stack boundary always gives + us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we + can compare directly. Otherwise we need to do an addition. */ -static void -ix86_update_stack_boundary (void) -{ - ix86_incoming_stack_boundary - = ix86_minimum_incoming_stack_boundary (false); + limit = ix86_split_stack_guard (); - /* x86_64 vararg needs 16byte stack alignment for register save area. */ - if (TARGET_64BIT - && cfun->stdarg - && crtl->stack_alignment_estimated < 128) - crtl->stack_alignment_estimated = 128; + if (allocate < SPLIT_STACK_AVAILABLE) + current = stack_pointer_rtx; + else + { + unsigned int scratch_regno; + rtx offset; - /* __tls_get_addr needs to be called with 16-byte aligned stack. */ - if (ix86_tls_descriptor_calls_expanded_in_cfun - && crtl->preferred_stack_boundary < 128) - crtl->preferred_stack_boundary = 128; -} + /* We need a scratch register to hold the stack pointer minus + the required frame size. Since this is the very start of the + function, the scratch register can be any caller-saved + register which is not used for parameters. */ + offset = GEN_INT (- allocate); + scratch_regno = split_stack_prologue_scratch_regno (); + if (scratch_regno == INVALID_REGNUM) + return; + scratch_reg = gen_rtx_REG (Pmode, scratch_regno); + if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode)) + { + /* We don't use ix86_gen_add3 in this case because it will + want to split to lea, but when not optimizing the insn + will not be split after this point. */ + emit_insn (gen_rtx_SET (scratch_reg, + gen_rtx_PLUS (Pmode, stack_pointer_rtx, + offset))); + } + else + { + emit_move_insn (scratch_reg, offset); + emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg, + stack_pointer_rtx)); + } + current = scratch_reg; + } -/* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is - needed or an rtx for DRAP otherwise. */ + ix86_expand_branch (GEU, current, limit, label); + rtx_insn *jump_insn = get_last_insn (); + JUMP_LABEL (jump_insn) = label; -static rtx -ix86_get_drap_rtx (void) -{ - /* We must use DRAP if there are outgoing arguments on stack and - ACCUMULATE_OUTGOING_ARGS is false. */ - if (ix86_force_drap - || (cfun->machine->outgoing_args_on_stack - && !ACCUMULATE_OUTGOING_ARGS)) - crtl->need_drap = true; + /* Mark the jump as very likely to be taken. 
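/* Illustrative sketch, not part of the patch: the comparison that
   ix86_expand_split_stack_prologue sets up above.  The guard value read from
   the TCB always leaves SPLIT_STACK_AVAILABLE bytes of headroom, so a small
   frame can compare the stack pointer against it directly, while a larger
   frame first computes sp - allocate in a scratch register.  Plain integers
   stand in for pointers below; the names are invented for illustration
   only.  */

#include <assert.h>
#include <stdbool.h>

#define SPLIT_STACK_AVAILABLE 256

static bool
have_enough_stack (unsigned long sp, unsigned long guard,
                   unsigned long allocate)
{
  unsigned long current = (allocate < SPLIT_STACK_AVAILABLE
                           ? sp               /* headroom already covers it */
                           : sp - allocate);  /* scratch = sp - frame size  */
  return current >= guard;                    /* the GEU branch above       */
}

int
main (void)
{
  assert (have_enough_stack (10000, 8000, 100));    /* small frame           */
  assert (have_enough_stack (10000, 8000, 1500));   /* 8500 >= 8000          */
  assert (!have_enough_stack (10000, 8000, 2500));  /* must call __morestack */
  return 0;
}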
*/ + add_reg_br_prob_note (jump_insn, profile_probability::very_likely ()); - if (stack_realign_drap) + if (split_stack_fn == NULL_RTX) { - /* Assign DRAP to vDRAP and returns vDRAP */ - unsigned int regno = find_drap_reg (); - rtx drap_vreg; - rtx arg_ptr; - rtx_insn *seq, *insn; + split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack"); + SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL; + } + fn = split_stack_fn; - arg_ptr = gen_rtx_REG (Pmode, regno); - crtl->drap_reg = arg_ptr; + /* Get more stack space. We pass in the desired stack space and the + size of the arguments to copy to the new stack. In 32-bit mode + we push the parameters; __morestack will return on a new stack + anyhow. In 64-bit mode we pass the parameters in r10 and + r11. */ + allocate_rtx = GEN_INT (allocate); + args_size = crtl->args.size >= 0 ? (HOST_WIDE_INT) crtl->args.size : 0; + call_fusage = NULL_RTX; + rtx pop = NULL_RTX; + if (TARGET_64BIT) + { + rtx reg10, reg11; - start_sequence (); - drap_vreg = copy_to_reg (arg_ptr); - seq = get_insns (); - end_sequence (); + reg10 = gen_rtx_REG (Pmode, R10_REG); + reg11 = gen_rtx_REG (Pmode, R11_REG); - insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ())); - if (!optimize) + /* If this function uses a static chain, it will be in %r10. + Preserve it across the call to __morestack. */ + if (DECL_STATIC_CHAIN (cfun->decl)) { - add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg); - RTX_FRAME_RELATED_P (insn) = 1; + rtx rax; + + rax = gen_rtx_REG (word_mode, AX_REG); + emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG)); + use_reg (&call_fusage, rax); } - return drap_vreg; - } - else - return NULL; -} -/* Handle the TARGET_INTERNAL_ARG_POINTER hook. */ + if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC) + && !TARGET_PECOFF) + { + HOST_WIDE_INT argval; -static rtx -ix86_internal_arg_pointer (void) -{ - return virtual_incoming_args_rtx; -} + gcc_assert (Pmode == DImode); + /* When using the large model we need to load the address + into a register, and we've run out of registers. So we + switch to a different calling convention, and we call a + different function: __morestack_large. We pass the + argument size in the upper 32 bits of r10 and pass the + frame size in the lower 32 bits. */ + gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate); + gcc_assert ((args_size & 0xffffffff) == args_size); -struct scratch_reg { - rtx reg; - bool saved; -}; + if (split_stack_fn_large == NULL_RTX) + { + split_stack_fn_large + = gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model"); + SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL; + } + if (ix86_cmodel == CM_LARGE_PIC) + { + rtx_code_label *label; + rtx x; -/* Return a short-lived scratch register for use on function entry. - In 32-bit mode, it is valid only after the registers are saved - in the prologue. This register must be released by means of - release_scratch_register_on_entry once it is dead. 
*/ + label = gen_label_rtx (); + emit_label (label); + LABEL_PRESERVE_P (label) = 1; + emit_insn (gen_set_rip_rex64 (reg10, label)); + emit_insn (gen_set_got_offset_rex64 (reg11, label)); + emit_insn (ix86_gen_add3 (reg10, reg10, reg11)); + x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large), + UNSPEC_GOT); + x = gen_rtx_CONST (Pmode, x); + emit_move_insn (reg11, x); + x = gen_rtx_PLUS (Pmode, reg10, reg11); + x = gen_const_mem (Pmode, x); + emit_move_insn (reg11, x); + } + else + emit_move_insn (reg11, split_stack_fn_large); -static void -get_scratch_register_on_entry (struct scratch_reg *sr) -{ - int regno; + fn = reg11; - sr->saved = false; + argval = ((args_size << 16) << 16) + allocate; + emit_move_insn (reg10, GEN_INT (argval)); + } + else + { + emit_move_insn (reg10, allocate_rtx); + emit_move_insn (reg11, GEN_INT (args_size)); + use_reg (&call_fusage, reg11); + } - if (TARGET_64BIT) - { - /* We always use R11 in 64-bit mode. */ - regno = R11_REG; + use_reg (&call_fusage, reg10); } else { - tree decl = current_function_decl, fntype = TREE_TYPE (decl); - bool fastcall_p - = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE; - bool thiscall_p - = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE; - bool static_chain_p = DECL_STATIC_CHAIN (decl); - int regparm = ix86_function_regparm (fntype, decl); - int drap_regno - = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM; + rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size))); + add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD)); + insn = emit_insn (gen_push (allocate_rtx)); + add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD)); + pop = GEN_INT (2 * UNITS_PER_WORD); + } + call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn), + GEN_INT (UNITS_PER_WORD), constm1_rtx, + pop, false); + add_function_usage_to (call_insn, call_fusage); + if (!TARGET_64BIT) + add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0)); + /* Indicate that this function can't jump to non-local gotos. */ + make_reg_eh_region_note_nothrow_nononlocal (call_insn); - /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax - for the static chain register. */ - if ((regparm < 1 || (fastcall_p && !static_chain_p)) - && drap_regno != AX_REG) - regno = AX_REG; - /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx - for the static chain register. */ - else if (thiscall_p && !static_chain_p && drap_regno != AX_REG) - regno = AX_REG; - else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG) - regno = DX_REG; - /* ecx is the static chain register. */ - else if (regparm < 3 && !fastcall_p && !thiscall_p - && !static_chain_p - && drap_regno != CX_REG) - regno = CX_REG; - else if (ix86_save_reg (BX_REG, true, false)) - regno = BX_REG; - /* esi is the static chain register. */ - else if (!(regparm == 3 && static_chain_p) - && ix86_save_reg (SI_REG, true, false)) - regno = SI_REG; - else if (ix86_save_reg (DI_REG, true, false)) - regno = DI_REG; - else - { - regno = (drap_regno == AX_REG ? DX_REG : AX_REG); - sr->saved = true; - } - } + /* In order to make call/return prediction work right, we now need + to execute a return instruction. See + libgcc/config/i386/morestack.S for the details on how this works. - sr->reg = gen_rtx_REG (Pmode, regno); - if (sr->saved) + For flow purposes gcc must not see this as a return + instruction--we need control flow to continue at the subsequent + label. Therefore, we use an unspec. 
*/ + gcc_assert (crtl->args.pops_args < 65536); + rtx_insn *ret_insn + = emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args))); + + if ((flag_cf_protection & CF_BRANCH)) { - rtx_insn *insn = emit_insn (gen_push (sr->reg)); - RTX_FRAME_RELATED_P (insn) = 1; + /* Insert ENDBR since __morestack will jump back here via indirect + call. */ + rtx cet_eb = gen_nop_endbr (); + emit_insn_after (cet_eb, ret_insn); } -} -/* Release a scratch register obtained from the preceding function. + /* If we are in 64-bit mode and this function uses a static chain, + we saved %r10 in %rax before calling _morestack. */ + if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl)) + emit_move_insn (gen_rtx_REG (word_mode, R10_REG), + gen_rtx_REG (word_mode, AX_REG)); - If RELEASE_VIA_POP is true, we just pop the register off the stack - to release it. This is what non-Linux systems use with -fstack-check. + /* If this function calls va_start, we need to store a pointer to + the arguments on the old stack, because they may not have been + all copied to the new stack. At this point the old stack can be + found at the frame pointer value used by __morestack, because + __morestack has set that up before calling back to us. Here we + store that pointer in a scratch register, and in + ix86_expand_prologue we store the scratch register in a stack + slot. */ + if (cfun->machine->split_stack_varargs_pointer != NULL_RTX) + { + unsigned int scratch_regno; + rtx frame_reg; + int words; - Otherwise we use OFFSET to locate the saved register and the - allocated stack space becomes part of the local frame and is - deallocated by the epilogue. */ + scratch_regno = split_stack_prologue_scratch_regno (); + scratch_reg = gen_rtx_REG (Pmode, scratch_regno); + frame_reg = gen_rtx_REG (Pmode, BP_REG); -static void -release_scratch_register_on_entry (struct scratch_reg *sr, HOST_WIDE_INT offset, - bool release_via_pop) -{ - if (sr->saved) - { - if (release_via_pop) - { - struct machine_function *m = cfun->machine; - rtx x, insn = emit_insn (gen_pop (sr->reg)); + /* 64-bit: + fp -> old fp value + return address within this function + return address of caller of this function + stack arguments + So we add three words to get to the stack arguments. - /* The RX FRAME_RELATED_P mechanism doesn't know about pop. */ - RTX_FRAME_RELATED_P (insn) = 1; - x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD)); - x = gen_rtx_SET (stack_pointer_rtx, x); - add_reg_note (insn, REG_FRAME_RELATED_EXPR, x); - m->fs.sp_offset -= UNITS_PER_WORD; - } - else - { - rtx x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (offset)); - x = gen_rtx_SET (sr->reg, gen_rtx_MEM (word_mode, x)); - emit_insn (x); - } + 32-bit: + fp -> old fp value + return address within this function + first argument to __morestack + second argument to __morestack + return address of caller of this function + stack arguments + So we add five words to get to the stack arguments. + */ + words = TARGET_64BIT ? 3 : 5; + emit_insn (gen_rtx_SET (scratch_reg, + gen_rtx_PLUS (Pmode, frame_reg, + GEN_INT (words * UNITS_PER_WORD)))); + + varargs_label = gen_label_rtx (); + emit_jump_insn (gen_jump (varargs_label)); + JUMP_LABEL (get_last_insn ()) = varargs_label; + + emit_barrier (); } -} -/* Emit code to adjust the stack pointer by SIZE bytes while probing it. + emit_label (label); + LABEL_NUSES (label) = 1; - This differs from the next routine in that it tries hard to prevent - attacks that jump the stack guard. 
Thus it is never allowed to allocate - more than PROBE_INTERVAL bytes of stack space without a suitable - probe. + /* If this function calls va_start, we now have to set the scratch + register for the case where we do not call __morestack. In this + case we need to set it based on the stack pointer. */ + if (cfun->machine->split_stack_varargs_pointer != NULL_RTX) + { + emit_insn (gen_rtx_SET (scratch_reg, + gen_rtx_PLUS (Pmode, stack_pointer_rtx, + GEN_INT (UNITS_PER_WORD)))); - INT_REGISTERS_SAVED is true if integer registers have already been - pushed on the stack. */ + emit_label (varargs_label); + LABEL_NUSES (varargs_label) = 1; + } +} + +/* We may have to tell the dataflow pass that the split stack prologue + is initializing a scratch register. */ static void -ix86_adjust_stack_and_probe_stack_clash (HOST_WIDE_INT size, - const bool int_registers_saved) +ix86_live_on_entry (bitmap regs) { - struct machine_function *m = cfun->machine; - - /* If this function does not statically allocate stack space, then - no probes are needed. */ - if (!size) + if (cfun->machine->split_stack_varargs_pointer != NULL_RTX) { - /* However, the allocation of space via pushes for register - saves could be viewed as allocating space, but without the - need to probe. */ - if (m->frame.nregs || m->frame.nsseregs || frame_pointer_needed) - dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true); - else - dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false); - return; + gcc_assert (flag_split_stack); + bitmap_set_bit (regs, split_stack_prologue_scratch_regno ()); } +} + +/* Extract the parts of an RTL expression that is a valid memory address + for an instruction. Return 0 if the structure of the address is + grossly off. Return -1 if the address contains ASHIFT, so it is not + strictly valid, but still used for computing length of lea instruction. */ - /* If we are a noreturn function, then we have to consider the - possibility that we're called via a jump rather than a call. - - Thus we don't have the implicit probe generated by saving the - return address into the stack at the call. Thus, the stack - pointer could be anywhere in the guard page. The safe thing - to do is emit a probe now. - - The probe can be avoided if we have already emitted any callee - register saves into the stack or have a frame pointer (which will - have been saved as well). Those saves will function as implicit - probes. +int +ix86_decompose_address (rtx addr, struct ix86_address *out) +{ + rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX; + rtx base_reg, index_reg; + HOST_WIDE_INT scale = 1; + rtx scale_rtx = NULL_RTX; + rtx tmp; + int retval = 1; + addr_space_t seg = ADDR_SPACE_GENERIC; - ?!? This should be revamped to work like aarch64 and s390 where - we track the offset from the most recent probe. Normally that - offset would be zero. For a noreturn function we would reset - it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT). Then - we just probe when we cross PROBE_INTERVAL. */ - if (TREE_THIS_VOLATILE (cfun->decl) - && !(m->frame.nregs || m->frame.nsseregs || frame_pointer_needed)) + /* Allow zero-extended SImode addresses, + they will be emitted with addr32 prefix. */ + if (TARGET_64BIT && GET_MODE (addr) == DImode) { - /* We can safely use any register here since we're just going to push - its value and immediately pop it back. But we do try and avoid - argument passing registers so as not to introduce dependencies in - the pipeline. For 32 bit we use %esi and for 64 bit we use %rax. 
*/ - rtx dummy_reg = gen_rtx_REG (word_mode, TARGET_64BIT ? AX_REG : SI_REG); - rtx_insn *insn_push = emit_insn (gen_push (dummy_reg)); - rtx_insn *insn_pop = emit_insn (gen_pop (dummy_reg)); - m->fs.sp_offset -= UNITS_PER_WORD; - if (m->fs.cfa_reg == stack_pointer_rtx) + if (GET_CODE (addr) == ZERO_EXTEND + && GET_MODE (XEXP (addr, 0)) == SImode) { - m->fs.cfa_offset -= UNITS_PER_WORD; - rtx x = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD); - x = gen_rtx_SET (stack_pointer_rtx, x); - add_reg_note (insn_push, REG_CFA_ADJUST_CFA, x); - RTX_FRAME_RELATED_P (insn_push) = 1; - x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); - x = gen_rtx_SET (stack_pointer_rtx, x); - add_reg_note (insn_pop, REG_CFA_ADJUST_CFA, x); - RTX_FRAME_RELATED_P (insn_pop) = 1; + addr = XEXP (addr, 0); + if (CONST_INT_P (addr)) + return 0; + } + else if (GET_CODE (addr) == AND + && const_32bit_mask (XEXP (addr, 1), DImode)) + { + addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode); + if (addr == NULL_RTX) + return 0; + + if (CONST_INT_P (addr)) + return 0; } - emit_insn (gen_blockage ()); } - /* If we allocate less than the size of the guard statically, - then no probing is necessary, but we do need to allocate - the stack. */ - if (size < (1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE))) + /* Allow SImode subregs of DImode addresses, + they will be emitted with addr32 prefix. */ + if (TARGET_64BIT && GET_MODE (addr) == SImode) { - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (-size), -1, - m->fs.cfa_reg == stack_pointer_rtx); - dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true); - return; + if (SUBREG_P (addr) + && GET_MODE (SUBREG_REG (addr)) == DImode) + { + addr = SUBREG_REG (addr); + if (CONST_INT_P (addr)) + return 0; + } } - /* We're allocating a large enough stack frame that we need to - emit probes. Either emit them inline or in a loop depending - on the size. */ - HOST_WIDE_INT probe_interval = get_probe_interval (); - if (size <= 4 * probe_interval) + if (REG_P (addr)) + base = addr; + else if (SUBREG_P (addr)) { - HOST_WIDE_INT i; - for (i = probe_interval; i <= size; i += probe_interval) - { - /* Allocate PROBE_INTERVAL bytes. */ - rtx insn - = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (-probe_interval), -1, - m->fs.cfa_reg == stack_pointer_rtx); - add_reg_note (insn, REG_STACK_CHECK, const0_rtx); - - /* And probe at *sp. */ - emit_stack_probe (stack_pointer_rtx); - emit_insn (gen_blockage ()); - } - - /* We need to allocate space for the residual, but we do not need - to probe the residual. */ - HOST_WIDE_INT residual = (i - probe_interval - size); - if (residual) - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (residual), -1, - m->fs.cfa_reg == stack_pointer_rtx); - dump_stack_clash_frame_info (PROBE_INLINE, residual != 0); + if (REG_P (SUBREG_REG (addr))) + base = addr; + else + return 0; } - else + else if (GET_CODE (addr) == PLUS) { - /* We expect the GP registers to be saved when probes are used - as the probing sequences might need a scratch register and - the routine to allocate one assumes the integer registers - have already been saved. */ - gcc_assert (int_registers_saved); - - struct scratch_reg sr; - get_scratch_register_on_entry (&sr); - - /* If we needed to save a register, then account for any space - that was pushed (we are not going to pop the register when - we do the restore). 
*/ - if (sr.saved) - size -= UNITS_PER_WORD; - - /* Step 1: round SIZE down to a multiple of the interval. */ - HOST_WIDE_INT rounded_size = size & -probe_interval; + rtx addends[4], op; + int n = 0, i; - /* Step 2: compute final value of the loop counter. Use lea if - possible. */ - rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size); - rtx insn; - if (address_no_seg_operand (addr, Pmode)) - insn = emit_insn (gen_rtx_SET (sr.reg, addr)); - else - { - emit_move_insn (sr.reg, GEN_INT (-rounded_size)); - insn = emit_insn (gen_rtx_SET (sr.reg, - gen_rtx_PLUS (Pmode, sr.reg, - stack_pointer_rtx))); - } - if (m->fs.cfa_reg == stack_pointer_rtx) + op = addr; + do { - add_reg_note (insn, REG_CFA_DEF_CFA, - plus_constant (Pmode, sr.reg, - m->fs.cfa_offset + rounded_size)); - RTX_FRAME_RELATED_P (insn) = 1; + if (n >= 4) + return 0; + addends[n++] = XEXP (op, 1); + op = XEXP (op, 0); } + while (GET_CODE (op) == PLUS); + if (n >= 4) + return 0; + addends[n] = op; - /* Step 3: the loop. */ - rtx size_rtx = GEN_INT (rounded_size); - insn = emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, - size_rtx)); - if (m->fs.cfa_reg == stack_pointer_rtx) + for (i = n; i >= 0; --i) { - m->fs.cfa_offset += rounded_size; - add_reg_note (insn, REG_CFA_DEF_CFA, - plus_constant (Pmode, stack_pointer_rtx, - m->fs.cfa_offset)); - RTX_FRAME_RELATED_P (insn) = 1; - } - m->fs.sp_offset += rounded_size; - emit_insn (gen_blockage ()); - - /* Step 4: adjust SP if we cannot assert at compile-time that SIZE - is equal to ROUNDED_SIZE. */ - - if (size != rounded_size) - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (rounded_size - size), -1, - m->fs.cfa_reg == stack_pointer_rtx); - dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size); + op = addends[i]; + switch (GET_CODE (op)) + { + case MULT: + if (index) + return 0; + index = XEXP (op, 0); + scale_rtx = XEXP (op, 1); + break; - /* This does not deallocate the space reserved for the scratch - register. That will be deallocated in the epilogue. */ - release_scratch_register_on_entry (&sr, size, false); - } + case ASHIFT: + if (index) + return 0; + index = XEXP (op, 0); + tmp = XEXP (op, 1); + if (!CONST_INT_P (tmp)) + return 0; + scale = INTVAL (tmp); + if ((unsigned HOST_WIDE_INT) scale > 3) + return 0; + scale = 1 << scale; + break; - /* Make sure nothing is scheduled before we are done. */ - emit_insn (gen_blockage ()); -} + case ZERO_EXTEND: + op = XEXP (op, 0); + if (GET_CODE (op) != UNSPEC) + return 0; + /* FALLTHRU */ -/* Emit code to adjust the stack pointer by SIZE bytes while probing it. + case UNSPEC: + if (XINT (op, 1) == UNSPEC_TP + && TARGET_TLS_DIRECT_SEG_REFS + && seg == ADDR_SPACE_GENERIC) + seg = DEFAULT_TLS_SEG_REG; + else + return 0; + break; - INT_REGISTERS_SAVED is true if integer registers have already been - pushed on the stack. */ + case SUBREG: + if (!REG_P (SUBREG_REG (op))) + return 0; + /* FALLTHRU */ -static void -ix86_adjust_stack_and_probe (HOST_WIDE_INT size, - const bool int_registers_saved) -{ - /* We skip the probe for the first interval + a small dope of 4 words and - probe that many bytes past the specified size to maintain a protection - area at the botton of the stack. */ - const int dope = 4 * UNITS_PER_WORD; - rtx size_rtx = GEN_INT (size), last; + case REG: + if (!base) + base = op; + else if (!index) + index = op; + else + return 0; + break; - /* See if we have a constant small number of probes to generate. If so, - that's the easy case. 
The run-time loop is made up of 9 insns in the - generic case while the compile-time loop is made up of 3+2*(n-1) insns - for n # of intervals. */ - if (size <= 4 * get_probe_interval ()) - { - HOST_WIDE_INT i, adjust; - bool first_probe = true; + case CONST: + case CONST_INT: + case SYMBOL_REF: + case LABEL_REF: + if (disp) + return 0; + disp = op; + break; - /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for - values of N from 1 until it exceeds SIZE. If only one probe is - needed, this will not generate any code. Then adjust and probe - to PROBE_INTERVAL + SIZE. */ - for (i = get_probe_interval (); i < size; i += get_probe_interval ()) - { - if (first_probe) - { - adjust = 2 * get_probe_interval () + dope; - first_probe = false; + default: + return 0; } - else - adjust = get_probe_interval (); - - emit_insn (gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - -adjust))); - emit_stack_probe (stack_pointer_rtx); } + } + else if (GET_CODE (addr) == MULT) + { + index = XEXP (addr, 0); /* index*scale */ + scale_rtx = XEXP (addr, 1); + } + else if (GET_CODE (addr) == ASHIFT) + { + /* We're called for lea too, which implements ashift on occasion. */ + index = XEXP (addr, 0); + tmp = XEXP (addr, 1); + if (!CONST_INT_P (tmp)) + return 0; + scale = INTVAL (tmp); + if ((unsigned HOST_WIDE_INT) scale > 3) + return 0; + scale = 1 << scale; + retval = -1; + } + else + disp = addr; /* displacement */ - if (first_probe) - adjust = size + get_probe_interval () + dope; + if (index) + { + if (REG_P (index)) + ; + else if (SUBREG_P (index) + && REG_P (SUBREG_REG (index))) + ; else - adjust = size + get_probe_interval () - i; - - emit_insn (gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - -adjust))); - emit_stack_probe (stack_pointer_rtx); - - /* Adjust back to account for the additional first interval. */ - last = emit_insn (gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - (get_probe_interval () - + dope)))); + return 0; } - /* Otherwise, do the same as above, but in a loop. Note that we must be - extra careful with variables wrapping around because we might be at - the very top (or the very bottom) of the address space and we have - to be able to handle this case properly; in particular, we use an - equality test for the loop condition. */ - else + /* Extract the integral value of scale. */ + if (scale_rtx) { - /* We expect the GP registers to be saved when probes are used - as the probing sequences might need a scratch register and - the routine to allocate one assumes the integer registers - have already been saved. */ - gcc_assert (int_registers_saved); - - HOST_WIDE_INT rounded_size; - struct scratch_reg sr; + if (!CONST_INT_P (scale_rtx)) + return 0; + scale = INTVAL (scale_rtx); + } - get_scratch_register_on_entry (&sr); + base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base; + index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index; - /* If we needed to save a register, then account for any space - that was pushed (we are not going to pop the register when - we do the restore). */ - if (sr.saved) - size -= UNITS_PER_WORD; + /* Avoid useless 0 displacement. */ + if (disp == const0_rtx && (base || index)) + disp = NULL_RTX; - /* Step 1: round SIZE to the previous multiple of the interval. */ + /* Allow arg pointer and stack pointer as index if there is not scaling. 
*/ + if (base_reg && index_reg && scale == 1 + && (REGNO (index_reg) == ARG_POINTER_REGNUM + || REGNO (index_reg) == FRAME_POINTER_REGNUM + || REGNO (index_reg) == SP_REG)) + { + std::swap (base, index); + std::swap (base_reg, index_reg); + } - rounded_size = ROUND_DOWN (size, get_probe_interval ()); + /* Special case: %ebp cannot be encoded as a base without a displacement. + Similarly %r13. */ + if (!disp && base_reg + && (REGNO (base_reg) == ARG_POINTER_REGNUM + || REGNO (base_reg) == FRAME_POINTER_REGNUM + || REGNO (base_reg) == BP_REG + || REGNO (base_reg) == R13_REG)) + disp = const0_rtx; + /* Special case: on K6, [%esi] makes the instruction vector decoded. + Avoid this by transforming to [%esi+0]. + Reload calls address legitimization without cfun defined, so we need + to test cfun for being non-NULL. */ + if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun) + && base_reg && !index_reg && !disp + && REGNO (base_reg) == SI_REG) + disp = const0_rtx; - /* Step 2: compute initial and final value of the loop counter. */ + /* Special case: encode reg+reg instead of reg*2. */ + if (!base && index && scale == 2) + base = index, base_reg = index_reg, scale = 1; - /* SP = SP_0 + PROBE_INTERVAL. */ - emit_insn (gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - - (get_probe_interval () + dope)))); + /* Special case: scaling cannot be encoded without base or displacement. */ + if (!base && !disp && index && scale != 1) + disp = const0_rtx; - /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */ - if (rounded_size <= (HOST_WIDE_INT_1 << 31)) - emit_insn (gen_rtx_SET (sr.reg, - plus_constant (Pmode, stack_pointer_rtx, - -rounded_size))); - else - { - emit_move_insn (sr.reg, GEN_INT (-rounded_size)); - emit_insn (gen_rtx_SET (sr.reg, - gen_rtx_PLUS (Pmode, sr.reg, - stack_pointer_rtx))); - } + out->base = base; + out->index = index; + out->disp = disp; + out->scale = scale; + out->seg = seg; + return retval; +} + +/* Return cost of the memory address x. + For i386, it is better to use a complex address than let gcc copy + the address into a reg and make a new pseudo. But not if the address + requires to two regs - that would mean more pseudos with longer + lifetimes. */ +static int +ix86_address_cost (rtx x, machine_mode, addr_space_t, bool) +{ + struct ix86_address parts; + int cost = 1; + int ok = ix86_decompose_address (x, &parts); - /* Step 3: the loop + gcc_assert (ok); - do - { - SP = SP + PROBE_INTERVAL - probe at SP - } - while (SP != LAST_ADDR) + if (parts.base && SUBREG_P (parts.base)) + parts.base = SUBREG_REG (parts.base); + if (parts.index && SUBREG_P (parts.index)) + parts.index = SUBREG_REG (parts.index); - adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for - values of N from 1 until it is equal to ROUNDED_SIZE. */ + /* Attempt to minimize number of registers in the address by increasing + address cost for each used register. We don't increase address cost + for "pic_offset_table_rtx". When a memopt with "pic_offset_table_rtx" + is not invariant itself it most likely means that base or index is not + invariant. Therefore only "pic_offset_table_rtx" could be hoisted out, + which is not profitable for x86. 
*/ + if (parts.base + && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER) + && (current_pass->type == GIMPLE_PASS + || !pic_offset_table_rtx + || !REG_P (parts.base) + || REGNO (pic_offset_table_rtx) != REGNO (parts.base))) + cost++; - emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx)); + if (parts.index + && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER) + && (current_pass->type == GIMPLE_PASS + || !pic_offset_table_rtx + || !REG_P (parts.index) + || REGNO (pic_offset_table_rtx) != REGNO (parts.index))) + cost++; + /* AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b, + since its predecode logic can't detect the length of instructions + and it degenerates to vector decoding. Increase cost of such + addresses here. The penalty is at least 2 cycles. It may be worthwhile + to split such addresses or even refuse such addresses at all. - /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot - assert at compile-time that SIZE is equal to ROUNDED_SIZE. */ + The following addressing modes are affected: + [base+scale*index] + [scale*index+disp] + [base+index] - if (size != rounded_size) - { - emit_insn (gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - rounded_size - size))); - emit_stack_probe (stack_pointer_rtx); - } + The first and last case may be avoidable by explicitly coding the zero in + the memory address, but I don't have an AMD-K6 machine handy to check this + theory. */ - /* Adjust back to account for the additional first interval. */ - last = emit_insn (gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - (get_probe_interval () - + dope)))); + if (TARGET_K6 + && ((!parts.disp && parts.base && parts.index && parts.scale != 1) + || (parts.disp && !parts.base && parts.index && parts.scale != 1) + || (!parts.disp && parts.base && parts.index && parts.scale == 1))) + cost += 10; - /* This does not deallocate the space reserved for the scratch - register. That will be deallocated in the epilogue. */ - release_scratch_register_on_entry (&sr, size, false); - } + return cost; +} + +/* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as + this is used to form addresses to local data when -fPIC is in + use. */ - /* Even if the stack pointer isn't the CFA register, we need to correctly - describe the adjustments made to it, in particular differentiate the - frame-related ones from the frame-unrelated ones. */ - if (size > 0) - { - rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2)); - XVECEXP (expr, 0, 0) - = gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, -size)); - XVECEXP (expr, 0, 1) - = gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - get_probe_interval () + dope + size)); - add_reg_note (last, REG_FRAME_RELATED_EXPR, expr); - RTX_FRAME_RELATED_P (last) = 1; +static bool +darwin_local_data_pic (rtx disp) +{ + return (GET_CODE (disp) == UNSPEC + && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET); +} - cfun->machine->fs.sp_offset += size; - } +/* True if operand X should be loaded from GOT. */ - /* Make sure nothing is scheduled before we are done. 
*/ - emit_insn (gen_blockage ()); +bool +ix86_force_load_from_GOT_p (rtx x) +{ + return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X) + && !TARGET_PECOFF && !TARGET_MACHO + && !flag_pic + && ix86_cmodel != CM_LARGE + && GET_CODE (x) == SYMBOL_REF + && SYMBOL_REF_FUNCTION_P (x) + && (!flag_plt + || (SYMBOL_REF_DECL (x) + && lookup_attribute ("noplt", + DECL_ATTRIBUTES (SYMBOL_REF_DECL (x))))) + && !SYMBOL_REF_LOCAL_P (x)); } -/* Adjust the stack pointer up to REG while probing it. */ +/* Determine if a given RTX is a valid constant. We already know this + satisfies CONSTANT_P. */ -const char * -output_adjust_stack_and_probe (rtx reg) +static bool +ix86_legitimate_constant_p (machine_mode mode, rtx x) { - static int labelno = 0; - char loop_lab[32]; - rtx xops[2]; + switch (GET_CODE (x)) + { + case CONST: + x = XEXP (x, 0); - ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++); + if (GET_CODE (x) == PLUS) + { + if (!CONST_INT_P (XEXP (x, 1))) + return false; + x = XEXP (x, 0); + } - /* Loop. */ - ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab); + if (TARGET_MACHO && darwin_local_data_pic (x)) + return true; - /* SP = SP + PROBE_INTERVAL. */ - xops[0] = stack_pointer_rtx; - xops[1] = GEN_INT (get_probe_interval ()); - output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops); + /* Only some unspecs are valid as "constants". */ + if (GET_CODE (x) == UNSPEC) + switch (XINT (x, 1)) + { + case UNSPEC_GOT: + case UNSPEC_GOTOFF: + case UNSPEC_PLTOFF: + return TARGET_64BIT; + case UNSPEC_TPOFF: + case UNSPEC_NTPOFF: + x = XVECEXP (x, 0, 0); + return (GET_CODE (x) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC); + case UNSPEC_DTPOFF: + x = XVECEXP (x, 0, 0); + return (GET_CODE (x) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC); + default: + return false; + } - /* Probe at SP. */ - xops[1] = const0_rtx; - output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops); + /* We must have drilled down to a symbol. */ + if (GET_CODE (x) == LABEL_REF) + return true; + if (GET_CODE (x) != SYMBOL_REF) + return false; + /* FALLTHRU */ - /* Test if SP == LAST_ADDR. */ - xops[0] = stack_pointer_rtx; - xops[1] = reg; - output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops); + case SYMBOL_REF: + /* TLS symbols are never valid. */ + if (SYMBOL_REF_TLS_MODEL (x)) + return false; - /* Branch. */ - fputs ("\tjne\t", asm_out_file); - assemble_name_raw (asm_out_file, loop_lab); - fputc ('\n', asm_out_file); + /* DLLIMPORT symbols are never valid. */ + if (TARGET_DLLIMPORT_DECL_ATTRIBUTES + && SYMBOL_REF_DLLIMPORT_P (x)) + return false; - return ""; -} +#if TARGET_MACHO + /* mdynamic-no-pic */ + if (MACHO_DYNAMIC_NO_PIC_P) + return machopic_symbol_defined_p (x); +#endif -/* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE, - inclusive. These are offsets from the current stack pointer. + /* External function address should be loaded + via the GOT slot to avoid PLT. */ + if (ix86_force_load_from_GOT_p (x)) + return false; - INT_REGISTERS_SAVED is true if integer registers have already been - pushed on the stack. */ + break; -static void -ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size, - const bool int_registers_saved) -{ - /* See if we have a constant small number of probes to generate. If so, - that's the easy case. The run-time loop is made up of 6 insns in the - generic case while the compile-time loop is made up of n insns for n # - of intervals. 
*/ - if (size <= 6 * get_probe_interval ()) - { - HOST_WIDE_INT i; + CASE_CONST_SCALAR_INT: + switch (mode) + { + case E_TImode: + if (TARGET_64BIT) + return true; + /* FALLTHRU */ + case E_OImode: + case E_XImode: + if (!standard_sse_constant_p (x, mode)) + return false; + default: + break; + } + break; - /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until - it exceeds SIZE. If only one probe is needed, this will not - generate any code. Then probe at FIRST + SIZE. */ - for (i = get_probe_interval (); i < size; i += get_probe_interval ()) - emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, - -(first + i))); + case CONST_VECTOR: + if (!standard_sse_constant_p (x, mode)) + return false; - emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, - -(first + size))); + default: + break; } - /* Otherwise, do the same as above, but in a loop. Note that we must be - extra careful with variables wrapping around because we might be at - the very top (or the very bottom) of the address space and we have - to be able to handle this case properly; in particular, we use an - equality test for the loop condition. */ - else - { - /* We expect the GP registers to be saved when probes are used - as the probing sequences might need a scratch register and - the routine to allocate one assumes the integer registers - have already been saved. */ - gcc_assert (int_registers_saved); - - HOST_WIDE_INT rounded_size, last; - struct scratch_reg sr; - - get_scratch_register_on_entry (&sr); - - - /* Step 1: round SIZE to the previous multiple of the interval. */ + /* Otherwise we handle everything else in the move patterns. */ + return true; +} - rounded_size = ROUND_DOWN (size, get_probe_interval ()); +/* Determine if it's legal to put X into the constant pool. This + is not possible for the address of thread-local symbols, which + is checked above. */ +static bool +ix86_cannot_force_const_mem (machine_mode mode, rtx x) +{ + /* We can put any immediate constant in memory. */ + switch (GET_CODE (x)) + { + CASE_CONST_ANY: + return false; - /* Step 2: compute initial and final value of the loop counter. */ + default: + break; + } - /* TEST_OFFSET = FIRST. */ - emit_move_insn (sr.reg, GEN_INT (-first)); + return !ix86_legitimate_constant_p (mode, x); +} - /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */ - last = first + rounded_size; +/* Nonzero if the symbol is marked as dllimport, or as stub-variable, + otherwise zero. */ +static bool +is_imported_p (rtx x) +{ + if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES + || GET_CODE (x) != SYMBOL_REF) + return false; - /* Step 3: the loop + return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x); +} - do - { - TEST_ADDR = TEST_ADDR + PROBE_INTERVAL - probe at TEST_ADDR - } - while (TEST_ADDR != LAST_ADDR) - probes at FIRST + N * PROBE_INTERVAL for values of N from 1 - until it is equal to ROUNDED_SIZE. */ +/* Nonzero if the constant value X is a legitimate general operand + when generating PIC code. It is given that flag_pic is on and + that X satisfies CONSTANT_P. */ - emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last))); +bool +legitimate_pic_operand_p (rtx x) +{ + rtx inner; + switch (GET_CODE (x)) + { + case CONST: + inner = XEXP (x, 0); + if (GET_CODE (inner) == PLUS + && CONST_INT_P (XEXP (inner, 1))) + inner = XEXP (inner, 0); - /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time - that SIZE is equal to ROUNDED_SIZE. */ + /* Only some unspecs are valid as "constants". 
*/ + if (GET_CODE (inner) == UNSPEC) + switch (XINT (inner, 1)) + { + case UNSPEC_GOT: + case UNSPEC_GOTOFF: + case UNSPEC_PLTOFF: + return TARGET_64BIT; + case UNSPEC_TPOFF: + x = XVECEXP (inner, 0, 0); + return (GET_CODE (x) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC); + case UNSPEC_MACHOPIC_OFFSET: + return legitimate_pic_address_disp_p (x); + default: + return false; + } + /* FALLTHRU */ - if (size != rounded_size) - emit_stack_probe (plus_constant (Pmode, - gen_rtx_PLUS (Pmode, - stack_pointer_rtx, - sr.reg), - rounded_size - size)); + case SYMBOL_REF: + case LABEL_REF: + return legitimate_pic_address_disp_p (x); - release_scratch_register_on_entry (&sr, size, true); + default: + return true; } - - /* Make sure nothing is scheduled before we are done. */ - emit_insn (gen_blockage ()); } -/* Probe a range of stack addresses from REG to END, inclusive. These are - offsets from the current stack pointer. */ +/* Determine if a given CONST RTX is a valid memory displacement + in PIC mode. */ -const char * -output_probe_stack_range (rtx reg, rtx end) +bool +legitimate_pic_address_disp_p (rtx disp) { - static int labelno = 0; - char loop_lab[32]; - rtx xops[3]; - - ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++); + bool saw_plus; - /* Loop. */ - ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab); + /* In 64bit mode we can allow direct addresses of symbols and labels + when they are not dynamic symbols. */ + if (TARGET_64BIT) + { + rtx op0 = disp, op1; - /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */ - xops[0] = reg; - xops[1] = GEN_INT (get_probe_interval ()); - output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops); + switch (GET_CODE (disp)) + { + case LABEL_REF: + return true; - /* Probe at TEST_ADDR. */ - xops[0] = stack_pointer_rtx; - xops[1] = reg; - xops[2] = const0_rtx; - output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops); + case CONST: + if (GET_CODE (XEXP (disp, 0)) != PLUS) + break; + op0 = XEXP (XEXP (disp, 0), 0); + op1 = XEXP (XEXP (disp, 0), 1); + if (!CONST_INT_P (op1)) + break; + if (GET_CODE (op0) == UNSPEC + && (XINT (op0, 1) == UNSPEC_DTPOFF + || XINT (op0, 1) == UNSPEC_NTPOFF) + && trunc_int_for_mode (INTVAL (op1), SImode) == INTVAL (op1)) + return true; + if (INTVAL (op1) >= 16*1024*1024 + || INTVAL (op1) < -16*1024*1024) + break; + if (GET_CODE (op0) == LABEL_REF) + return true; + if (GET_CODE (op0) == CONST + && GET_CODE (XEXP (op0, 0)) == UNSPEC + && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL) + return true; + if (GET_CODE (op0) == UNSPEC + && XINT (op0, 1) == UNSPEC_PCREL) + return true; + if (GET_CODE (op0) != SYMBOL_REF) + break; + /* FALLTHRU */ - /* Test if TEST_ADDR == LAST_ADDR. */ - xops[0] = reg; - xops[1] = end; - output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops); + case SYMBOL_REF: + /* TLS references should always be enclosed in UNSPEC. + The dllimported symbol needs always to be resolved. */ + if (SYMBOL_REF_TLS_MODEL (op0) + || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0))) + return false; - /* Branch. */ - fputs ("\tjne\t", asm_out_file); - assemble_name_raw (asm_out_file, loop_lab); - fputc ('\n', asm_out_file); + if (TARGET_PECOFF) + { + if (is_imported_p (op0)) + return true; - return ""; + if (SYMBOL_REF_FAR_ADDR_P (op0) + || !SYMBOL_REF_LOCAL_P (op0)) + break; + + /* Function-symbols need to be resolved only for + large-model. + For the small-model we don't need to resolve anything + here. 
*/ + if ((ix86_cmodel != CM_LARGE_PIC + && SYMBOL_REF_FUNCTION_P (op0)) + || ix86_cmodel == CM_SMALL_PIC) + return true; + /* Non-external symbols don't need to be resolved for + large, and medium-model. */ + if ((ix86_cmodel == CM_LARGE_PIC + || ix86_cmodel == CM_MEDIUM_PIC) + && !SYMBOL_REF_EXTERNAL_P (op0)) + return true; + } + else if (!SYMBOL_REF_FAR_ADDR_P (op0) + && (SYMBOL_REF_LOCAL_P (op0) + || (HAVE_LD_PIE_COPYRELOC + && flag_pie + && !SYMBOL_REF_WEAK (op0) + && !SYMBOL_REF_FUNCTION_P (op0))) + && ix86_cmodel != CM_LARGE_PIC) + return true; + break; + + default: + break; + } + } + if (GET_CODE (disp) != CONST) + return false; + disp = XEXP (disp, 0); + + if (TARGET_64BIT) + { + /* We are unsafe to allow PLUS expressions. This limit allowed distance + of GOT tables. We should not need these anyway. */ + if (GET_CODE (disp) != UNSPEC + || (XINT (disp, 1) != UNSPEC_GOTPCREL + && XINT (disp, 1) != UNSPEC_GOTOFF + && XINT (disp, 1) != UNSPEC_PCREL + && XINT (disp, 1) != UNSPEC_PLTOFF)) + return false; + + if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF + && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF) + return false; + return true; + } + + saw_plus = false; + if (GET_CODE (disp) == PLUS) + { + if (!CONST_INT_P (XEXP (disp, 1))) + return false; + disp = XEXP (disp, 0); + saw_plus = true; + } + + if (TARGET_MACHO && darwin_local_data_pic (disp)) + return true; + + if (GET_CODE (disp) != UNSPEC) + return false; + + switch (XINT (disp, 1)) + { + case UNSPEC_GOT: + if (saw_plus) + return false; + /* We need to check for both symbols and labels because VxWorks loads + text labels with @GOT rather than @GOTOFF. See gotoff_operand for + details. */ + return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF + || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF); + case UNSPEC_GOTOFF: + /* Refuse GOTOFF in 64bit mode since it is always 64bit when used. + While ABI specify also 32bit relocation but we don't produce it in + small PIC model at all. */ + if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF + || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF) + && !TARGET_64BIT) + return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode); + return false; + case UNSPEC_GOTTPOFF: + case UNSPEC_GOTNTPOFF: + case UNSPEC_INDNTPOFF: + if (saw_plus) + return false; + disp = XVECEXP (disp, 0, 0); + return (GET_CODE (disp) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC); + case UNSPEC_NTPOFF: + disp = XVECEXP (disp, 0, 0); + return (GET_CODE (disp) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC); + case UNSPEC_DTPOFF: + disp = XVECEXP (disp, 0, 0); + return (GET_CODE (disp) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC); + } + + return false; } -/* Return true if stack frame is required. Update STACK_ALIGNMENT - to the largest alignment, in bits, of stack slot used if stack - frame is required and CHECK_STACK_SLOT is true. */ +/* Determine if op is suitable RTX for an address register. + Return naked register if a register or a register subreg is + found, otherwise return NULL_RTX. 
*/ -static bool -ix86_find_max_used_stack_alignment (unsigned int &stack_alignment, - bool check_stack_slot) +static rtx +ix86_validate_address_register (rtx op) { - HARD_REG_SET set_up_by_prologue, prologue_used; - basic_block bb; + machine_mode mode = GET_MODE (op); - CLEAR_HARD_REG_SET (prologue_used); - CLEAR_HARD_REG_SET (set_up_by_prologue); - add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM); - add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM); - add_to_hard_reg_set (&set_up_by_prologue, Pmode, - HARD_FRAME_POINTER_REGNUM); + /* Only SImode or DImode registers can form the address. */ + if (mode != SImode && mode != DImode) + return NULL_RTX; - /* The preferred stack alignment is the minimum stack alignment. */ - if (stack_alignment > crtl->preferred_stack_boundary) - stack_alignment = crtl->preferred_stack_boundary; + if (REG_P (op)) + return op; + else if (SUBREG_P (op)) + { + rtx reg = SUBREG_REG (op); - bool require_stack_frame = false; + if (!REG_P (reg)) + return NULL_RTX; - FOR_EACH_BB_FN (bb, cfun) - { - rtx_insn *insn; - FOR_BB_INSNS (bb, insn) - if (NONDEBUG_INSN_P (insn) - && requires_stack_frame_p (insn, prologue_used, - set_up_by_prologue)) - { - require_stack_frame = true; + mode = GET_MODE (reg); - if (check_stack_slot) - { - /* Find the maximum stack alignment. */ - subrtx_iterator::array_type array; - FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL) - if (MEM_P (*iter) - && (reg_mentioned_p (stack_pointer_rtx, - *iter) - || reg_mentioned_p (frame_pointer_rtx, - *iter))) - { - unsigned int alignment = MEM_ALIGN (*iter); - if (alignment > stack_alignment) - stack_alignment = alignment; - } - } - } + /* Don't allow SUBREGs that span more than a word. It can + lead to spill failures when the register is one word out + of a two word structure. */ + if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) + return NULL_RTX; + + /* Allow only SUBREGs of non-eliminable hard registers. */ + if (register_no_elim_operand (reg, mode)) + return reg; } - return require_stack_frame; + /* Op is not a register. */ + return NULL_RTX; } -/* Finalize stack_realign_needed and frame_pointer_needed flags, which - will guide prologue/epilogue to be generated in correct form. */ +/* Recognizes RTL expressions that are valid memory addresses for an + instruction. The MODE argument is the machine mode for the MEM + expression that wants to use this address. -static void -ix86_finalize_stack_frame_flags (void) + It only recognizes address in canonical form. LEGITIMIZE_ADDRESS should + convert common non-canonical forms to canonical form so that they will + be recognized. */ + +static bool +ix86_legitimate_address_p (machine_mode, rtx addr, bool strict) { - /* Check if stack realign is really needed after reload, and - stores result in cfun */ - unsigned int incoming_stack_boundary - = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary - ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary); - unsigned int stack_alignment - = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor - ? crtl->max_used_stack_slot_alignment - : crtl->stack_alignment_needed); - unsigned int stack_realign - = (incoming_stack_boundary < stack_alignment); - bool recompute_frame_layout_p = false; + struct ix86_address parts; + rtx base, index, disp; + HOST_WIDE_INT scale; + addr_space_t seg; - if (crtl->stack_realign_finalized) + if (ix86_decompose_address (addr, &parts) <= 0) + /* Decomposition failed. 
*/ + return false; + + base = parts.base; + index = parts.index; + disp = parts.disp; + scale = parts.scale; + seg = parts.seg; + + /* Validate base register. */ + if (base) { - /* After stack_realign_needed is finalized, we can't no longer - change it. */ - gcc_assert (crtl->stack_realign_needed == stack_realign); - return; + rtx reg = ix86_validate_address_register (base); + + if (reg == NULL_RTX) + return false; + + if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg)) + || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg))) + /* Base is not valid. */ + return false; } - /* If the only reason for frame_pointer_needed is that we conservatively - assumed stack realignment might be needed or -fno-omit-frame-pointer - is used, but in the end nothing that needed the stack alignment had - been spilled nor stack access, clear frame_pointer_needed and say we - don't need stack realignment. */ - if ((stack_realign || (!flag_omit_frame_pointer && optimize)) - && frame_pointer_needed - && crtl->is_leaf - && crtl->sp_is_unchanging - && !ix86_current_function_calls_tls_descriptor - && !crtl->accesses_prior_frames - && !cfun->calls_alloca - && !crtl->calls_eh_return - /* See ira_setup_eliminable_regset for the rationale. */ - && !(STACK_CHECK_MOVING_SP - && flag_stack_check - && flag_exceptions - && cfun->can_throw_non_call_exceptions) - && !ix86_frame_pointer_required () - && get_frame_size () == 0 - && ix86_nsaved_sseregs () == 0 - && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0) + /* Validate index register. */ + if (index) { - if (ix86_find_max_used_stack_alignment (stack_alignment, - stack_realign)) - { - /* Stack frame is required. If stack alignment needed is less - than incoming stack boundary, don't realign stack. */ - stack_realign = incoming_stack_boundary < stack_alignment; - if (!stack_realign) - { - crtl->max_used_stack_slot_alignment - = incoming_stack_boundary; - crtl->stack_alignment_needed - = incoming_stack_boundary; - /* Also update preferred_stack_boundary for leaf - functions. */ - crtl->preferred_stack_boundary - = incoming_stack_boundary; - } - } - else - { - /* If drap has been set, but it actually isn't live at the - start of the function, there is no reason to set it up. */ - if (crtl->drap_reg) - { - basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; - if (! REGNO_REG_SET_P (DF_LR_IN (bb), - REGNO (crtl->drap_reg))) - { - crtl->drap_reg = NULL_RTX; - crtl->need_drap = false; - } - } - else - cfun->machine->no_drap_save_restore = true; - - frame_pointer_needed = false; - stack_realign = false; - crtl->max_used_stack_slot_alignment = incoming_stack_boundary; - crtl->stack_alignment_needed = incoming_stack_boundary; - crtl->stack_alignment_estimated = incoming_stack_boundary; - if (crtl->preferred_stack_boundary > incoming_stack_boundary) - crtl->preferred_stack_boundary = incoming_stack_boundary; - df_finish_pass (true); - df_scan_alloc (NULL); - df_scan_blocks (); - df_compute_regs_ever_live (true); - df_analyze (); - - if (flag_var_tracking) - { - /* Since frame pointer is no longer available, replace it with - stack pointer - UNITS_PER_WORD in debug insns. */ - df_ref ref, next; - for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM); - ref; ref = next) - { - next = DF_REF_NEXT_REG (ref); - if (!DF_REF_INSN_INFO (ref)) - continue; - - /* Make sure the next ref is for a different instruction, - so that we're not affected by the rescan. 
*/ - rtx_insn *insn = DF_REF_INSN (ref); - while (next && DF_REF_INSN (next) == insn) - next = DF_REF_NEXT_REG (next); + rtx reg = ix86_validate_address_register (index); - if (DEBUG_INSN_P (insn)) - { - bool changed = false; - for (; ref != next; ref = DF_REF_NEXT_REG (ref)) - { - rtx *loc = DF_REF_LOC (ref); - if (*loc == hard_frame_pointer_rtx) - { - *loc = plus_constant (Pmode, - stack_pointer_rtx, - -UNITS_PER_WORD); - changed = true; - } - } - if (changed) - df_insn_rescan (insn); - } - } - } + if (reg == NULL_RTX) + return false; - recompute_frame_layout_p = true; - } - } - else if (crtl->max_used_stack_slot_alignment >= 128) - { - /* We don't need to realign stack. max_used_stack_alignment is - used to decide how stack frame should be aligned. This is - independent of any psABIs nor 32-bit vs 64-bit. It is always - safe to compute max_used_stack_alignment. We compute it only - if 128-bit aligned load/store may be generated on misaligned - stack slot which will lead to segfault. */ - if (ix86_find_max_used_stack_alignment (stack_alignment, true)) - cfun->machine->max_used_stack_alignment - = stack_alignment / BITS_PER_UNIT; + if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg)) + || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg))) + /* Index is not valid. */ + return false; } - if (crtl->stack_realign_needed != stack_realign) - recompute_frame_layout_p = true; - crtl->stack_realign_needed = stack_realign; - crtl->stack_realign_finalized = true; - if (recompute_frame_layout_p) - ix86_compute_frame_layout (); -} + /* Index and base should have the same mode. */ + if (base && index + && GET_MODE (base) != GET_MODE (index)) + return false; -/* Delete SET_GOT right after entry block if it is allocated to reg. */ + /* Address override works only on the (%reg) part of %fs:(%reg). */ + if (seg != ADDR_SPACE_GENERIC + && ((base && GET_MODE (base) != word_mode) + || (index && GET_MODE (index) != word_mode))) + return false; -static void -ix86_elim_entry_set_got (rtx reg) -{ - basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb; - rtx_insn *c_insn = BB_HEAD (bb); - if (!NONDEBUG_INSN_P (c_insn)) - c_insn = next_nonnote_nondebug_insn (c_insn); - if (c_insn && NONJUMP_INSN_P (c_insn)) + /* Validate scale factor. */ + if (scale != 1) { - rtx pat = PATTERN (c_insn); - if (GET_CODE (pat) == PARALLEL) - { - rtx vec = XVECEXP (pat, 0, 0); - if (GET_CODE (vec) == SET - && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT - && REGNO (XEXP (vec, 0)) == REGNO (reg)) - delete_insn (c_insn); - } + if (!index) + /* Scale without index. */ + return false; + + if (scale != 2 && scale != 4 && scale != 8) + /* Scale is not a valid multiplier. */ + return false; } -} -static rtx -gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store) -{ - rtx addr, mem; + /* Validate displacement. */ + if (disp) + { + if (GET_CODE (disp) == CONST + && GET_CODE (XEXP (disp, 0)) == UNSPEC + && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET) + switch (XINT (XEXP (disp, 0), 1)) + { + /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit + when used. While ABI specify also 32bit relocations, we + don't produce them at all and use IP relative instead. + Allow GOT in 32bit mode for both PIC and non-PIC if symbol + should be loaded via GOT. 
*/ + case UNSPEC_GOT: + if (!TARGET_64BIT + && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0))) + goto is_legitimate_pic; + /* FALLTHRU */ + case UNSPEC_GOTOFF: + gcc_assert (flag_pic); + if (!TARGET_64BIT) + goto is_legitimate_pic; - if (offset) - addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset)); - mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg); - return gen_rtx_SET (store ? mem : reg, store ? reg : mem); -} + /* 64bit address unspec. */ + return false; -static inline rtx -gen_frame_load (rtx reg, rtx frame_reg, int offset) -{ - return gen_frame_set (reg, frame_reg, offset, false); -} + case UNSPEC_GOTPCREL: + if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0))) + goto is_legitimate_pic; + /* FALLTHRU */ + case UNSPEC_PCREL: + gcc_assert (flag_pic); + goto is_legitimate_pic; -static inline rtx -gen_frame_store (rtx reg, rtx frame_reg, int offset) -{ - return gen_frame_set (reg, frame_reg, offset, true); -} + case UNSPEC_GOTTPOFF: + case UNSPEC_GOTNTPOFF: + case UNSPEC_INDNTPOFF: + case UNSPEC_NTPOFF: + case UNSPEC_DTPOFF: + break; -static void -ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame) -{ - struct machine_function *m = cfun->machine; - const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS - + m->call_ms2sysv_extra_regs; - rtvec v = rtvec_alloc (ncregs + 1); - unsigned int align, i, vi = 0; - rtx_insn *insn; - rtx sym, addr; - rtx rax = gen_rtx_REG (word_mode, AX_REG); - const struct xlogue_layout &xlogue = xlogue_layout::get_instance (); + default: + /* Invalid address unspec. */ + return false; + } - /* AL should only be live with sysv_abi. */ - gcc_assert (!ix86_eax_live_at_start_p ()); - gcc_assert (m->fs.sp_offset >= frame.sse_reg_save_offset); + else if (SYMBOLIC_CONST (disp) + && (flag_pic + || (TARGET_MACHO +#if TARGET_MACHO + && MACHOPIC_INDIRECT + && !machopic_operand_p (disp) +#endif + ))) + { - /* Setup RAX as the stub's base pointer. We use stack_realign_offset rather - we've actually realigned the stack or not. */ - align = GET_MODE_ALIGNMENT (V4SFmode); - addr = choose_baseaddr (frame.stack_realign_offset - + xlogue.get_stub_ptr_offset (), &align, AX_REG); - gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode)); + is_legitimate_pic: + if (TARGET_64BIT && (index || base)) + { + /* foo@dtpoff(%rX) is ok. */ + if (GET_CODE (disp) != CONST + || GET_CODE (XEXP (disp, 0)) != PLUS + || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC + || !CONST_INT_P (XEXP (XEXP (disp, 0), 1)) + || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF + && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF)) + /* Non-constant pic memory reference. */ + return false; + } + else if ((!TARGET_MACHO || flag_pic) + && ! legitimate_pic_address_disp_p (disp)) + /* Displacement is an invalid pic construct. */ + return false; +#if TARGET_MACHO + else if (MACHO_DYNAMIC_NO_PIC_P + && !ix86_legitimate_constant_p (Pmode, disp)) + /* displacment must be referenced via non_lazy_pointer */ + return false; +#endif - emit_insn (gen_rtx_SET (rax, addr)); + /* This code used to verify that a symbolic pic displacement + includes the pic_offset_table_rtx register. - /* Get the stub symbol. */ - sym = xlogue.get_stub_rtx (frame_pointer_needed ? 
XLOGUE_STUB_SAVE_HFP - : XLOGUE_STUB_SAVE); - RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym); + While this is good idea, unfortunately these constructs may + be created by "adds using lea" optimization for incorrect + code like: - for (i = 0; i < ncregs; ++i) - { - const xlogue_layout::reginfo &r = xlogue.get_reginfo (i); - rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode), - r.regno); - RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset); - } + int a; + int foo(int i) + { + return *(&a+i); + } - gcc_assert (vi == (unsigned)GET_NUM_ELEM (v)); + This code is nonsensical, but results in addressing + GOT table with pic_offset_table_rtx base. We can't + just refuse it easily, since it gets matched by + "addsi3" pattern, that later gets split to lea in the + case output register differs from input. While this + can be handled by separate addsi pattern for this case + that never results in lea, this seems to be easier and + correct fix for crash to disable this test. */ + } + else if (GET_CODE (disp) != LABEL_REF + && !CONST_INT_P (disp) + && (GET_CODE (disp) != CONST + || !ix86_legitimate_constant_p (Pmode, disp)) + && (GET_CODE (disp) != SYMBOL_REF + || !ix86_legitimate_constant_p (Pmode, disp))) + /* Displacement is not constant. */ + return false; + else if (TARGET_64BIT + && !x86_64_immediate_operand (disp, VOIDmode)) + /* Displacement is out of range. */ + return false; + /* In x32 mode, constant addresses are sign extended to 64bit, so + we have to prevent addresses from 0x80000000 to 0xffffffff. */ + else if (TARGET_X32 && !(index || base) + && CONST_INT_P (disp) + && val_signbit_known_set_p (SImode, INTVAL (disp))) + return false; + } - insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v)); - RTX_FRAME_RELATED_P (insn) = true; + /* Everything looks valid. */ + return true; } -/* Expand the prologue into a bunch of separate insns. */ +/* Determine if a given RTX is a valid constant address. */ -void -ix86_expand_prologue (void) +bool +constant_address_p (rtx x) { - struct machine_function *m = cfun->machine; - rtx insn, t; - HOST_WIDE_INT allocate; - bool int_registers_saved; - bool sse_registers_saved; - bool save_stub_call_needed; - rtx static_chain = NULL_RTX; + return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1); +} + +/* Return a unique alias set for the GOT. */ - if (ix86_function_naked (current_function_decl)) - return; +alias_set_type +ix86_GOT_alias_set (void) +{ + static alias_set_type set = -1; + if (set == -1) + set = new_alias_set (); + return set; +} - ix86_finalize_stack_frame_flags (); +/* Return a legitimate reference for ORIG (an address) using the + register REG. If REG is 0, a new pseudo is generated. - /* DRAP should not coexist with stack_realign_fp */ - gcc_assert (!(crtl->drap_reg && stack_realign_fp)); + There are two types of references that must be handled: - memset (&m->fs, 0, sizeof (m->fs)); + 1. Global data references must load the address from the GOT, via + the PIC reg. An insn is emitted to do this load, and the reg is + returned. - /* Initialize CFA state for before the prologue. */ - m->fs.cfa_reg = stack_pointer_rtx; - m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET; + 2. Static data references, constant pool addresses, and code labels + compute the address as an offset from the GOT, whose base is in + the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to + differentiate them from global data objects. The returned + address is the PIC reg + an unspec constant. - /* Track SP offset to the CFA. 
We continue tracking this after we've - swapped the CFA register away from SP. In the case of re-alignment - this is fudged; we're interested to offsets within the local frame. */ - m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET; - m->fs.sp_valid = true; - m->fs.sp_realigned = false; + TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC + reg also appears in the address. */ - const struct ix86_frame &frame = cfun->machine->frame; +rtx +legitimize_pic_address (rtx orig, rtx reg) +{ + rtx addr = orig; + rtx new_rtx = orig; - if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl)) +#if TARGET_MACHO + if (TARGET_MACHO && !TARGET_64BIT) { - /* We should have already generated an error for any use of - ms_hook on a nested function. */ - gcc_checking_assert (!ix86_static_chain_on_stack); - - /* Check if profiling is active and we shall use profiling before - prologue variant. If so sorry. */ - if (crtl->profile && flag_fentry != 0) - sorry ("ms_hook_prologue attribute isn%'t compatible " - "with %<-mfentry%> for 32-bit"); - - /* In ix86_asm_output_function_label we emitted: - 8b ff movl.s %edi,%edi - 55 push %ebp - 8b ec movl.s %esp,%ebp - - This matches the hookable function prologue in Win32 API - functions in Microsoft Windows XP Service Pack 2 and newer. - Wine uses this to enable Windows apps to hook the Win32 API - functions provided by Wine. - - What that means is that we've already set up the frame pointer. */ - - if (frame_pointer_needed - && !(crtl->drap_reg && crtl->stack_realign_needed)) - { - rtx push, mov; - - /* We've decided to use the frame pointer already set up. - Describe this to the unwinder by pretending that both - push and mov insns happen right here. + if (reg == 0) + reg = gen_reg_rtx (Pmode); + /* Use the generic Mach-O PIC machinery. */ + return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg); + } +#endif - Putting the unwind info here at the end of the ms_hook - is done so that we can make absolutely certain we get - the required byte sequence at the start of the function, - rather than relying on an assembler that can produce - the exact encoding required. + if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES) + { + rtx tmp = legitimize_pe_coff_symbol (addr, true); + if (tmp) + return tmp; + } - However it does mean (in the unpatched case) that we have - a 1 insn window where the asynchronous unwind info is - incorrect. However, if we placed the unwind info at - its correct location we would have incorrect unwind info - in the patched case. Which is probably all moot since - I don't expect Wine generates dwarf2 unwind info for the - system libraries that use this feature. */ + if (TARGET_64BIT && legitimate_pic_address_disp_p (addr)) + new_rtx = addr; + else if ((!TARGET_64BIT + || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC) + && !TARGET_PECOFF + && gotoff_operand (addr, Pmode)) + { + /* This symbol may be referenced via a displacement + from the PIC base address (@GOTOFF). 
*/ + if (GET_CODE (addr) == CONST) + addr = XEXP (addr, 0); - insn = emit_insn (gen_blockage ()); + if (GET_CODE (addr) == PLUS) + { + new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)), + UNSPEC_GOTOFF); + new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1)); + } + else + new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF); - push = gen_push (hard_frame_pointer_rtx); - mov = gen_rtx_SET (hard_frame_pointer_rtx, - stack_pointer_rtx); - RTX_FRAME_RELATED_P (push) = 1; - RTX_FRAME_RELATED_P (mov) = 1; + new_rtx = gen_rtx_CONST (Pmode, new_rtx); - RTX_FRAME_RELATED_P (insn) = 1; - add_reg_note (insn, REG_FRAME_RELATED_EXPR, - gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov))); + if (TARGET_64BIT) + new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode); - /* Note that gen_push incremented m->fs.cfa_offset, even - though we didn't emit the push insn here. */ - m->fs.cfa_reg = hard_frame_pointer_rtx; - m->fs.fp_offset = m->fs.cfa_offset; - m->fs.fp_valid = true; - } - else + if (reg != 0) { - /* The frame pointer is not needed so pop %ebp again. - This leaves us with a pristine state. */ - emit_insn (gen_pop (hard_frame_pointer_rtx)); - } + gcc_assert (REG_P (reg)); + new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx, + new_rtx, reg, 1, OPTAB_DIRECT); + } + else + new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx); } - - /* The first insn of a function that accepts its static chain on the - stack is to push the register that would be filled in by a direct - call. This insn will be skipped by the trampoline. */ - else if (ix86_static_chain_on_stack) + else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0) + /* We can't use @GOTOFF for text labels + on VxWorks, see gotoff_operand. */ + || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF)) { - static_chain = ix86_static_chain (cfun->decl, false); - insn = emit_insn (gen_push (static_chain)); - emit_insn (gen_blockage ()); + rtx tmp = legitimize_pe_coff_symbol (addr, true); + if (tmp) + return tmp; - /* We don't want to interpret this push insn as a register save, - only as a stack adjustment. The real copy of the register as - a save will be done later, if needed. */ - t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD); - t = gen_rtx_SET (stack_pointer_rtx, t); - add_reg_note (insn, REG_CFA_ADJUST_CFA, t); - RTX_FRAME_RELATED_P (insn) = 1; - } + /* For x64 PE-COFF there is no GOT table, + so we use address directly. */ + if (TARGET_64BIT && TARGET_PECOFF) + { + new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL); + new_rtx = gen_rtx_CONST (Pmode, new_rtx); + } + else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC) + { + new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), + UNSPEC_GOTPCREL); + new_rtx = gen_rtx_CONST (Pmode, new_rtx); + new_rtx = gen_const_mem (Pmode, new_rtx); + set_mem_alias_set (new_rtx, ix86_GOT_alias_set ()); + } + else + { + /* This symbol must be referenced via a load + from the Global Offset Table (@GOT). 
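To make the two flavours of PIC reference handled here concrete, a small illustrative C unit follows (file and symbol names are invented, and the exact registers depend on options); it shows the ia32 -fPIC accesses that the @GOTOFF and @GOT paths correspond to.

    /* pic-ref-sketch.c -- illustrative only; compile with -m32 -fPIC -O2.  */
    static int local_counter;    /* binds locally: PIC base + @GOTOFF offset.  */
    extern int shared_counter;   /* may be preempted: address loaded from the GOT.  */

    int
    sum_counters (void)
    {
      /* Typical generated code, assuming %ebx holds the PIC register:
           movl  local_counter@GOTOFF(%ebx), %eax   # UNSPEC_GOTOFF: base + constant
           movl  shared_counter@GOT(%ebx), %edx     # UNSPEC_GOT: load the address...
           addl  (%edx), %eax                       # ...then dereference it
         The first access matches the UNSPEC_GOTOFF construction above; the
         second matches the UNSPEC_GOT path that wraps the address in
         gen_const_mem and tags it with the GOT alias set.  */
      return local_counter + shared_counter;
    }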
*/ + new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT); + new_rtx = gen_rtx_CONST (Pmode, new_rtx); + if (TARGET_64BIT) + new_rtx = force_reg (Pmode, new_rtx); + new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx); + new_rtx = gen_const_mem (Pmode, new_rtx); + set_mem_alias_set (new_rtx, ix86_GOT_alias_set ()); + } - /* Emit prologue code to adjust stack alignment and setup DRAP, in case - of DRAP is needed and stack realignment is really needed after reload */ - if (stack_realign_drap) + new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode); + } + else { - int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT; + if (CONST_INT_P (addr) + && !x86_64_immediate_operand (addr, VOIDmode)) + new_rtx = copy_to_suggested_reg (addr, reg, Pmode); + else if (GET_CODE (addr) == CONST) + { + addr = XEXP (addr, 0); - /* Can't use DRAP in interrupt function. */ - if (cfun->machine->func_type != TYPE_NORMAL) - sorry ("Dynamic Realign Argument Pointer (DRAP) not supported " - "in interrupt service routine. This may be worked " - "around by avoiding functions with aggregate return."); + /* We must match stuff we generate before. Assume the only + unspecs that can get here are ours. Not that we could do + anything with them anyway.... */ + if (GET_CODE (addr) == UNSPEC + || (GET_CODE (addr) == PLUS + && GET_CODE (XEXP (addr, 0)) == UNSPEC)) + return orig; + gcc_assert (GET_CODE (addr) == PLUS); + } - /* Only need to push parameter pointer reg if it is caller saved. */ - if (!call_used_regs[REGNO (crtl->drap_reg)]) + if (GET_CODE (addr) == PLUS) { - /* Push arg pointer reg */ - insn = emit_insn (gen_push (crtl->drap_reg)); - RTX_FRAME_RELATED_P (insn) = 1; - } + rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1); - /* Grab the argument pointer. */ - t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset); - insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t)); - RTX_FRAME_RELATED_P (insn) = 1; - m->fs.cfa_reg = crtl->drap_reg; - m->fs.cfa_offset = 0; + /* Check first to see if this is a constant + offset from a @GOTOFF symbol reference. */ + if (!TARGET_PECOFF + && gotoff_operand (op0, Pmode) + && CONST_INT_P (op1)) + { + if (!TARGET_64BIT) + { + new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0), + UNSPEC_GOTOFF); + new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1); + new_rtx = gen_rtx_CONST (Pmode, new_rtx); - /* Align the stack. */ - insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx, - stack_pointer_rtx, - GEN_INT (-align_bytes))); - RTX_FRAME_RELATED_P (insn) = 1; + if (reg != 0) + { + gcc_assert (REG_P (reg)); + new_rtx = expand_simple_binop (Pmode, PLUS, + pic_offset_table_rtx, + new_rtx, reg, 1, + OPTAB_DIRECT); + } + else + new_rtx + = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx); + } + else + { + if (INTVAL (op1) < -16*1024*1024 + || INTVAL (op1) >= 16*1024*1024) + { + if (!x86_64_immediate_operand (op1, Pmode)) + op1 = force_reg (Pmode, op1); - /* Replicate the return address on the stack so that return - address can be reached via (argp - 1) slot. This is needed - to implement macro RETURN_ADDR_RTX and intrinsic function - expand_builtin_return_addr etc. */ - t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD); - t = gen_frame_mem (word_mode, t); - insn = emit_insn (gen_push (t)); - RTX_FRAME_RELATED_P (insn) = 1; + new_rtx + = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1); + } + } + } + else + { + rtx base = legitimize_pic_address (op0, reg); + machine_mode mode = GET_MODE (base); + new_rtx + = legitimize_pic_address (op1, base == reg ? 
NULL_RTX : reg); - /* For the purposes of frame and register save area addressing, - we've started over with a new frame. */ - m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET; - m->fs.realigned = true; + if (CONST_INT_P (new_rtx)) + { + if (INTVAL (new_rtx) < -16*1024*1024 + || INTVAL (new_rtx) >= 16*1024*1024) + { + if (!x86_64_immediate_operand (new_rtx, mode)) + new_rtx = force_reg (mode, new_rtx); - if (static_chain) - { - /* Replicate static chain on the stack so that static chain - can be reached via (argp - 2) slot. This is needed for - nested function with stack realignment. */ - insn = emit_insn (gen_push (static_chain)); - RTX_FRAME_RELATED_P (insn) = 1; + new_rtx + = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx); + } + else + new_rtx = plus_constant (mode, base, INTVAL (new_rtx)); + } + else + { + /* For %rip addressing, we have to use + just disp32, not base nor index. */ + if (TARGET_64BIT + && (GET_CODE (base) == SYMBOL_REF + || GET_CODE (base) == LABEL_REF)) + base = force_reg (mode, base); + if (GET_CODE (new_rtx) == PLUS + && CONSTANT_P (XEXP (new_rtx, 1))) + { + base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0)); + new_rtx = XEXP (new_rtx, 1); + } + new_rtx = gen_rtx_PLUS (mode, base, new_rtx); + } + } } } + return new_rtx; +} + +/* Load the thread pointer. If TO_REG is true, force it into a register. */ - int_registers_saved = (frame.nregs == 0); - sse_registers_saved = (frame.nsseregs == 0); - save_stub_call_needed = (m->call_ms2sysv); - gcc_assert (sse_registers_saved || !save_stub_call_needed); +static rtx +get_thread_pointer (machine_mode tp_mode, bool to_reg) +{ + rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP); - if (frame_pointer_needed && !m->fs.fp_valid) + if (GET_MODE (tp) != tp_mode) { - /* Note: AT&T enter does NOT have reversed args. Enter is probably - slower on all targets. Also sdb didn't like it. */ - insn = emit_insn (gen_push (hard_frame_pointer_rtx)); - RTX_FRAME_RELATED_P (insn) = 1; + gcc_assert (GET_MODE (tp) == SImode); + gcc_assert (tp_mode == DImode); - /* Push registers now, before setting the frame pointer - on SEH target. */ - if (!int_registers_saved - && TARGET_SEH - && !frame.save_regs_using_mov) - { - ix86_emit_save_regs (); - int_registers_saved = true; - gcc_assert (m->fs.sp_offset == frame.reg_save_offset); - } + tp = gen_rtx_ZERO_EXTEND (tp_mode, tp); + } - if (m->fs.sp_offset == frame.hard_frame_pointer_offset) - { - insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx); - RTX_FRAME_RELATED_P (insn) = 1; + if (to_reg) + tp = copy_to_mode_reg (tp_mode, tp); - if (m->fs.cfa_reg == stack_pointer_rtx) - m->fs.cfa_reg = hard_frame_pointer_rtx; - m->fs.fp_offset = m->fs.sp_offset; - m->fs.fp_valid = true; - } - } + return tp; +} - if (!int_registers_saved) +/* Construct the SYMBOL_REF for the tls_get_addr function. */ + +static GTY(()) rtx ix86_tls_symbol; + +static rtx +ix86_tls_get_addr (void) +{ + if (!ix86_tls_symbol) { - /* If saving registers via PUSH, do so now. */ - if (!frame.save_regs_using_mov) - { - ix86_emit_save_regs (); - int_registers_saved = true; - gcc_assert (m->fs.sp_offset == frame.reg_save_offset); - } + const char *sym + = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT) + ? "___tls_get_addr" : "__tls_get_addr"); - /* When using red zone we may start register saving before allocating - the stack frame saving one cycle of the prologue. However, avoid - doing this if we have to probe the stack; at least on x86_64 the - stack probe can turn into a call that clobbers a red zone location. 
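The red zone mentioned just above is worth spelling out: the x86-64 SysV ABI lets a leaf function use the 128 bytes below %rsp without adjusting the stack pointer, so anything that may push or call, such as an out-of-line stack probe, would clobber that area. A minimal illustrative sketch (function name invented):

    /* red-zone-sketch.c -- illustrative only; x86-64 SysV, compile with -O2.  */
    int
    leaf_with_spill (int a, int b)
    {
      /* In a leaf function the slot for TMP can live in the red zone, e.g.
           movl  %edi, -4(%rsp)           # no "subq $N, %rsp" needed
         A stack probe that expands to a call would push a return address
         into the bytes just below %rsp and overwrite such slots, which is
         why register saves through the red zone are skipped in that case.  */
      volatile int tmp = a;               /* volatile forces a real stack slot */
      return tmp + b;
    }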
*/ - else if (ix86_using_red_zone () - && (! TARGET_STACK_PROBE - || frame.stack_pointer_offset < CHECK_STACK_LIMIT)) - { - ix86_emit_save_regs_using_mov (frame.reg_save_offset); - int_registers_saved = true; - } + ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym); } - if (stack_realign_fp) + if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF) { - int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT; - gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT); + rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol), + UNSPEC_PLTOFF); + return gen_rtx_PLUS (Pmode, pic_offset_table_rtx, + gen_rtx_CONST (Pmode, unspec)); + } - /* Record last valid frame pointer offset. */ - m->fs.sp_realigned_fp_last = frame.reg_save_offset; + return ix86_tls_symbol; +} - /* The computation of the size of the re-aligned stack frame means - that we must allocate the size of the register save area before - performing the actual alignment. Otherwise we cannot guarantee - that there's enough storage above the realignment point. */ - allocate = frame.reg_save_offset - m->fs.sp_offset - + frame.stack_realign_allocate; - if (allocate) - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (-allocate), -1, false); +/* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */ - /* Align the stack. */ - insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx, - stack_pointer_rtx, - GEN_INT (-align_bytes))); - m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes); - m->fs.sp_realigned_offset = m->fs.sp_offset - - frame.stack_realign_allocate; - /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset. - Beyond this point, stack access should be done via choose_baseaddr or - by using sp_valid_at and fp_valid_at to determine the correct base - register. Henceforth, any CFA offset should be thought of as logical - and not physical. */ - gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last); - gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset); - m->fs.sp_realigned = true; +static GTY(()) rtx ix86_tls_module_base_symbol; - /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which - is needed to describe where a register is saved using a realigned - stack pointer, so we need to invalidate the stack pointer for that - target. */ - if (TARGET_SEH) - m->fs.sp_valid = false; +rtx +ix86_tls_module_base (void) +{ + if (!ix86_tls_module_base_symbol) + { + ix86_tls_module_base_symbol + = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_"); - /* If SP offset is non-immediate after allocation of the stack frame, - then emit SSE saves or stub call prior to allocating the rest of the - stack frame. This is less efficient for the out-of-line stub because - we can't combine allocations across the call barrier, but it's better - than using a scratch register. */ - else if (!x86_64_immediate_operand (GEN_INT (frame.stack_pointer_offset - - m->fs.sp_realigned_offset), - Pmode)) - { - if (!sse_registers_saved) - { - ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset); - sse_registers_saved = true; - } - else if (save_stub_call_needed) - { - ix86_emit_outlined_ms2sysv_save (frame); - save_stub_call_needed = false; - } - } + SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol) + |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT; } - allocate = frame.stack_pointer_offset - m->fs.sp_offset; + return ix86_tls_module_base_symbol; +} - if (flag_stack_usage_info) - { - /* We start to count from ARG_POINTER. 
*/ - HOST_WIDE_INT stack_size = frame.stack_pointer_offset; +/* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is + false if we expect this to be used for a memory address and true if + we expect to load the address into a register. */ - /* If it was realigned, take into account the fake frame. */ - if (stack_realign_drap) - { - if (ix86_static_chain_on_stack) - stack_size += UNITS_PER_WORD; +rtx +legitimize_tls_address (rtx x, enum tls_model model, bool for_mov) +{ + rtx dest, base, off; + rtx pic = NULL_RTX, tp = NULL_RTX; + machine_mode tp_mode = Pmode; + int type; - if (!call_used_regs[REGNO (crtl->drap_reg)]) - stack_size += UNITS_PER_WORD; + /* Fall back to global dynamic model if tool chain cannot support local + dynamic. */ + if (TARGET_SUN_TLS && !TARGET_64BIT + && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM + && model == TLS_MODEL_LOCAL_DYNAMIC) + model = TLS_MODEL_GLOBAL_DYNAMIC; - /* This over-estimates by 1 minimal-stack-alignment-unit but - mitigates that by counting in the new return address slot. */ - current_function_dynamic_stack_size - += crtl->stack_alignment_needed / BITS_PER_UNIT; - } + switch (model) + { + case TLS_MODEL_GLOBAL_DYNAMIC: + dest = gen_reg_rtx (Pmode); - current_function_static_stack_size = stack_size; - } + if (!TARGET_64BIT) + { + if (flag_pic && !TARGET_PECOFF) + pic = pic_offset_table_rtx; + else + { + pic = gen_reg_rtx (Pmode); + emit_insn (gen_set_got (pic)); + } + } - /* On SEH target with very large frame size, allocate an area to save - SSE registers (as the very large allocation won't be described). */ - if (TARGET_SEH - && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE - && !sse_registers_saved) - { - HOST_WIDE_INT sse_size - = frame.sse_reg_save_offset - frame.reg_save_offset; + if (TARGET_GNU2_TLS) + { + if (TARGET_64BIT) + emit_insn (gen_tls_dynamic_gnu2_64 (dest, x)); + else + emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic)); - gcc_assert (int_registers_saved); + tp = get_thread_pointer (Pmode, true); + dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest)); - /* No need to do stack checking as the area will be immediately - written. */ - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (-sse_size), -1, - m->fs.cfa_reg == stack_pointer_rtx); - allocate -= sse_size; - ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset); - sse_registers_saved = true; - } + if (GET_MODE (x) != Pmode) + x = gen_rtx_ZERO_EXTEND (Pmode, x); - /* The stack has already been decremented by the instruction calling us - so probe if the size is non-negative to preserve the protection area. 
*/ - if (allocate >= 0 - && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK - || flag_stack_clash_protection)) - { - if (flag_stack_clash_protection) - { - ix86_adjust_stack_and_probe_stack_clash (allocate, - int_registers_saved); - allocate = 0; + set_unique_reg_note (get_last_insn (), REG_EQUAL, x); } - else if (STACK_CHECK_MOVING_SP) + else { - if (!(crtl->is_leaf && !cfun->calls_alloca - && allocate <= get_probe_interval ())) + rtx caddr = ix86_tls_get_addr (); + + if (TARGET_64BIT) { - ix86_adjust_stack_and_probe (allocate, int_registers_saved); - allocate = 0; + rtx rax = gen_rtx_REG (Pmode, AX_REG); + rtx_insn *insns; + + start_sequence (); + emit_call_insn + (ix86_gen_tls_global_dynamic_64 (rax, x, caddr)); + insns = get_insns (); + end_sequence (); + + if (GET_MODE (x) != Pmode) + x = gen_rtx_ZERO_EXTEND (Pmode, x); + + RTL_CONST_CALL_P (insns) = 1; + emit_libcall_block (insns, dest, rax, x); } + else + emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr)); } - else - { - HOST_WIDE_INT size = allocate; + break; - if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000)) - size = 0x80000000 - get_stack_check_protect () - 1; + case TLS_MODEL_LOCAL_DYNAMIC: + base = gen_reg_rtx (Pmode); - if (TARGET_STACK_PROBE) - { - if (crtl->is_leaf && !cfun->calls_alloca) - { - if (size > get_probe_interval ()) - ix86_emit_probe_stack_range (0, size, int_registers_saved); - } - else - ix86_emit_probe_stack_range (0, - size + get_stack_check_protect (), - int_registers_saved); - } + if (!TARGET_64BIT) + { + if (flag_pic) + pic = pic_offset_table_rtx; else { - if (crtl->is_leaf && !cfun->calls_alloca) - { - if (size > get_probe_interval () - && size > get_stack_check_protect ()) - ix86_emit_probe_stack_range (get_stack_check_protect (), - (size - - get_stack_check_protect ()), - int_registers_saved); - } - else - ix86_emit_probe_stack_range (get_stack_check_protect (), size, - int_registers_saved); + pic = gen_reg_rtx (Pmode); + emit_insn (gen_set_got (pic)); } } - } - if (allocate == 0) - ; - else if (!ix86_target_stack_probe () - || frame.stack_pointer_offset < CHECK_STACK_LIMIT) - { - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (-allocate), -1, - m->fs.cfa_reg == stack_pointer_rtx); - } - else - { - rtx eax = gen_rtx_REG (Pmode, AX_REG); - rtx r10 = NULL; - rtx (*adjust_stack_insn)(rtx, rtx, rtx); - const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx); - bool eax_live = ix86_eax_live_at_start_p (); - bool r10_live = false; + if (TARGET_GNU2_TLS) + { + rtx tmp = ix86_tls_module_base (); - if (TARGET_64BIT) - r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0); + if (TARGET_64BIT) + emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp)); + else + emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic)); - if (eax_live) - { - insn = emit_insn (gen_push (eax)); - allocate -= UNITS_PER_WORD; - /* Note that SEH directives need to continue tracking the stack - pointer even after the frame pointer has been set up. 
*/ - if (sp_is_cfa_reg || TARGET_SEH) - { - if (sp_is_cfa_reg) - m->fs.cfa_offset += UNITS_PER_WORD; - RTX_FRAME_RELATED_P (insn) = 1; - add_reg_note (insn, REG_FRAME_RELATED_EXPR, - gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - -UNITS_PER_WORD))); - } + tp = get_thread_pointer (Pmode, true); + set_unique_reg_note (get_last_insn (), REG_EQUAL, + gen_rtx_MINUS (Pmode, tmp, tp)); } - - if (r10_live) + else { - r10 = gen_rtx_REG (Pmode, R10_REG); - insn = emit_insn (gen_push (r10)); - allocate -= UNITS_PER_WORD; - if (sp_is_cfa_reg || TARGET_SEH) + rtx caddr = ix86_tls_get_addr (); + + if (TARGET_64BIT) { - if (sp_is_cfa_reg) - m->fs.cfa_offset += UNITS_PER_WORD; - RTX_FRAME_RELATED_P (insn) = 1; - add_reg_note (insn, REG_FRAME_RELATED_EXPR, - gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - -UNITS_PER_WORD))); + rtx rax = gen_rtx_REG (Pmode, AX_REG); + rtx_insn *insns; + rtx eqv; + + start_sequence (); + emit_call_insn + (ix86_gen_tls_local_dynamic_base_64 (rax, caddr)); + insns = get_insns (); + end_sequence (); + + /* Attach a unique REG_EQUAL, to allow the RTL optimizers to + share the LD_BASE result with other LD model accesses. */ + eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), + UNSPEC_TLS_LD_BASE); + + RTL_CONST_CALL_P (insns) = 1; + emit_libcall_block (insns, base, rax, eqv); } + else + emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr)); } - emit_move_insn (eax, GEN_INT (allocate)); - emit_insn (ix86_gen_allocate_stack_worker (eax, eax)); - - /* Use the fact that AX still contains ALLOCATE. */ - adjust_stack_insn = (Pmode == DImode - ? gen_pro_epilogue_adjust_stack_di_sub - : gen_pro_epilogue_adjust_stack_si_sub); + off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF); + off = gen_rtx_CONST (Pmode, off); - insn = emit_insn (adjust_stack_insn (stack_pointer_rtx, - stack_pointer_rtx, eax)); + dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off)); - if (sp_is_cfa_reg || TARGET_SEH) + if (TARGET_GNU2_TLS) { - if (sp_is_cfa_reg) - m->fs.cfa_offset += allocate; - RTX_FRAME_RELATED_P (insn) = 1; - add_reg_note (insn, REG_FRAME_RELATED_EXPR, - gen_rtx_SET (stack_pointer_rtx, - plus_constant (Pmode, stack_pointer_rtx, - -allocate))); + dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp)); + + if (GET_MODE (x) != Pmode) + x = gen_rtx_ZERO_EXTEND (Pmode, x); + + set_unique_reg_note (get_last_insn (), REG_EQUAL, x); } - m->fs.sp_offset += allocate; + break; - /* Use stack_pointer_rtx for relative addressing so that code works for - realigned stack. But this means that we need a blockage to prevent - stores based on the frame pointer from being scheduled before. */ - if (r10_live && eax_live) - { - t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax); - emit_move_insn (gen_rtx_REG (word_mode, R10_REG), - gen_frame_mem (word_mode, t)); - t = plus_constant (Pmode, t, UNITS_PER_WORD); - emit_move_insn (gen_rtx_REG (word_mode, AX_REG), - gen_frame_mem (word_mode, t)); - emit_insn (gen_memory_blockage ()); + case TLS_MODEL_INITIAL_EXEC: + if (TARGET_64BIT) + { + if (TARGET_SUN_TLS && !TARGET_X32) + { + /* The Sun linker took the AMD64 TLS spec literally + and can only handle %rax as destination of the + initial executable code sequence. */ + + dest = gen_reg_rtx (DImode); + emit_insn (gen_tls_initial_exec_64_sun (dest, x)); + return dest; + } + + /* Generate DImode references to avoid %fs:(%reg32) + problems and linker IE->LE relaxation bug. 
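To relate the initial-exec handling here to the code a user actually writes, a small illustrative example follows (variable name invented; exact sequences vary with linker relaxation and options):

    /* tls-sketch.c -- illustrative only; x86-64, compile with -O2.  */
    extern __thread int per_thread_count;   /* defined in a shared library */

    int
    read_count (void)
    {
      /* In a non-PIC executable this uses the initial-exec model (the
         UNSPEC_GOTNTPOFF case): a 64-bit offset is loaded from the GOT and
         added to the thread pointer in %fs --
           movq  per_thread_count@gottpoff(%rip), %rax
           movl  %fs:(%rax), %eax
         which is why DImode references are generated here.  Compiled with
         -fPIC the same access instead goes through global dynamic, roughly
           leaq  per_thread_count@tlsgd(%rip), %rdi
           call  __tls_get_addr@PLT  */
      return per_thread_count;
    }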
*/ + tp_mode = DImode; + pic = NULL; + type = UNSPEC_GOTNTPOFF; } - else if (eax_live || r10_live) + else if (flag_pic) { - t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax); - emit_move_insn (gen_rtx_REG (word_mode, - (eax_live ? AX_REG : R10_REG)), - gen_frame_mem (word_mode, t)); - emit_insn (gen_memory_blockage ()); + pic = pic_offset_table_rtx; + type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF; + } + else if (!TARGET_ANY_GNU_TLS) + { + pic = gen_reg_rtx (Pmode); + emit_insn (gen_set_got (pic)); + type = UNSPEC_GOTTPOFF; + } + else + { + pic = NULL; + type = UNSPEC_INDNTPOFF; } - } - gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset); - /* If we havn't already set up the frame pointer, do so now. */ - if (frame_pointer_needed && !m->fs.fp_valid) - { - insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx, - GEN_INT (frame.stack_pointer_offset - - frame.hard_frame_pointer_offset)); - insn = emit_insn (insn); - RTX_FRAME_RELATED_P (insn) = 1; - add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL); + off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type); + off = gen_rtx_CONST (tp_mode, off); + if (pic) + off = gen_rtx_PLUS (tp_mode, pic, off); + off = gen_const_mem (tp_mode, off); + set_mem_alias_set (off, ix86_GOT_alias_set ()); - if (m->fs.cfa_reg == stack_pointer_rtx) - m->fs.cfa_reg = hard_frame_pointer_rtx; - m->fs.fp_offset = frame.hard_frame_pointer_offset; - m->fs.fp_valid = true; - } + if (TARGET_64BIT || TARGET_ANY_GNU_TLS) + { + base = get_thread_pointer (tp_mode, + for_mov || !TARGET_TLS_DIRECT_SEG_REFS); + off = force_reg (tp_mode, off); + dest = gen_rtx_PLUS (tp_mode, base, off); + if (tp_mode != Pmode) + dest = convert_to_mode (Pmode, dest, 1); + } + else + { + base = get_thread_pointer (Pmode, true); + dest = gen_reg_rtx (Pmode); + emit_insn (ix86_gen_sub3 (dest, base, off)); + } + break; - if (!int_registers_saved) - ix86_emit_save_regs_using_mov (frame.reg_save_offset); - if (!sse_registers_saved) - ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset); - else if (save_stub_call_needed) - ix86_emit_outlined_ms2sysv_save (frame); + case TLS_MODEL_LOCAL_EXEC: + off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), + (TARGET_64BIT || TARGET_ANY_GNU_TLS) + ? UNSPEC_NTPOFF : UNSPEC_TPOFF); + off = gen_rtx_CONST (Pmode, off); - /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT - in PROLOGUE. */ - if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry) - { - rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM); - insn = emit_insn (gen_set_got (pic)); - RTX_FRAME_RELATED_P (insn) = 1; - add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX); - emit_insn (gen_prologue_use (pic)); - /* Deleting already emmitted SET_GOT if exist and allocated to - REAL_PIC_OFFSET_TABLE_REGNUM. 
*/ - ix86_elim_entry_set_got (pic); - } + if (TARGET_64BIT || TARGET_ANY_GNU_TLS) + { + base = get_thread_pointer (Pmode, + for_mov || !TARGET_TLS_DIRECT_SEG_REFS); + return gen_rtx_PLUS (Pmode, base, off); + } + else + { + base = get_thread_pointer (Pmode, true); + dest = gen_reg_rtx (Pmode); + emit_insn (ix86_gen_sub3 (dest, base, off)); + } + break; - if (crtl->drap_reg && !crtl->stack_realign_needed) - { - /* vDRAP is setup but after reload it turns out stack realign - isn't necessary, here we will emit prologue to setup DRAP - without stack realign adjustment */ - t = choose_baseaddr (0, NULL); - emit_insn (gen_rtx_SET (crtl->drap_reg, t)); + default: + gcc_unreachable (); } - /* Prevent instructions from being scheduled into register save push - sequence when access to the redzone area is done through frame pointer. - The offset between the frame pointer and the stack pointer is calculated - relative to the value of the stack pointer at the end of the function - prologue, and moving instructions that access redzone area via frame - pointer inside push sequence violates this assumption. */ - if (frame_pointer_needed && frame.red_zone_size) - emit_insn (gen_memory_blockage ()); - - /* SEH requires that the prologue end within 256 bytes of the start of - the function. Prevent instruction schedules that would extend that. - Further, prevent alloca modifications to the stack pointer from being - combined with prologue modifications. */ - if (TARGET_SEH) - emit_insn (gen_prologue_use (stack_pointer_rtx)); + return dest; } -/* Emit code to restore REG using a POP insn. */ - -static void -ix86_emit_restore_reg_using_pop (rtx reg) +/* Return true if OP refers to a TLS address. */ +bool +ix86_tls_address_pattern_p (rtx op) { - struct machine_function *m = cfun->machine; - rtx_insn *insn = emit_insn (gen_pop (reg)); - - ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset); - m->fs.sp_offset -= UNITS_PER_WORD; - - if (m->fs.cfa_reg == crtl->drap_reg - && REGNO (reg) == REGNO (crtl->drap_reg)) + subrtx_var_iterator::array_type array; + FOR_EACH_SUBRTX_VAR (iter, array, op, ALL) { - /* Previously we'd represented the CFA as an expression - like *(%ebp - 8). We've just popped that value from - the stack, which means we need to reset the CFA to - the drap register. This will remain until we restore - the stack pointer. */ - add_reg_note (insn, REG_CFA_DEF_CFA, reg); - RTX_FRAME_RELATED_P (insn) = 1; + rtx op = *iter; + if (MEM_P (op)) + { + rtx *x = &XEXP (op, 0); + while (GET_CODE (*x) == PLUS) + { + int i; + for (i = 0; i < 2; i++) + { + rtx u = XEXP (*x, i); + if (GET_CODE (u) == ZERO_EXTEND) + u = XEXP (u, 0); + if (GET_CODE (u) == UNSPEC + && XINT (u, 1) == UNSPEC_TP) + return true; + } + x = &XEXP (*x, 0); + } - /* This means that the DRAP register is valid for addressing too. */ - m->fs.drap_valid = true; - return; + iter.skip_subrtxes (); + } } - if (m->fs.cfa_reg == stack_pointer_rtx) - { - rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); - x = gen_rtx_SET (stack_pointer_rtx, x); - add_reg_note (insn, REG_CFA_ADJUST_CFA, x); - RTX_FRAME_RELATED_P (insn) = 1; - - m->fs.cfa_offset -= UNITS_PER_WORD; - } + return false; +} - /* When the frame pointer is the CFA, and we pop it, we are - swapping back to the stack pointer as the CFA. This happens - for stack frames that don't allocate other data, so we assume - the stack pointer is now pointing at the return address, i.e. - the function entry state, which makes the offset be 1 word. 
*/ - if (reg == hard_frame_pointer_rtx) +/* Rewrite *LOC so that it refers to a default TLS address space. */ +void +ix86_rewrite_tls_address_1 (rtx *loc) +{ + subrtx_ptr_iterator::array_type array; + FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL) { - m->fs.fp_valid = false; - if (m->fs.cfa_reg == hard_frame_pointer_rtx) + rtx *loc = *iter; + if (MEM_P (*loc)) { - m->fs.cfa_reg = stack_pointer_rtx; - m->fs.cfa_offset -= UNITS_PER_WORD; + rtx addr = XEXP (*loc, 0); + rtx *x = &addr; + while (GET_CODE (*x) == PLUS) + { + int i; + for (i = 0; i < 2; i++) + { + rtx u = XEXP (*x, i); + if (GET_CODE (u) == ZERO_EXTEND) + u = XEXP (u, 0); + if (GET_CODE (u) == UNSPEC + && XINT (u, 1) == UNSPEC_TP) + { + addr_space_t as = DEFAULT_TLS_SEG_REG; - add_reg_note (insn, REG_CFA_DEF_CFA, - gen_rtx_PLUS (Pmode, stack_pointer_rtx, - GEN_INT (m->fs.cfa_offset))); - RTX_FRAME_RELATED_P (insn) = 1; + *x = XEXP (*x, 1 - i); + + *loc = replace_equiv_address_nv (*loc, addr, true); + set_mem_addr_space (*loc, as); + return; + } + } + x = &XEXP (*x, 0); + } + + iter.skip_subrtxes (); } } } -/* Emit code to restore saved registers using POP insns. */ - -static void -ix86_emit_restore_regs_using_pop (void) +/* Rewrite instruction pattern involvning TLS address + so that it refers to a default TLS address space. */ +rtx +ix86_rewrite_tls_address (rtx pattern) { - unsigned int regno; - - for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true)) - ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno)); + pattern = copy_insn (pattern); + ix86_rewrite_tls_address_1 (&pattern); + return pattern; } -/* Emit code and notes for the LEAVE instruction. If insn is non-null, - omits the emit and only attaches the notes. */ +/* Create or return the unique __imp_DECL dllimport symbol corresponding + to symbol DECL if BEIMPORT is true. Otherwise create or return the + unique refptr-DECL symbol corresponding to symbol DECL. */ -static void -ix86_emit_leave (rtx_insn *insn) +struct dllimport_hasher : ggc_cache_ptr_hash { - struct machine_function *m = cfun->machine; - if (!insn) - insn = emit_insn (ix86_gen_leave ()); + static inline hashval_t hash (tree_map *m) { return m->hash; } + static inline bool + equal (tree_map *a, tree_map *b) + { + return a->base.from == b->base.from; + } - ix86_add_queued_cfa_restore_notes (insn); + static int + keep_cache_entry (tree_map *&m) + { + return ggc_marked_p (m->base.from); + } +}; - gcc_assert (m->fs.fp_valid); - m->fs.sp_valid = true; - m->fs.sp_realigned = false; - m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD; - m->fs.fp_valid = false; +static GTY((cache)) hash_table *dllimport_map; - if (m->fs.cfa_reg == hard_frame_pointer_rtx) - { - m->fs.cfa_reg = stack_pointer_rtx; - m->fs.cfa_offset = m->fs.sp_offset; +static tree +get_dllimport_decl (tree decl, bool beimport) +{ + struct tree_map *h, in; + const char *name; + const char *prefix; + size_t namelen, prefixlen; + char *imp_name; + tree to; + rtx rtl; - add_reg_note (insn, REG_CFA_DEF_CFA, - plus_constant (Pmode, stack_pointer_rtx, - m->fs.sp_offset)); - RTX_FRAME_RELATED_P (insn) = 1; - } - ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx, - m->fs.fp_offset); -} + if (!dllimport_map) + dllimport_map = hash_table::create_ggc (512); -/* Emit code to restore saved registers using MOV insns. - First register is restored from CFA - CFA_OFFSET. 
*/ -static void -ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset, - bool maybe_eh_return) -{ - struct machine_function *m = cfun->machine; - unsigned int regno; + in.hash = htab_hash_pointer (decl); + in.base.from = decl; + tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT); + h = *loc; + if (h) + return h->to; - for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true)) - { - rtx reg = gen_rtx_REG (word_mode, regno); - rtx mem; - rtx_insn *insn; + *loc = h = ggc_alloc (); + h->hash = in.hash; + h->base.from = decl; + h->to = to = build_decl (DECL_SOURCE_LOCATION (decl), + VAR_DECL, NULL, ptr_type_node); + DECL_ARTIFICIAL (to) = 1; + DECL_IGNORED_P (to) = 1; + DECL_EXTERNAL (to) = 1; + TREE_READONLY (to) = 1; - mem = choose_baseaddr (cfa_offset, NULL); - mem = gen_frame_mem (word_mode, mem); - insn = emit_move_insn (reg, mem); + name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)); + name = targetm.strip_name_encoding (name); + if (beimport) + prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0 + ? "*__imp_" : "*__imp__"; + else + prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr."; + namelen = strlen (name); + prefixlen = strlen (prefix); + imp_name = (char *) alloca (namelen + prefixlen + 1); + memcpy (imp_name, prefix, prefixlen); + memcpy (imp_name + prefixlen, name, namelen + 1); - if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg)) - { - /* Previously we'd represented the CFA as an expression - like *(%ebp - 8). We've just popped that value from - the stack, which means we need to reset the CFA to - the drap register. This will remain until we restore - the stack pointer. */ - add_reg_note (insn, REG_CFA_DEF_CFA, reg); - RTX_FRAME_RELATED_P (insn) = 1; + name = ggc_alloc_string (imp_name, namelen + prefixlen); + rtl = gen_rtx_SYMBOL_REF (Pmode, name); + SET_SYMBOL_REF_DECL (rtl, to); + SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR; + if (!beimport) + { + SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL; +#ifdef SUB_TARGET_RECORD_STUB + SUB_TARGET_RECORD_STUB (name); +#endif + } - /* This means that the DRAP register is valid for addressing. */ - m->fs.drap_valid = true; - } - else - ix86_add_cfa_restore_note (NULL, reg, cfa_offset); + rtl = gen_const_mem (Pmode, rtl); + set_mem_alias_set (rtl, ix86_GOT_alias_set ()); - cfa_offset -= UNITS_PER_WORD; - } + SET_DECL_RTL (to, rtl); + SET_DECL_ASSEMBLER_NAME (to, get_identifier (name)); + + return to; } -/* Emit code to restore saved registers using MOV insns. - First register is restored from CFA - CFA_OFFSET. */ -static void -ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset, - bool maybe_eh_return) +/* Expand SYMBOL into its corresponding far-address symbol. + WANT_REG is true if we require the result be a register. 
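The two stub flavours built above can be illustrated with a short example (symbol names invented; a mingw-w64 style x86-64 PE target is assumed): a dllimport object is reached through the linker's __imp_ pointer, while a plain external object under -mcmodel=medium or -mcmodel=large goes through a locally emitted refptr slot.

    /* dllimport-sketch.c -- illustrative only; x86-64 PE/COFF (mingw-w64).  */
    __declspec(dllimport) extern int imported_counter;   /* lives in another DLL */

    int
    read_imported (void)
    {
      /* The access is rewritten to go through the import-table pointer that
         the code above names "*__imp_<symbol>", roughly:
           movq  __imp_imported_counter(%rip), %rax   # load &imported_counter
           movl  (%rax), %eax                         # then dereference it
         A plain external object under a medium/large code model is handled
         the same way through a ".refptr.<symbol>" slot emitted locally.  */
      return imported_counter;
    }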
*/ + +static rtx +legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg) { - unsigned int regno; + tree imp_decl; + rtx x; - for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true)) - { - rtx reg = gen_rtx_REG (V4SFmode, regno); - rtx mem; - unsigned int align = GET_MODE_ALIGNMENT (V4SFmode); + gcc_assert (SYMBOL_REF_DECL (symbol)); + imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false); - mem = choose_baseaddr (cfa_offset, &align); - mem = gen_rtx_MEM (V4SFmode, mem); + x = DECL_RTL (imp_decl); + if (want_reg) + x = force_reg (Pmode, x); + return x; +} - /* The location aligment depends upon the base register. */ - align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align); - gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1))); - set_mem_align (mem, align); - emit_insn (gen_rtx_SET (reg, mem)); - - ix86_add_cfa_restore_note (NULL, reg, cfa_offset); - - cfa_offset -= GET_MODE_SIZE (V4SFmode); - } -} +/* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is + true if we require the result be a register. */ -static void -ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame, - bool use_call, int style) +static rtx +legitimize_dllimport_symbol (rtx symbol, bool want_reg) { - struct machine_function *m = cfun->machine; - const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS - + m->call_ms2sysv_extra_regs; - rtvec v; - unsigned int elems_needed, align, i, vi = 0; - rtx_insn *insn; - rtx sym, tmp; - rtx rsi = gen_rtx_REG (word_mode, SI_REG); - rtx r10 = NULL_RTX; - const struct xlogue_layout &xlogue = xlogue_layout::get_instance (); - HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset (); - HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset; - rtx rsi_frame_load = NULL_RTX; - HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1; - enum xlogue_stub stub; - - gcc_assert (!m->fs.fp_valid || frame_pointer_needed); - - /* If using a realigned stack, we should never start with padding. */ - gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ()); + tree imp_decl; + rtx x; - /* Setup RSI as the stub's base pointer. */ - align = GET_MODE_ALIGNMENT (V4SFmode); - tmp = choose_baseaddr (rsi_offset, &align, SI_REG); - gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode)); + gcc_assert (SYMBOL_REF_DECL (symbol)); + imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true); - emit_insn (gen_rtx_SET (rsi, tmp)); + x = DECL_RTL (imp_decl); + if (want_reg) + x = force_reg (Pmode, x); + return x; +} - /* Get a symbol for the stub. */ - if (frame_pointer_needed) - stub = use_call ? XLOGUE_STUB_RESTORE_HFP - : XLOGUE_STUB_RESTORE_HFP_TAIL; - else - stub = use_call ? XLOGUE_STUB_RESTORE - : XLOGUE_STUB_RESTORE_TAIL; - sym = xlogue.get_stub_rtx (stub); +/* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG + is true if we require the result be a register. */ - elems_needed = ncregs; - if (use_call) - elems_needed += 1; - else - elems_needed += frame_pointer_needed ? 5 : 3; - v = rtvec_alloc (elems_needed); +rtx +legitimize_pe_coff_symbol (rtx addr, bool inreg) +{ + if (!TARGET_PECOFF) + return NULL_RTX; - /* We call the epilogue stub when we need to pop incoming args or we are - doing a sibling call as the tail. Otherwise, we will emit a jmp to the - epilogue stub and it is the tail-call. 
*/ - if (use_call) - RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym); - else + if (TARGET_DLLIMPORT_DECL_ATTRIBUTES) { - RTVEC_ELT (v, vi++) = ret_rtx; - RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym); - if (frame_pointer_needed) + if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr)) + return legitimize_dllimport_symbol (addr, inreg); + if (GET_CODE (addr) == CONST + && GET_CODE (XEXP (addr, 0)) == PLUS + && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF + && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0))) { - rtx rbp = gen_rtx_REG (DImode, BP_REG); - gcc_assert (m->fs.fp_valid); - gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx); - - tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8)); - RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp); - RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp)); - tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode)); - RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp); + rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg); + return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1)); } - else - { - /* If no hard frame pointer, we set R10 to the SP restore value. */ - gcc_assert (!m->fs.fp_valid); - gcc_assert (m->fs.cfa_reg == stack_pointer_rtx); - gcc_assert (m->fs.sp_valid); + } - r10 = gen_rtx_REG (DImode, R10_REG); - tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset)); - emit_insn (gen_rtx_SET (r10, tmp)); + if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC) + return NULL_RTX; + if (GET_CODE (addr) == SYMBOL_REF + && !is_imported_p (addr) + && SYMBOL_REF_EXTERNAL_P (addr) + && SYMBOL_REF_DECL (addr)) + return legitimize_pe_coff_extern_decl (addr, inreg); - RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10); - } + if (GET_CODE (addr) == CONST + && GET_CODE (XEXP (addr, 0)) == PLUS + && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF + && !is_imported_p (XEXP (XEXP (addr, 0), 0)) + && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0)) + && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0))) + { + rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg); + return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1)); } + return NULL_RTX; +} - /* Generate frame load insns and restore notes. */ - for (i = 0; i < ncregs; ++i) - { - const xlogue_layout::reginfo &r = xlogue.get_reginfo (i); - machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode; - rtx reg, frame_load; +/* Try machine-dependent ways of modifying an illegitimate address + to be legitimate. If we find one, return the new, valid address. + This macro is used in only one place: `memory_address' in explow.c. - reg = gen_rtx_REG (mode, r.regno); - frame_load = gen_frame_load (reg, rsi, r.offset); + OLDX is the address as it was before break_out_memory_refs was called. + In some cases it is useful to look at this to decide what needs to be done. - /* Save RSI frame load insn & note to add last. */ - if (r.regno == SI_REG) - { - gcc_assert (!rsi_frame_load); - rsi_frame_load = frame_load; - rsi_restore_offset = r.offset; - } - else - { - RTVEC_ELT (v, vi++) = frame_load; - ix86_add_cfa_restore_note (NULL, reg, r.offset); - } - } + It is always safe for this macro to do nothing. It exists to recognize + opportunities to optimize the output. - /* Add RSI frame load & restore note at the end. 
*/ - gcc_assert (rsi_frame_load); - gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1); - RTVEC_ELT (v, vi++) = rsi_frame_load; - ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG), - rsi_restore_offset); + For the 80386, we handle X+REG by loading X into a register R and + using R+REG. R will go in a general reg and indexing will be used. + However, if REG is a broken-out memory address or multiplication, + nothing needs to be done because REG can certainly go in a general reg. - /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */ - if (!use_call && !frame_pointer_needed) - { - gcc_assert (m->fs.sp_valid); - gcc_assert (!m->fs.sp_realigned); + When -fpic is used, special handling is needed for symbolic references. + See comments by legitimize_pic_address in i386.c for details. */ - /* At this point, R10 should point to frame.stack_realign_offset. */ - if (m->fs.cfa_reg == stack_pointer_rtx) - m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset; - m->fs.sp_offset = frame.stack_realign_offset; - } +static rtx +ix86_legitimize_address (rtx x, rtx, machine_mode mode) +{ + bool changed = false; + unsigned log; - gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v)); - tmp = gen_rtx_PARALLEL (VOIDmode, v); - if (use_call) - insn = emit_insn (tmp); - else + log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0; + if (log) + return legitimize_tls_address (x, (enum tls_model) log, false); + if (GET_CODE (x) == CONST + && GET_CODE (XEXP (x, 0)) == PLUS + && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF + && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0)))) { - insn = emit_jump_insn (tmp); - JUMP_LABEL (insn) = ret_rtx; - - if (frame_pointer_needed) - ix86_emit_leave (insn); - else - { - /* Need CFA adjust note. */ - tmp = gen_rtx_SET (stack_pointer_rtx, r10); - add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp); - } + rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0), + (enum tls_model) log, false); + return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1)); } - RTX_FRAME_RELATED_P (insn) = true; - ix86_add_queued_cfa_restore_notes (insn); - - /* If we're not doing a tail-call, we need to adjust the stack. */ - if (use_call && m->fs.sp_valid) + if (TARGET_DLLIMPORT_DECL_ATTRIBUTES) { - HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset; - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (dealloc), style, - m->fs.cfa_reg == stack_pointer_rtx); + rtx tmp = legitimize_pe_coff_symbol (x, true); + if (tmp) + return tmp; } -} -/* Restore function stack, frame, and registers. */ + if (flag_pic && SYMBOLIC_CONST (x)) + return legitimize_pic_address (x, 0); -void -ix86_expand_epilogue (int style) -{ - struct machine_function *m = cfun->machine; - struct machine_frame_state frame_state_save = m->fs; - bool restore_regs_via_mov; - bool using_drap; - bool restore_stub_is_tail = false; +#if TARGET_MACHO + if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x)) + return machopic_indirect_data_reference (x, 0); +#endif - if (ix86_function_naked (current_function_decl)) + /* Canonicalize shifts by 0, 1, 2, 3 into multiply */ + if (GET_CODE (x) == ASHIFT + && CONST_INT_P (XEXP (x, 1)) + && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4) { - /* The program should not reach this point. 
*/ - emit_insn (gen_ud2 ()); - return; + changed = true; + log = INTVAL (XEXP (x, 1)); + x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)), + GEN_INT (1 << log)); } - ix86_finalize_stack_frame_flags (); - const struct ix86_frame &frame = cfun->machine->frame; - - m->fs.sp_realigned = stack_realign_fp; - m->fs.sp_valid = stack_realign_fp - || !frame_pointer_needed - || crtl->sp_is_unchanging; - gcc_assert (!m->fs.sp_valid - || m->fs.sp_offset == frame.stack_pointer_offset); + if (GET_CODE (x) == PLUS) + { + /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */ - /* The FP must be valid if the frame pointer is present. */ - gcc_assert (frame_pointer_needed == m->fs.fp_valid); - gcc_assert (!m->fs.fp_valid - || m->fs.fp_offset == frame.hard_frame_pointer_offset); + if (GET_CODE (XEXP (x, 0)) == ASHIFT + && CONST_INT_P (XEXP (XEXP (x, 0), 1)) + && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4) + { + changed = true; + log = INTVAL (XEXP (XEXP (x, 0), 1)); + XEXP (x, 0) = gen_rtx_MULT (Pmode, + force_reg (Pmode, XEXP (XEXP (x, 0), 0)), + GEN_INT (1 << log)); + } - /* We must have *some* valid pointer to the stack frame. */ - gcc_assert (m->fs.sp_valid || m->fs.fp_valid); + if (GET_CODE (XEXP (x, 1)) == ASHIFT + && CONST_INT_P (XEXP (XEXP (x, 1), 1)) + && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4) + { + changed = true; + log = INTVAL (XEXP (XEXP (x, 1), 1)); + XEXP (x, 1) = gen_rtx_MULT (Pmode, + force_reg (Pmode, XEXP (XEXP (x, 1), 0)), + GEN_INT (1 << log)); + } - /* The DRAP is never valid at this point. */ - gcc_assert (!m->fs.drap_valid); + /* Put multiply first if it isn't already. */ + if (GET_CODE (XEXP (x, 1)) == MULT) + { + std::swap (XEXP (x, 0), XEXP (x, 1)); + changed = true; + } - /* See the comment about red zone and frame - pointer usage in ix86_expand_prologue. */ - if (frame_pointer_needed && frame.red_zone_size) - emit_insn (gen_memory_blockage ()); + /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const))) + into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be + created by virtual register instantiation, register elimination, and + similar optimizations. */ + if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS) + { + changed = true; + x = gen_rtx_PLUS (Pmode, + gen_rtx_PLUS (Pmode, XEXP (x, 0), + XEXP (XEXP (x, 1), 0)), + XEXP (XEXP (x, 1), 1)); + } - using_drap = crtl->drap_reg && crtl->stack_realign_needed; - gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg); + /* Canonicalize + (plus (plus (mult (reg) (const)) (plus (reg) (const))) const) + into (plus (plus (mult (reg) (const)) (reg)) (const)). */ + else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS + && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT + && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS + && CONSTANT_P (XEXP (x, 1))) + { + rtx constant; + rtx other = NULL_RTX; - /* Determine the CFA offset of the end of the red-zone. */ - m->fs.red_zone_offset = 0; - if (ix86_using_red_zone () && crtl->args.pops_args < 65536) - { - /* The red-zone begins below return address and error code in - exception handler. 
*/ - m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET; + if (CONST_INT_P (XEXP (x, 1))) + { + constant = XEXP (x, 1); + other = XEXP (XEXP (XEXP (x, 0), 1), 1); + } + else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1))) + { + constant = XEXP (XEXP (XEXP (x, 0), 1), 1); + other = XEXP (x, 1); + } + else + constant = 0; - /* When the register save area is in the aligned portion of - the stack, determine the maximum runtime displacement that - matches up with the aligned frame. */ - if (stack_realign_drap) - m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT - + UNITS_PER_WORD); - } + if (constant) + { + changed = true; + x = gen_rtx_PLUS (Pmode, + gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0), + XEXP (XEXP (XEXP (x, 0), 1), 0)), + plus_constant (Pmode, other, + INTVAL (constant))); + } + } - HOST_WIDE_INT reg_save_offset = frame.reg_save_offset; + if (changed && ix86_legitimate_address_p (mode, x, false)) + return x; - /* Special care must be taken for the normal return case of a function - using eh_return: the eax and edx registers are marked as saved, but - not restored along this path. Adjust the save location to match. */ - if (crtl->calls_eh_return && style != 2) - reg_save_offset -= 2 * UNITS_PER_WORD; + if (GET_CODE (XEXP (x, 0)) == MULT) + { + changed = true; + XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0)); + } - /* EH_RETURN requires the use of moves to function properly. */ - if (crtl->calls_eh_return) - restore_regs_via_mov = true; - /* SEH requires the use of pops to identify the epilogue. */ - else if (TARGET_SEH) - restore_regs_via_mov = false; - /* If we're only restoring one register and sp cannot be used then - using a move instruction to restore the register since it's - less work than reloading sp and popping the register. */ - else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1) - restore_regs_via_mov = true; - else if (TARGET_EPILOGUE_USING_MOVE - && cfun->machine->use_fast_prologue_epilogue - && (frame.nregs > 1 - || m->fs.sp_offset != reg_save_offset)) - restore_regs_via_mov = true; - else if (frame_pointer_needed - && !frame.nregs - && m->fs.sp_offset != reg_save_offset) - restore_regs_via_mov = true; - else if (frame_pointer_needed - && TARGET_USE_LEAVE - && cfun->machine->use_fast_prologue_epilogue - && frame.nregs == 1) - restore_regs_via_mov = true; - else - restore_regs_via_mov = false; + if (GET_CODE (XEXP (x, 1)) == MULT) + { + changed = true; + XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1)); + } - if (restore_regs_via_mov || frame.nsseregs) - { - /* Ensure that the entire register save area is addressable via - the stack pointer, if we will restore SSE regs via sp. */ - if (TARGET_64BIT - && m->fs.sp_offset > 0x7fffffff - && sp_valid_at (frame.stack_realign_offset + 1) - && (frame.nsseregs + frame.nregs) != 0) + if (changed + && REG_P (XEXP (x, 1)) + && REG_P (XEXP (x, 0))) + return x; + + if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1))) { - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (m->fs.sp_offset - - frame.sse_reg_save_offset), - style, - m->fs.cfa_reg == stack_pointer_rtx); + changed = true; + x = legitimize_pic_address (x, 0); } - } - /* If there are any SSE registers to restore, then we have to do it - via moves, since there's obviously no pop for SSE regs. 
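The canonicalizations added above exist because x86 addresses have the form base + index*scale + displacement; rewriting small left shifts as multiplies and re-associating nested pluses lets composite address arithmetic match that form directly. A sketch of source that can exercise this path (names invented):

    /* addr-canon-sketch.c -- illustrative only; x86-64, compile with -O2.  */
    int
    pick (int *table, long i)
    {
      /* The address "table + i*4" may reach the legitimizer as
         (plus (ashift i 2) table); rewriting the shift as (mult i 4) lets it
         match the scaled-index addressing mode and emit roughly
           movl  (%rdi,%rsi,4), %eax
         instead of a separate shift and add.  */
      return table[i];
    }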
*/ - if (frame.nsseregs) - ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset, - style == 2); + if (changed && ix86_legitimate_address_p (mode, x, false)) + return x; - if (m->call_ms2sysv) - { - int pop_incoming_args = crtl->args.pops_args && crtl->args.size; + if (REG_P (XEXP (x, 0))) + { + rtx temp = gen_reg_rtx (Pmode); + rtx val = force_operand (XEXP (x, 1), temp); + if (val != temp) + { + val = convert_to_mode (Pmode, val, 1); + emit_move_insn (temp, val); + } - /* We cannot use a tail-call for the stub if: - 1. We have to pop incoming args, - 2. We have additional int regs to restore, or - 3. A sibling call will be the tail-call, or - 4. We are emitting an eh_return_internal epilogue. + XEXP (x, 1) = temp; + return x; + } - TODO: Item 4 has not yet tested! + else if (REG_P (XEXP (x, 1))) + { + rtx temp = gen_reg_rtx (Pmode); + rtx val = force_operand (XEXP (x, 0), temp); + if (val != temp) + { + val = convert_to_mode (Pmode, val, 1); + emit_move_insn (temp, val); + } - If any of the above are true, we will call the stub rather than - jump to it. */ - restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1); - ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style); + XEXP (x, 0) = temp; + return x; + } } - /* If using out-of-line stub that is a tail-call, then...*/ - if (m->call_ms2sysv && restore_stub_is_tail) - { - /* TODO: parinoid tests. (remove eventually) */ - gcc_assert (m->fs.sp_valid); - gcc_assert (!m->fs.sp_realigned); - gcc_assert (!m->fs.fp_valid); - gcc_assert (!m->fs.realigned); - gcc_assert (m->fs.sp_offset == UNITS_PER_WORD); - gcc_assert (!crtl->drap_reg); - gcc_assert (!frame.nregs); - } - else if (restore_regs_via_mov) - { - rtx t; + return x; +} + +/* Print an integer constant expression in assembler syntax. Addition + and subtraction are the only arithmetic that may appear in these + expressions. FILE is the stdio stream to write to, X is the rtx, and + CODE is the operand print code from the output string. */ - if (frame.nregs) - ix86_emit_restore_regs_using_mov (reg_save_offset, style == 2); +static void +output_pic_addr_const (FILE *file, rtx x, int code) +{ + char buf[256]; - /* eh_return epilogues need %ecx added to the stack pointer. */ - if (style == 2) - { - rtx sa = EH_RETURN_STACKADJ_RTX; - rtx_insn *insn; + switch (GET_CODE (x)) + { + case PC: + gcc_assert (flag_pic); + putc ('.', file); + break; - /* %ecx can't be used for both DRAP register and eh_return. */ - if (crtl->drap_reg) - gcc_assert (REGNO (crtl->drap_reg) != CX_REG); + case SYMBOL_REF: + if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS) + output_addr_const (file, x); + else + { + const char *name = XSTR (x, 0); - /* regparm nested functions don't work with eh_return. */ - gcc_assert (!ix86_static_chain_on_stack); + /* Mark the decl as referenced so that cgraph will + output the function. */ + if (SYMBOL_REF_DECL (x)) + mark_decl_referenced (SYMBOL_REF_DECL (x)); - if (frame_pointer_needed) - { - t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa); - t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD); - emit_insn (gen_rtx_SET (sa, t)); +#if TARGET_MACHO + if (MACHOPIC_INDIRECT + && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION) + name = machopic_indirection_name (x, /*stub_p=*/true); +#endif + assemble_name (file, name); + } + if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF) + && code == 'P' && ! 
SYMBOL_REF_LOCAL_P (x)) + fputs ("@PLT", file); + break; - t = gen_frame_mem (Pmode, hard_frame_pointer_rtx); - insn = emit_move_insn (hard_frame_pointer_rtx, t); + case LABEL_REF: + x = XEXP (x, 0); + /* FALLTHRU */ + case CODE_LABEL: + ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x)); + assemble_name (asm_out_file, buf); + break; - /* Note that we use SA as a temporary CFA, as the return - address is at the proper place relative to it. We - pretend this happens at the FP restore insn because - prior to this insn the FP would be stored at the wrong - offset relative to SA, and after this insn we have no - other reasonable register to use for the CFA. We don't - bother resetting the CFA to the SP for the duration of - the return insn, unless the control flow instrumentation - is done. In this case the SP is used later and we have - to reset CFA to SP. */ - add_reg_note (insn, REG_CFA_DEF_CFA, - plus_constant (Pmode, sa, UNITS_PER_WORD)); - ix86_add_queued_cfa_restore_notes (insn); - add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx); - RTX_FRAME_RELATED_P (insn) = 1; + case CONST_INT: + fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); + break; - m->fs.cfa_reg = sa; - m->fs.cfa_offset = UNITS_PER_WORD; - m->fs.fp_valid = false; + case CONST: + /* This used to output parentheses around the expression, + but that does not work on the 386 (either ATT or BSD assembler). */ + output_pic_addr_const (file, XEXP (x, 0), code); + break; - pro_epilogue_adjust_stack (stack_pointer_rtx, sa, - const0_rtx, style, - flag_cf_protection); - } - else - { - t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa); - t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD); - insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t)); - ix86_add_queued_cfa_restore_notes (insn); + case CONST_DOUBLE: + /* We can't handle floating point constants; + TARGET_PRINT_OPERAND must handle them. */ + output_operand_lossage ("floating constant misused"); + break; - gcc_assert (m->fs.cfa_reg == stack_pointer_rtx); - if (m->fs.cfa_offset != UNITS_PER_WORD) - { - m->fs.cfa_offset = UNITS_PER_WORD; - add_reg_note (insn, REG_CFA_DEF_CFA, - plus_constant (Pmode, stack_pointer_rtx, - UNITS_PER_WORD)); - RTX_FRAME_RELATED_P (insn) = 1; - } - } - m->fs.sp_offset = UNITS_PER_WORD; - m->fs.sp_valid = true; - m->fs.sp_realigned = false; - } - } - else - { - /* SEH requires that the function end with (1) a stack adjustment - if necessary, (2) a sequence of pops, and (3) a return or - jump instruction. Prevent insns from the function body from - being scheduled into this sequence. */ - if (TARGET_SEH) + case PLUS: + /* Some assemblers need integer constants to appear first. */ + if (CONST_INT_P (XEXP (x, 0))) { - /* Prevent a catch region from being adjacent to the standard - epilogue sequence. Unfortunately neither crtl->uses_eh_lsda - nor several other flags that would be interesting to test are - set up yet. */ - if (flag_non_call_exceptions) - emit_insn (gen_nops (const1_rtx)); - else - emit_insn (gen_blockage ()); + output_pic_addr_const (file, XEXP (x, 0), code); + putc ('+', file); + output_pic_addr_const (file, XEXP (x, 1), code); } - - /* First step is to deallocate the stack frame so that we can - pop the registers. If the stack pointer was realigned, it needs - to be restored now. Also do it on SEH target for very large - frame as the emitted instructions aren't allowed by the ABI - in epilogues. 
*/ - if (!m->fs.sp_valid || m->fs.sp_realigned - || (TARGET_SEH - && (m->fs.sp_offset - reg_save_offset - >= SEH_MAX_FRAME_SIZE))) + else { - pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx, - GEN_INT (m->fs.fp_offset - - reg_save_offset), - style, false); + gcc_assert (CONST_INT_P (XEXP (x, 1))); + output_pic_addr_const (file, XEXP (x, 1), code); + putc ('+', file); + output_pic_addr_const (file, XEXP (x, 0), code); } - else if (m->fs.sp_offset != reg_save_offset) + break; + + case MINUS: + if (!TARGET_MACHO) + putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file); + output_pic_addr_const (file, XEXP (x, 0), code); + putc ('-', file); + output_pic_addr_const (file, XEXP (x, 1), code); + if (!TARGET_MACHO) + putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file); + break; + + case UNSPEC: + gcc_assert (XVECLEN (x, 0) == 1); + output_pic_addr_const (file, XVECEXP (x, 0, 0), code); + switch (XINT (x, 1)) { - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (m->fs.sp_offset - - reg_save_offset), - style, - m->fs.cfa_reg == stack_pointer_rtx); + case UNSPEC_GOT: + fputs ("@GOT", file); + break; + case UNSPEC_GOTOFF: + fputs ("@GOTOFF", file); + break; + case UNSPEC_PLTOFF: + fputs ("@PLTOFF", file); + break; + case UNSPEC_PCREL: + fputs (ASSEMBLER_DIALECT == ASM_ATT ? + "(%rip)" : "[rip]", file); + break; + case UNSPEC_GOTPCREL: + fputs (ASSEMBLER_DIALECT == ASM_ATT ? + "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file); + break; + case UNSPEC_GOTTPOFF: + /* FIXME: This might be @TPOFF in Sun ld too. */ + fputs ("@gottpoff", file); + break; + case UNSPEC_TPOFF: + fputs ("@tpoff", file); + break; + case UNSPEC_NTPOFF: + if (TARGET_64BIT) + fputs ("@tpoff", file); + else + fputs ("@ntpoff", file); + break; + case UNSPEC_DTPOFF: + fputs ("@dtpoff", file); + break; + case UNSPEC_GOTNTPOFF: + if (TARGET_64BIT) + fputs (ASSEMBLER_DIALECT == ASM_ATT ? + "@gottpoff(%rip)": "@gottpoff[rip]", file); + else + fputs ("@gotntpoff", file); + break; + case UNSPEC_INDNTPOFF: + fputs ("@indntpoff", file); + break; +#if TARGET_MACHO + case UNSPEC_MACHOPIC_OFFSET: + putc ('-', file); + machopic_output_function_base_name (file); + break; +#endif + default: + output_operand_lossage ("invalid UNSPEC as operand"); + break; } + break; - ix86_emit_restore_regs_using_pop (); + default: + output_operand_lossage ("invalid expression as operand"); } +} - /* If we used a stack pointer and haven't already got rid of it, - then do so now. */ - if (m->fs.fp_valid) - { - /* If the stack pointer is valid and pointing at the frame - pointer store address, then we only need a pop. */ - if (sp_valid_at (frame.hfp_save_offset) - && m->fs.sp_offset == frame.hfp_save_offset) - ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx); - /* Leave results in shorter dependency chains on CPUs that are - able to grok it fast. */ - else if (TARGET_USE_LEAVE - || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun)) - || !cfun->machine->use_fast_prologue_epilogue) - ix86_emit_leave (NULL); - else - { - pro_epilogue_adjust_stack (stack_pointer_rtx, - hard_frame_pointer_rtx, - const0_rtx, style, !using_drap); - ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx); - } - } +/* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL. + We need to emit DTP-relative relocations. 
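The output_pic_addr_const hunk above is, at heart, a table from the UNSPEC wrapped around a symbol to the relocation suffix the assembler expects (@GOT, @GOTOFF, @GOTPCREL(%rip), the @tpoff/@ntpoff TLS family, and so on). Below is a standalone sketch of that mapping only; the enum and helper are invented for illustration and are not GCC interfaces.

/* Sketch, not part of the patch: UNSPEC kind -> relocation suffix,
   mirroring the switch in output_pic_addr_const.  */
#include <stdio.h>

enum reloc_kind { RK_GOT, RK_GOTOFF, RK_PLTOFF, RK_GOTPCREL, RK_TPOFF, RK_NTPOFF };

static const char *
reloc_suffix (enum reloc_kind kind, int att_syntax, int m64)
{
  switch (kind)
    {
    case RK_GOT:      return "@GOT";
    case RK_GOTOFF:   return "@GOTOFF";
    case RK_PLTOFF:   return "@PLTOFF";
    case RK_GOTPCREL: return att_syntax ? "@GOTPCREL(%rip)" : "@GOTPCREL[rip]";
    case RK_TPOFF:    return "@tpoff";
    case RK_NTPOFF:   return m64 ? "@tpoff" : "@ntpoff";
    }
  return "";
}

int
main (void)
{
  /* e.g. a 32-bit PIC data reference: foo@GOTOFF(%ebx).  */
  printf ("foo%s(%%ebx)\n", reloc_suffix (RK_GOTOFF, 1, 0));
  return 0;
}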
*/ - if (using_drap) +static void ATTRIBUTE_UNUSED +i386_output_dwarf_dtprel (FILE *file, int size, rtx x) +{ + fputs (ASM_LONG, file); + output_addr_const (file, x); + fputs ("@dtpoff", file); + switch (size) { - int param_ptr_offset = UNITS_PER_WORD; - rtx_insn *insn; - - gcc_assert (stack_realign_drap); - - if (ix86_static_chain_on_stack) - param_ptr_offset += UNITS_PER_WORD; - if (!call_used_regs[REGNO (crtl->drap_reg)]) - param_ptr_offset += UNITS_PER_WORD; - - insn = emit_insn (gen_rtx_SET - (stack_pointer_rtx, - gen_rtx_PLUS (Pmode, - crtl->drap_reg, - GEN_INT (-param_ptr_offset)))); - m->fs.cfa_reg = stack_pointer_rtx; - m->fs.cfa_offset = param_ptr_offset; - m->fs.sp_offset = param_ptr_offset; - m->fs.realigned = false; - - add_reg_note (insn, REG_CFA_DEF_CFA, - gen_rtx_PLUS (Pmode, stack_pointer_rtx, - GEN_INT (param_ptr_offset))); - RTX_FRAME_RELATED_P (insn) = 1; + case 4: + break; + case 8: + fputs (", 0", file); + break; + default: + gcc_unreachable (); + } +} - if (!call_used_regs[REGNO (crtl->drap_reg)]) - ix86_emit_restore_reg_using_pop (crtl->drap_reg); - } +/* Return true if X is a representation of the PIC register. This copes + with calls from ix86_find_base_term, where the register might have + been replaced by a cselib value. */ - /* At this point the stack pointer must be valid, and we must have - restored all of the registers. We may not have deallocated the - entire stack frame. We've delayed this until now because it may - be possible to merge the local stack deallocation with the - deallocation forced by ix86_static_chain_on_stack. */ - gcc_assert (m->fs.sp_valid); - gcc_assert (!m->fs.sp_realigned); - gcc_assert (!m->fs.fp_valid); - gcc_assert (!m->fs.realigned); - if (m->fs.sp_offset != UNITS_PER_WORD) +static bool +ix86_pic_register_p (rtx x) +{ + if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x)) + return (pic_offset_table_rtx + && rtx_equal_for_cselib_p (x, pic_offset_table_rtx)); + else if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SET_GOT) + return true; + else if (!REG_P (x)) + return false; + else if (pic_offset_table_rtx) { - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (m->fs.sp_offset - UNITS_PER_WORD), - style, true); + if (REGNO (x) == REGNO (pic_offset_table_rtx)) + return true; + if (HARD_REGISTER_P (x) + && !HARD_REGISTER_P (pic_offset_table_rtx) + && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx)) + return true; + return false; } else - ix86_add_queued_cfa_restore_notes (get_last_insn ()); + return REGNO (x) == PIC_OFFSET_TABLE_REGNUM; +} - /* Sibcall epilogues don't want a return instruction. */ - if (style == 0) +/* Helper function for ix86_delegitimize_address. + Attempt to delegitimize TLS local-exec accesses. 
*/ + +static rtx +ix86_delegitimize_tls_address (rtx orig_x) +{ + rtx x = orig_x, unspec; + struct ix86_address addr; + + if (!TARGET_TLS_DIRECT_SEG_REFS) + return orig_x; + if (MEM_P (x)) + x = XEXP (x, 0); + if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode) + return orig_x; + if (ix86_decompose_address (x, &addr) == 0 + || addr.seg != DEFAULT_TLS_SEG_REG + || addr.disp == NULL_RTX + || GET_CODE (addr.disp) != CONST) + return orig_x; + unspec = XEXP (addr.disp, 0); + if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1))) + unspec = XEXP (unspec, 0); + if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF) + return orig_x; + x = XVECEXP (unspec, 0, 0); + gcc_assert (GET_CODE (x) == SYMBOL_REF); + if (unspec != XEXP (addr.disp, 0)) + x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1)); + if (addr.index) { - m->fs = frame_state_save; - return; + rtx idx = addr.index; + if (addr.scale != 1) + idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale)); + x = gen_rtx_PLUS (Pmode, idx, x); } + if (addr.base) + x = gen_rtx_PLUS (Pmode, addr.base, x); + if (MEM_P (orig_x)) + x = replace_equiv_address_nv (orig_x, x); + return x; +} - if (cfun->machine->func_type != TYPE_NORMAL) - emit_jump_insn (gen_interrupt_return ()); - else if (crtl->args.pops_args && crtl->args.size) - { - rtx popc = GEN_INT (crtl->args.pops_args); +/* In the name of slightly smaller debug output, and to cater to + general assembler lossage, recognize PIC+GOTOFF and turn it back + into a direct symbol reference. - /* i386 can only pop 64K bytes. If asked to pop more, pop return - address, do explicit add, and jump indirectly to the caller. */ + On Darwin, this is necessary to avoid a crash, because Darwin + has a different PIC label for each routine but the DWARF debugging + information is not associated with any particular routine, so it's + necessary to remove references to the PIC label from RTL stored by + the DWARF output code. - if (crtl->args.pops_args >= 65536) - { - rtx ecx = gen_rtx_REG (SImode, CX_REG); - rtx_insn *insn; + This helper is used in the normal ix86_delegitimize_address + entrypoint (e.g. used in the target delegitimization hook) and + in ix86_find_base_term. As compile time memory optimization, we + avoid allocating rtxes that will not change anything on the outcome + of the callers (find_base_value and find_base_term). */ - /* There is no "pascal" calling convention in any 64bit ABI. */ - gcc_assert (!TARGET_64BIT); +static inline rtx +ix86_delegitimize_address_1 (rtx x, bool base_term_p) +{ + rtx orig_x = delegitimize_mem_from_attrs (x); + /* addend is NULL or some rtx if x is something+GOTOFF where + something doesn't include the PIC register. */ + rtx addend = NULL_RTX; + /* reg_addend is NULL or a multiple of some register. */ + rtx reg_addend = NULL_RTX; + /* const_addend is NULL or a const_int. */ + rtx const_addend = NULL_RTX; + /* This is the result, or NULL. 
*/ + rtx result = NULL_RTX; - insn = emit_insn (gen_pop (ecx)); - m->fs.cfa_offset -= UNITS_PER_WORD; - m->fs.sp_offset -= UNITS_PER_WORD; + x = orig_x; - rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); - x = gen_rtx_SET (stack_pointer_rtx, x); - add_reg_note (insn, REG_CFA_ADJUST_CFA, x); - add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx)); - RTX_FRAME_RELATED_P (insn) = 1; + if (MEM_P (x)) + x = XEXP (x, 0); - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, - popc, -1, true); - emit_jump_insn (gen_simple_return_indirect_internal (ecx)); - } - else - emit_jump_insn (gen_simple_return_pop_internal (popc)); - } - else if (!m->call_ms2sysv || !restore_stub_is_tail) + if (TARGET_64BIT) { - /* In case of return from EH a simple return cannot be used - as a return address will be compared with a shadow stack - return address. Use indirect jump instead. */ - if (style == 2 && flag_cf_protection) + if (GET_CODE (x) == CONST + && GET_CODE (XEXP (x, 0)) == PLUS + && GET_MODE (XEXP (x, 0)) == Pmode + && CONST_INT_P (XEXP (XEXP (x, 0), 1)) + && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC + && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL) + { + /* find_base_{value,term} only care about MEMs with arg_pointer_rtx + base. A CONST can't be arg_pointer_rtx based. */ + if (base_term_p && MEM_P (orig_x)) + return orig_x; + rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0); + x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2); + if (MEM_P (orig_x)) + x = replace_equiv_address_nv (orig_x, x); + return x; + } + + if (GET_CODE (x) == CONST + && GET_CODE (XEXP (x, 0)) == UNSPEC + && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL + || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL) + && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)) { - /* Register used in indirect jump must be in word_mode. But - Pmode may not be the same as word_mode for x32. */ - rtx ecx = gen_rtx_REG (word_mode, CX_REG); - rtx_insn *insn; + x = XVECEXP (XEXP (x, 0), 0, 0); + if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x)) + { + x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x)); + if (x == NULL_RTX) + return orig_x; + } + return x; + } - insn = emit_insn (gen_pop (ecx)); - m->fs.cfa_offset -= UNITS_PER_WORD; - m->fs.sp_offset -= UNITS_PER_WORD; + if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC) + return ix86_delegitimize_tls_address (orig_x); - rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); - x = gen_rtx_SET (stack_pointer_rtx, x); - add_reg_note (insn, REG_CFA_ADJUST_CFA, x); - add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx)); - RTX_FRAME_RELATED_P (insn) = 1; + /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic + and -mcmodel=medium -fpic. 
*/ + } - emit_jump_insn (gen_simple_return_indirect_internal (ecx)); - } + if (GET_CODE (x) != PLUS + || GET_CODE (XEXP (x, 1)) != CONST) + return ix86_delegitimize_tls_address (orig_x); + + if (ix86_pic_register_p (XEXP (x, 0))) + /* %ebx + GOT/GOTOFF */ + ; + else if (GET_CODE (XEXP (x, 0)) == PLUS) + { + /* %ebx + %reg * scale + GOT/GOTOFF */ + reg_addend = XEXP (x, 0); + if (ix86_pic_register_p (XEXP (reg_addend, 0))) + reg_addend = XEXP (reg_addend, 1); + else if (ix86_pic_register_p (XEXP (reg_addend, 1))) + reg_addend = XEXP (reg_addend, 0); else - emit_jump_insn (gen_simple_return_internal ()); + { + reg_addend = NULL_RTX; + addend = XEXP (x, 0); + } } + else + addend = XEXP (x, 0); - /* Restore the state back to the state from the prologue, - so that it's correct for the next epilogue. */ - m->fs = frame_state_save; -} + x = XEXP (XEXP (x, 1), 0); + if (GET_CODE (x) == PLUS + && CONST_INT_P (XEXP (x, 1))) + { + const_addend = XEXP (x, 1); + x = XEXP (x, 0); + } -/* Reset from the function's potential modifications. */ - -static void -ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED) -{ - if (pic_offset_table_rtx - && !ix86_use_pseudo_pic_reg ()) - SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM); - - if (TARGET_MACHO) - { - rtx_insn *insn = get_last_insn (); - rtx_insn *deleted_debug_label = NULL; - - /* Mach-O doesn't support labels at the end of objects, so if - it looks like we might want one, take special action. - First, collect any sequence of deleted debug labels. */ - while (insn - && NOTE_P (insn) - && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL) - { - /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL - notes only, instead set their CODE_LABEL_NUMBER to -1, - otherwise there would be code generation differences - in between -g and -g0. */ - if (NOTE_P (insn) && NOTE_KIND (insn) - == NOTE_INSN_DELETED_DEBUG_LABEL) - deleted_debug_label = insn; - insn = PREV_INSN (insn); - } - - /* If we have: - label: - barrier - then this needs to be detected, so skip past the barrier. */ - - if (insn && BARRIER_P (insn)) - insn = PREV_INSN (insn); + if (GET_CODE (x) == UNSPEC + && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend) + || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x)) + || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC + && !MEM_P (orig_x) && !addend))) + result = XVECEXP (x, 0, 0); - /* Up to now we've only seen notes or barriers. */ - if (insn) - { - if (LABEL_P (insn) - || (NOTE_P (insn) - && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)) - /* Trailing label. */ - fputs ("\tnop\n", file); - else if (cfun && ! cfun->is_thunk) - { - /* See if we have a completely empty function body, skipping - the special case of the picbase thunk emitted as asm. */ - while (insn && ! INSN_P (insn)) - insn = PREV_INSN (insn); - /* If we don't find any insns, we've got an empty function body; - I.e. completely empty - without a return or branch. This is - taken as the case where a function body has been removed - because it contains an inline __builtin_unreachable(). GCC - declares that reaching __builtin_unreachable() means UB so - we're not obliged to do anything special; however, we want - non-zero-sized function bodies. To meet this, and help the - user out, let's trap the case. 
*/ - if (insn == NULL) - fputs ("\tud2\n", file); - } - } - else if (deleted_debug_label) - for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn)) - if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL) - CODE_LABEL_NUMBER (insn) = -1; - } -} + if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x) + && !MEM_P (orig_x)) + result = XVECEXP (x, 0, 0); -/* Return a scratch register to use in the split stack prologue. The - split stack prologue is used for -fsplit-stack. It is the first - instructions in the function, even before the regular prologue. - The scratch register can be any caller-saved register which is not - used for parameters or for the static chain. */ + if (! result) + return ix86_delegitimize_tls_address (orig_x); -static unsigned int -split_stack_prologue_scratch_regno (void) -{ - if (TARGET_64BIT) - return R11_REG; - else + /* For (PLUS something CONST_INT) both find_base_{value,term} just + recurse on the first operand. */ + if (const_addend && !base_term_p) + result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend)); + if (reg_addend) + result = gen_rtx_PLUS (Pmode, reg_addend, result); + if (addend) { - bool is_fastcall, is_thiscall; - int regparm; - - is_fastcall = (lookup_attribute ("fastcall", - TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))) - != NULL); - is_thiscall = (lookup_attribute ("thiscall", - TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))) - != NULL); - regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl); - - if (is_fastcall) - { - if (DECL_STATIC_CHAIN (cfun->decl)) - { - sorry ("%<-fsplit-stack%> does not support fastcall with " - "nested function"); - return INVALID_REGNUM; - } - return AX_REG; - } - else if (is_thiscall) - { - if (!DECL_STATIC_CHAIN (cfun->decl)) - return DX_REG; - return AX_REG; - } - else if (regparm < 3) + /* If the rest of original X doesn't involve the PIC register, add + addend and subtract pic_offset_table_rtx. This can happen e.g. + for code like: + leal (%ebx, %ecx, 4), %ecx + ... + movl foo@GOTOFF(%ecx), %edx + in which case we return (%ecx - %ebx) + foo + or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg + and reload has completed. Don't do the latter for debug, + as _GLOBAL_OFFSET_TABLE_ can't be expressed in the assembly. */ + if (pic_offset_table_rtx + && (!reload_completed || !ix86_use_pseudo_pic_reg ())) + result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend), + pic_offset_table_rtx), + result); + else if (base_term_p + && pic_offset_table_rtx + && !TARGET_MACHO + && !TARGET_VXWORKS_RTP) { - if (!DECL_STATIC_CHAIN (cfun->decl)) - return CX_REG; - else - { - if (regparm >= 2) - { - sorry ("%<-fsplit-stack%> does not support 2 register " - "parameters for a nested function"); - return INVALID_REGNUM; - } - return DX_REG; - } + rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME); + tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp); + result = gen_rtx_PLUS (Pmode, tmp, result); } else - { - /* FIXME: We could make this work by pushing a register - around the addition and comparison. */ - sorry ("%<-fsplit-stack%> does not support 3 register parameters"); - return INVALID_REGNUM; - } + return orig_x; } + if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x)) + { + result = lowpart_subreg (GET_MODE (orig_x), result, Pmode); + if (result == NULL_RTX) + return orig_x; + } + return result; } -/* A SYMBOL_REF for the function which allocates new stackspace for - -fsplit-stack. 
*/ - -static GTY(()) rtx split_stack_fn; - -/* A SYMBOL_REF for the more stack function when using the large - model. */ +/* The normal instantiation of the above template. */ -static GTY(()) rtx split_stack_fn_large; +static rtx +ix86_delegitimize_address (rtx x) +{ + return ix86_delegitimize_address_1 (x, false); +} -/* Return location of the stack guard value in the TLS block. */ +/* If X is a machine specific address (i.e. a symbol or label being + referenced as a displacement from the GOT implemented using an + UNSPEC), then return the base term. Otherwise return X. */ rtx -ix86_split_stack_guard (void) +ix86_find_base_term (rtx x) { - int offset; - addr_space_t as = DEFAULT_TLS_SEG_REG; - rtx r; - - gcc_assert (flag_split_stack); + rtx term; -#ifdef TARGET_THREAD_SPLIT_STACK_OFFSET - offset = TARGET_THREAD_SPLIT_STACK_OFFSET; -#else - gcc_unreachable (); -#endif + if (TARGET_64BIT) + { + if (GET_CODE (x) != CONST) + return x; + term = XEXP (x, 0); + if (GET_CODE (term) == PLUS + && CONST_INT_P (XEXP (term, 1))) + term = XEXP (term, 0); + if (GET_CODE (term) != UNSPEC + || (XINT (term, 1) != UNSPEC_GOTPCREL + && XINT (term, 1) != UNSPEC_PCREL)) + return x; - r = GEN_INT (offset); - r = gen_const_mem (Pmode, r); - set_mem_addr_space (r, as); + return XVECEXP (term, 0, 0); + } - return r; + return ix86_delegitimize_address_1 (x, true); } -/* Handle -fsplit-stack. These are the first instructions in the - function, even before the regular prologue. */ +/* Return true if X shouldn't be emitted into the debug info. + Disallow UNSPECs other than @gotoff - we can't emit _GLOBAL_OFFSET_TABLE_ + symbol easily into the .debug_info section, so we need not to + delegitimize, but instead assemble as @gotoff. + Disallow _GLOBAL_OFFSET_TABLE_ SYMBOL_REF - the assembler magically + assembles that as _GLOBAL_OFFSET_TABLE_-. expression. */ -void -ix86_expand_split_stack_prologue (void) +static bool +ix86_const_not_ok_for_debug_p (rtx x) { - HOST_WIDE_INT allocate; - unsigned HOST_WIDE_INT args_size; - rtx_code_label *label; - rtx limit, current, allocate_rtx, call_fusage; - rtx_insn *call_insn; - rtx scratch_reg = NULL_RTX; - rtx_code_label *varargs_label = NULL; - rtx fn; - - gcc_assert (flag_split_stack && reload_completed); + if (GET_CODE (x) == UNSPEC && XINT (x, 1) != UNSPEC_GOTOFF) + return true; - ix86_finalize_stack_frame_flags (); - struct ix86_frame &frame = cfun->machine->frame; - allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET; + if (SYMBOL_REF_P (x) && strcmp (XSTR (x, 0), GOT_SYMBOL_NAME) == 0) + return true; - /* This is the label we will branch to if we have enough stack - space. We expect the basic block reordering pass to reverse this - branch if optimizing, so that we branch in the unlikely case. */ - label = gen_label_rtx (); - - /* We need to compare the stack pointer minus the frame size with - the stack boundary in the TCB. The stack boundary always gives - us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we - can compare directly. Otherwise we need to do an addition. 
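In plain C terms, the comparison described in the comment above amounts to the sketch below. The guard word really lives at TARGET_THREAD_SPLIT_STACK_OFFSET in the TLS segment and __morestack is reached with a custom calling convention rather than a normal C call, so the global and helper names here are invented stand-ins, not GCC or libgcc interfaces.

/* Conceptual sketch only.  */
#include <stdint.h>
#include <stdio.h>

/* Invented stand-in for the guard the TCB holds; in reality it is read
   through the TLS segment register, not a global.  */
static uintptr_t stack_guard;

static int
need_more_stack (uintptr_t sp, uintptr_t frame_size)
{
  /* When frame_size < SPLIT_STACK_AVAILABLE the prologue compares %sp
     against the guard directly; otherwise it first forms
     sp - frame_size in a scratch register.  Either way the test is:  */
  return sp - frame_size < stack_guard;
}

int
main (void)
{
  stack_guard = 0x1000;
  printf ("%d\n", need_more_stack (0x9000, 0x400));   /* 0: enough stack */
  printf ("%d\n", need_more_stack (0x1200, 0x400));   /* 1: call __morestack */
  return 0;
}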
*/ - - limit = ix86_split_stack_guard (); + return false; +} + +static void +put_condition_code (enum rtx_code code, machine_mode mode, bool reverse, + bool fp, FILE *file) +{ + const char *suffix; - if (allocate < SPLIT_STACK_AVAILABLE) - current = stack_pointer_rtx; - else + if (mode == CCFPmode) { - unsigned int scratch_regno; - rtx offset; + code = ix86_fp_compare_code_to_integer (code); + mode = CCmode; + } + if (reverse) + code = reverse_condition (code); - /* We need a scratch register to hold the stack pointer minus - the required frame size. Since this is the very start of the - function, the scratch register can be any caller-saved - register which is not used for parameters. */ - offset = GEN_INT (- allocate); - scratch_regno = split_stack_prologue_scratch_regno (); - if (scratch_regno == INVALID_REGNUM) - return; - scratch_reg = gen_rtx_REG (Pmode, scratch_regno); - if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode)) + switch (code) + { + case EQ: + gcc_assert (mode != CCGZmode); + switch (mode) { - /* We don't use ix86_gen_add3 in this case because it will - want to split to lea, but when not optimizing the insn - will not be split after this point. */ - emit_insn (gen_rtx_SET (scratch_reg, - gen_rtx_PLUS (Pmode, stack_pointer_rtx, - offset))); + case E_CCAmode: + suffix = "a"; + break; + case E_CCCmode: + suffix = "c"; + break; + case E_CCOmode: + suffix = "o"; + break; + case E_CCPmode: + suffix = "p"; + break; + case E_CCSmode: + suffix = "s"; + break; + default: + suffix = "e"; + break; + } + break; + case NE: + gcc_assert (mode != CCGZmode); + switch (mode) + { + case E_CCAmode: + suffix = "na"; + break; + case E_CCCmode: + suffix = "nc"; + break; + case E_CCOmode: + suffix = "no"; + break; + case E_CCPmode: + suffix = "np"; + break; + case E_CCSmode: + suffix = "ns"; + break; + default: + suffix = "ne"; + break; } + break; + case GT: + gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode); + suffix = "g"; + break; + case GTU: + /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers. + Those same assemblers have the same but opposite lossage on cmov. */ + if (mode == CCmode) + suffix = fp ? "nbe" : "a"; else + gcc_unreachable (); + break; + case LT: + switch (mode) { - emit_move_insn (scratch_reg, offset); - emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg, - stack_pointer_rtx)); + case E_CCNOmode: + case E_CCGOCmode: + suffix = "s"; + break; + + case E_CCmode: + case E_CCGCmode: + case E_CCGZmode: + suffix = "l"; + break; + + default: + gcc_unreachable (); } - current = scratch_reg; + break; + case LTU: + if (mode == CCmode || mode == CCGZmode) + suffix = "b"; + else if (mode == CCCmode) + suffix = fp ? "b" : "c"; + else + gcc_unreachable (); + break; + case GE: + switch (mode) + { + case E_CCNOmode: + case E_CCGOCmode: + suffix = "ns"; + break; + + case E_CCmode: + case E_CCGCmode: + case E_CCGZmode: + suffix = "ge"; + break; + + default: + gcc_unreachable (); + } + break; + case GEU: + if (mode == CCmode || mode == CCGZmode) + suffix = "nb"; + else if (mode == CCCmode) + suffix = fp ? "nb" : "nc"; + else + gcc_unreachable (); + break; + case LE: + gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode); + suffix = "le"; + break; + case LEU: + if (mode == CCmode) + suffix = "be"; + else + gcc_unreachable (); + break; + case UNORDERED: + suffix = fp ? "u" : "p"; + break; + case ORDERED: + suffix = fp ? 
"nu" : "np"; + break; + default: + gcc_unreachable (); } + fputs (suffix, file); +} - ix86_expand_branch (GEU, current, limit, label); - rtx_insn *jump_insn = get_last_insn (); - JUMP_LABEL (jump_insn) = label; +/* Print the name of register X to FILE based on its machine mode and number. + If CODE is 'w', pretend the mode is HImode. + If CODE is 'b', pretend the mode is QImode. + If CODE is 'k', pretend the mode is SImode. + If CODE is 'q', pretend the mode is DImode. + If CODE is 'x', pretend the mode is V4SFmode. + If CODE is 't', pretend the mode is V8SFmode. + If CODE is 'g', pretend the mode is V16SFmode. + If CODE is 'h', pretend the reg is the 'high' byte register. + If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op. + If CODE is 'd', duplicate the operand for AVX instruction. + If CODE is 'V', print naked full integer register name without %. + */ - /* Mark the jump as very likely to be taken. */ - add_reg_br_prob_note (jump_insn, profile_probability::very_likely ()); +void +print_reg (rtx x, int code, FILE *file) +{ + const char *reg; + int msize; + unsigned int regno; + bool duplicated; - if (split_stack_fn == NULL_RTX) + if (ASSEMBLER_DIALECT == ASM_ATT && code != 'V') + putc ('%', file); + + if (x == pc_rtx) { - split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack"); - SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL; + gcc_assert (TARGET_64BIT); + fputs ("rip", file); + return; } - fn = split_stack_fn; - /* Get more stack space. We pass in the desired stack space and the - size of the arguments to copy to the new stack. In 32-bit mode - we push the parameters; __morestack will return on a new stack - anyhow. In 64-bit mode we pass the parameters in r10 and - r11. */ - allocate_rtx = GEN_INT (allocate); - args_size = crtl->args.size >= 0 ? (HOST_WIDE_INT) crtl->args.size : 0; - call_fusage = NULL_RTX; - rtx pop = NULL_RTX; - if (TARGET_64BIT) + if (code == 'y' && STACK_TOP_P (x)) { - rtx reg10, reg11; + fputs ("st(0)", file); + return; + } - reg10 = gen_rtx_REG (Pmode, R10_REG); - reg11 = gen_rtx_REG (Pmode, R11_REG); + if (code == 'w') + msize = 2; + else if (code == 'b') + msize = 1; + else if (code == 'k') + msize = 4; + else if (code == 'q') + msize = 8; + else if (code == 'h') + msize = 0; + else if (code == 'x') + msize = 16; + else if (code == 't') + msize = 32; + else if (code == 'g') + msize = 64; + else + msize = GET_MODE_SIZE (GET_MODE (x)); - /* If this function uses a static chain, it will be in %r10. - Preserve it across the call to __morestack. */ - if (DECL_STATIC_CHAIN (cfun->decl)) - { - rtx rax; + regno = REGNO (x); - rax = gen_rtx_REG (word_mode, AX_REG); - emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG)); - use_reg (&call_fusage, rax); - } - - if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC) - && !TARGET_PECOFF) - { - HOST_WIDE_INT argval; - - gcc_assert (Pmode == DImode); - /* When using the large model we need to load the address - into a register, and we've run out of registers. So we - switch to a different calling convention, and we call a - different function: __morestack_large. We pass the - argument size in the upper 32 bits of r10 and pass the - frame size in the lower 32 bits. 
*/ - gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate); - gcc_assert ((args_size & 0xffffffff) == args_size); - - if (split_stack_fn_large == NULL_RTX) - { - split_stack_fn_large - = gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model"); - SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL; - } - if (ix86_cmodel == CM_LARGE_PIC) - { - rtx_code_label *label; - rtx x; - - label = gen_label_rtx (); - emit_label (label); - LABEL_PRESERVE_P (label) = 1; - emit_insn (gen_set_rip_rex64 (reg10, label)); - emit_insn (gen_set_got_offset_rex64 (reg11, label)); - emit_insn (ix86_gen_add3 (reg10, reg10, reg11)); - x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large), - UNSPEC_GOT); - x = gen_rtx_CONST (Pmode, x); - emit_move_insn (reg11, x); - x = gen_rtx_PLUS (Pmode, reg10, reg11); - x = gen_const_mem (Pmode, x); - emit_move_insn (reg11, x); - } - else - emit_move_insn (reg11, split_stack_fn_large); - - fn = reg11; - - argval = ((args_size << 16) << 16) + allocate; - emit_move_insn (reg10, GEN_INT (argval)); - } - else - { - emit_move_insn (reg10, allocate_rtx); - emit_move_insn (reg11, GEN_INT (args_size)); - use_reg (&call_fusage, reg11); - } - - use_reg (&call_fusage, reg10); + if (regno == ARG_POINTER_REGNUM + || regno == FRAME_POINTER_REGNUM + || regno == FPSR_REG) + { + output_operand_lossage + ("invalid use of register '%s'", reg_names[regno]); + return; } - else + else if (regno == FLAGS_REG) { - rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size))); - add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD)); - insn = emit_insn (gen_push (allocate_rtx)); - add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD)); - pop = GEN_INT (2 * UNITS_PER_WORD); + output_operand_lossage ("invalid use of asm flag output"); + return; } - call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn), - GEN_INT (UNITS_PER_WORD), constm1_rtx, - pop, false); - add_function_usage_to (call_insn, call_fusage); - if (!TARGET_64BIT) - add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0)); - /* Indicate that this function can't jump to non-local gotos. */ - make_reg_eh_region_note_nothrow_nononlocal (call_insn); - - /* In order to make call/return prediction work right, we now need - to execute a return instruction. See - libgcc/config/i386/morestack.S for the details on how this works. - - For flow purposes gcc must not see this as a return - instruction--we need control flow to continue at the subsequent - label. Therefore, we use an unspec. */ - gcc_assert (crtl->args.pops_args < 65536); - rtx_insn *ret_insn - = emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args))); - if ((flag_cf_protection & CF_BRANCH)) + if (code == 'V') { - /* Insert ENDBR since __morestack will jump back here via indirect - call. */ - rtx cet_eb = gen_nop_endbr (); - emit_insn_after (cet_eb, ret_insn); + if (GENERAL_REGNO_P (regno)) + msize = GET_MODE_SIZE (word_mode); + else + error ("% modifier on non-integer register"); } - /* If we are in 64-bit mode and this function uses a static chain, - we saved %r10 in %rax before calling _morestack. */ - if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl)) - emit_move_insn (gen_rtx_REG (word_mode, R10_REG), - gen_rtx_REG (word_mode, AX_REG)); + duplicated = code == 'd' && TARGET_AVX; - /* If this function calls va_start, we need to store a pointer to - the arguments on the old stack, because they may not have been - all copied to the new stack. 
At this point the old stack can be - found at the frame pointer value used by __morestack, because - __morestack has set that up before calling back to us. Here we - store that pointer in a scratch register, and in - ix86_expand_prologue we store the scratch register in a stack - slot. */ - if (cfun->machine->split_stack_varargs_pointer != NULL_RTX) + switch (msize) { - unsigned int scratch_regno; - rtx frame_reg; - int words; - - scratch_regno = split_stack_prologue_scratch_regno (); - scratch_reg = gen_rtx_REG (Pmode, scratch_regno); - frame_reg = gen_rtx_REG (Pmode, BP_REG); - - /* 64-bit: - fp -> old fp value - return address within this function - return address of caller of this function - stack arguments - So we add three words to get to the stack arguments. - - 32-bit: - fp -> old fp value - return address within this function - first argument to __morestack - second argument to __morestack - return address of caller of this function - stack arguments - So we add five words to get to the stack arguments. - */ - words = TARGET_64BIT ? 3 : 5; - emit_insn (gen_rtx_SET (scratch_reg, - gen_rtx_PLUS (Pmode, frame_reg, - GEN_INT (words * UNITS_PER_WORD)))); - - varargs_label = gen_label_rtx (); - emit_jump_insn (gen_jump (varargs_label)); - JUMP_LABEL (get_last_insn ()) = varargs_label; - - emit_barrier (); + case 16: + case 12: + case 8: + if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode)) + warning (0, "unsupported size for integer register"); + /* FALLTHRU */ + case 4: + if (LEGACY_INT_REGNO_P (regno)) + putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file); + /* FALLTHRU */ + case 2: + normal: + reg = hi_reg_name[regno]; + break; + case 1: + if (regno >= ARRAY_SIZE (qi_reg_name)) + goto normal; + if (!ANY_QI_REGNO_P (regno)) + error ("unsupported size for integer register"); + reg = qi_reg_name[regno]; + break; + case 0: + if (regno >= ARRAY_SIZE (qi_high_reg_name)) + goto normal; + reg = qi_high_reg_name[regno]; + break; + case 32: + case 64: + if (SSE_REGNO_P (regno)) + { + gcc_assert (!duplicated); + putc (msize == 32 ? 'y' : 'z', file); + reg = hi_reg_name[regno] + 1; + break; + } + goto normal; + default: + gcc_unreachable (); } - emit_label (label); - LABEL_NUSES (label) = 1; + fputs (reg, file); - /* If this function calls va_start, we now have to set the scratch - register for the case where we do not call __morestack. In this - case we need to set it based on the stack pointer. */ - if (cfun->machine->split_stack_varargs_pointer != NULL_RTX) + /* Irritatingly, AMD extended registers use + different naming convention: "r%d[bwd]" */ + if (REX_INT_REGNO_P (regno)) { - emit_insn (gen_rtx_SET (scratch_reg, - gen_rtx_PLUS (Pmode, stack_pointer_rtx, - GEN_INT (UNITS_PER_WORD)))); - - emit_label (varargs_label); - LABEL_NUSES (varargs_label) = 1; + gcc_assert (TARGET_64BIT); + switch (msize) + { + case 0: + error ("extended registers have no high halves"); + break; + case 1: + putc ('b', file); + break; + case 2: + putc ('w', file); + break; + case 4: + putc ('d', file); + break; + case 8: + /* no suffix */ + break; + default: + error ("unsupported operand size for extended register"); + break; + } + return; } -} -/* We may have to tell the dataflow pass that the split stack prologue - is initializing a scratch register. 
*/ - -static void -ix86_live_on_entry (bitmap regs) -{ - if (cfun->machine->split_stack_varargs_pointer != NULL_RTX) + if (duplicated) { - gcc_assert (flag_split_stack); - bitmap_set_bit (regs, split_stack_prologue_scratch_regno ()); + if (ASSEMBLER_DIALECT == ASM_ATT) + fprintf (file, ", %%%s", reg); + else + fprintf (file, ", %s", reg); } } - -/* Extract the parts of an RTL expression that is a valid memory address - for an instruction. Return 0 if the structure of the address is - grossly off. Return -1 if the address contains ASHIFT, so it is not - strictly valid, but still used for computing length of lea instruction. */ -int -ix86_decompose_address (rtx addr, struct ix86_address *out) -{ - rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX; - rtx base_reg, index_reg; - HOST_WIDE_INT scale = 1; - rtx scale_rtx = NULL_RTX; - rtx tmp; - int retval = 1; - addr_space_t seg = ADDR_SPACE_GENERIC; +/* Meaning of CODE: + L,W,B,Q,S,T -- print the opcode suffix for specified size of operand. + C -- print opcode suffix for set/cmov insn. + c -- like C, but print reversed condition + F,f -- likewise, but for floating-point. + O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.", + otherwise nothing + R -- print embedded rounding and sae. + r -- print only sae. + z -- print the opcode suffix for the size of the current operand. + Z -- likewise, with special suffixes for x87 instructions. + * -- print a star (in certain assembler syntax) + A -- print an absolute memory reference. + E -- print address with DImode register names if TARGET_64BIT. + w -- print the operand as if it's a "word" (HImode) even if it isn't. + s -- print a shift double count, followed by the assemblers argument + delimiter. + b -- print the QImode name of the register for the indicated operand. + %b0 would print %al if operands[0] is reg 0. + w -- likewise, print the HImode name of the register. + k -- likewise, print the SImode name of the register. + q -- likewise, print the DImode name of the register. + x -- likewise, print the V4SFmode name of the register. + t -- likewise, print the V8SFmode name of the register. + g -- likewise, print the V16SFmode name of the register. + h -- print the QImode name for a "high" register, either ah, bh, ch or dh. + y -- print "st(0)" instead of "st" as a register. + d -- print duplicated register operand for AVX instruction. + D -- print condition for SSE cmp instruction. + P -- if PIC, print an @PLT suffix. + p -- print raw symbol name. + X -- don't print any sort of PIC '@' suffix for a symbol. + & -- print some in-use local-dynamic symbol name. + H -- print a memory address offset by 8; used for sse high-parts + Y -- print condition for XOP pcom* instruction. + V -- print naked full integer register name without %. + + -- print a branch hint as 'cs' or 'ds' prefix + ; -- print a semicolon (after prefixes due to bug in older gas). + ~ -- print "i" if TARGET_AVX2, "f" otherwise. + ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode + M -- print addr32 prefix for TARGET_X32 with VSIB address. + ! -- print NOTRACK prefix for jxx/call/ret instructions if required. + */ - /* Allow zero-extended SImode addresses, - they will be emitted with addr32 prefix. 
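The single-letter size codes listed above ('b', 'w', 'k', 'q', ...) make print_reg emit the same register number under different names. A tiny invented illustration for register 0 (the %al/%ax/%eax/%rax family), not GCC code:

#include <stdio.h>

/* Sketch: requested operand size -> printed name for register 0.  */
static const char *
reg0_name (int msize)
{
  switch (msize)
    {
    case 1:  return "al";
    case 2:  return "ax";
    case 4:  return "eax";
    case 8:  return "rax";
    default: return "?";
    }
}

int
main (void)
{
  printf ("%%%s %%%s %%%s %%%s\n",
          reg0_name (1), reg0_name (2), reg0_name (4), reg0_name (8));
  return 0;
}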
*/ - if (TARGET_64BIT && GET_MODE (addr) == DImode) +void +ix86_print_operand (FILE *file, rtx x, int code) +{ + if (code) { - if (GET_CODE (addr) == ZERO_EXTEND - && GET_MODE (XEXP (addr, 0)) == SImode) - { - addr = XEXP (addr, 0); - if (CONST_INT_P (addr)) - return 0; - } - else if (GET_CODE (addr) == AND - && const_32bit_mask (XEXP (addr, 1), DImode)) + switch (code) { - addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode); - if (addr == NULL_RTX) - return 0; + case 'A': + switch (ASSEMBLER_DIALECT) + { + case ASM_ATT: + putc ('*', file); + break; - if (CONST_INT_P (addr)) - return 0; - } - } + case ASM_INTEL: + /* Intel syntax. For absolute addresses, registers should not + be surrounded by braces. */ + if (!REG_P (x)) + { + putc ('[', file); + ix86_print_operand (file, x, 0); + putc (']', file); + return; + } + break; - /* Allow SImode subregs of DImode addresses, - they will be emitted with addr32 prefix. */ - if (TARGET_64BIT && GET_MODE (addr) == SImode) - { - if (SUBREG_P (addr) - && GET_MODE (SUBREG_REG (addr)) == DImode) - { - addr = SUBREG_REG (addr); - if (CONST_INT_P (addr)) - return 0; - } - } + default: + gcc_unreachable (); + } - if (REG_P (addr)) - base = addr; - else if (SUBREG_P (addr)) - { - if (REG_P (SUBREG_REG (addr))) - base = addr; - else - return 0; - } - else if (GET_CODE (addr) == PLUS) - { - rtx addends[4], op; - int n = 0, i; + ix86_print_operand (file, x, 0); + return; - op = addr; - do - { - if (n >= 4) - return 0; - addends[n++] = XEXP (op, 1); - op = XEXP (op, 0); - } - while (GET_CODE (op) == PLUS); - if (n >= 4) - return 0; - addends[n] = op; + case 'E': + /* Wrap address in an UNSPEC to declare special handling. */ + if (TARGET_64BIT) + x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR); - for (i = n; i >= 0; --i) - { - op = addends[i]; - switch (GET_CODE (op)) - { - case MULT: - if (index) - return 0; - index = XEXP (op, 0); - scale_rtx = XEXP (op, 1); - break; + output_address (VOIDmode, x); + return; - case ASHIFT: - if (index) - return 0; - index = XEXP (op, 0); - tmp = XEXP (op, 1); - if (!CONST_INT_P (tmp)) - return 0; - scale = INTVAL (tmp); - if ((unsigned HOST_WIDE_INT) scale > 3) - return 0; - scale = 1 << scale; - break; + case 'L': + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('l', file); + return; - case ZERO_EXTEND: - op = XEXP (op, 0); - if (GET_CODE (op) != UNSPEC) - return 0; - /* FALLTHRU */ + case 'W': + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('w', file); + return; - case UNSPEC: - if (XINT (op, 1) == UNSPEC_TP - && TARGET_TLS_DIRECT_SEG_REFS - && seg == ADDR_SPACE_GENERIC) - seg = DEFAULT_TLS_SEG_REG; - else - return 0; - break; + case 'B': + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('b', file); + return; - case SUBREG: - if (!REG_P (SUBREG_REG (op))) - return 0; - /* FALLTHRU */ + case 'Q': + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('l', file); + return; - case REG: - if (!base) - base = op; - else if (!index) - index = op; - else - return 0; + case 'S': + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('s', file); + return; + + case 'T': + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('t', file); + return; + + case 'O': +#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX + if (ASSEMBLER_DIALECT != ASM_ATT) + return; + + switch (GET_MODE_SIZE (GET_MODE (x))) + { + case 2: + putc ('w', file); + break; + + case 4: + putc ('l', file); break; - case CONST: - case CONST_INT: - case SYMBOL_REF: - case LABEL_REF: - if (disp) - return 0; - disp = op; + case 8: + putc ('q', file); break; default: - return 0; + output_operand_lossage ("invalid operand 
size for operand " + "code 'O'"); + return; } - } - } - else if (GET_CODE (addr) == MULT) - { - index = XEXP (addr, 0); /* index*scale */ - scale_rtx = XEXP (addr, 1); - } - else if (GET_CODE (addr) == ASHIFT) - { - /* We're called for lea too, which implements ashift on occasion. */ - index = XEXP (addr, 0); - tmp = XEXP (addr, 1); - if (!CONST_INT_P (tmp)) - return 0; - scale = INTVAL (tmp); - if ((unsigned HOST_WIDE_INT) scale > 3) - return 0; - scale = 1 << scale; - retval = -1; - } - else - disp = addr; /* displacement */ - - if (index) - { - if (REG_P (index)) - ; - else if (SUBREG_P (index) - && REG_P (SUBREG_REG (index))) - ; - else - return 0; - } - - /* Extract the integral value of scale. */ - if (scale_rtx) - { - if (!CONST_INT_P (scale_rtx)) - return 0; - scale = INTVAL (scale_rtx); - } - base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base; - index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index; + putc ('.', file); +#endif + return; - /* Avoid useless 0 displacement. */ - if (disp == const0_rtx && (base || index)) - disp = NULL_RTX; + case 'z': + if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT) + { + /* Opcodes don't get size suffixes if using Intel opcodes. */ + if (ASSEMBLER_DIALECT == ASM_INTEL) + return; - /* Allow arg pointer and stack pointer as index if there is not scaling. */ - if (base_reg && index_reg && scale == 1 - && (REGNO (index_reg) == ARG_POINTER_REGNUM - || REGNO (index_reg) == FRAME_POINTER_REGNUM - || REGNO (index_reg) == SP_REG)) - { - std::swap (base, index); - std::swap (base_reg, index_reg); - } + switch (GET_MODE_SIZE (GET_MODE (x))) + { + case 1: + putc ('b', file); + return; - /* Special case: %ebp cannot be encoded as a base without a displacement. - Similarly %r13. */ - if (!disp && base_reg - && (REGNO (base_reg) == ARG_POINTER_REGNUM - || REGNO (base_reg) == FRAME_POINTER_REGNUM - || REGNO (base_reg) == BP_REG - || REGNO (base_reg) == R13_REG)) - disp = const0_rtx; + case 2: + putc ('w', file); + return; - /* Special case: on K6, [%esi] makes the instruction vector decoded. - Avoid this by transforming to [%esi+0]. - Reload calls address legitimization without cfun defined, so we need - to test cfun for being non-NULL. */ - if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun) - && base_reg && !index_reg && !disp - && REGNO (base_reg) == SI_REG) - disp = const0_rtx; + case 4: + putc ('l', file); + return; - /* Special case: encode reg+reg instead of reg*2. */ - if (!base && index && scale == 2) - base = index, base_reg = index_reg, scale = 1; + case 8: + putc ('q', file); + return; - /* Special case: scaling cannot be encoded without base or displacement. */ - if (!base && !disp && index && scale != 1) - disp = const0_rtx; + default: + output_operand_lossage ("invalid operand size for operand " + "code 'z'"); + return; + } + } - out->base = base; - out->index = index; - out->disp = disp; - out->scale = scale; - out->seg = seg; + if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT) + warning (0, "non-integer operand used with operand code %"); + /* FALLTHRU */ - return retval; -} - -/* Return cost of the memory address x. - For i386, it is better to use a complex address than let gcc copy - the address into a reg and make a new pseudo. But not if the address - requires to two regs - that would mean more pseudos with longer - lifetimes. 
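ix86_decompose_address above splits an address into the base + index*scale + disp form the encoding allows and then applies the special cases its comments spell out: the stack/arg/frame pointer may not sit in the index slot, %ebp/%r13 as base needs a displacement, reg*2 is re-expressed as reg+reg, and a bare scaled index gets an explicit zero displacement. A standalone sketch of just that canonicalization, on an invented struct rather than RTL:

#include <stdio.h>

/* Invented mirror of struct ix86_address, for illustration only.  */
struct addr_parts
{
  int base;        /* register number, -1 if absent */
  int index;       /* register number, -1 if absent */
  int scale;       /* 1, 2, 4 or 8 */
  long disp;       /* displacement value */
  int has_disp;    /* whether a displacement is present */
};

#define REG_SP 4   /* hypothetical numbering: stack pointer */
#define REG_BP 5   /* hypothetical numbering: %ebp / %r13 class */

static void
canonicalize (struct addr_parts *a)
{
  /* The stack pointer cannot be an index register; with scale 1 the
     two operands can simply be swapped.  */
  if (a->base >= 0 && a->index == REG_SP && a->scale == 1)
    { int t = a->base; a->base = a->index; a->index = t; }

  /* reg*2 is encoded more cheaply as reg+reg.  */
  if (a->base < 0 && a->index >= 0 && a->scale == 2)
    { a->base = a->index; a->scale = 1; }

  /* A scaled index with neither base nor displacement still needs a
     disp field, so add an explicit zero.  */
  if (a->base < 0 && !a->has_disp && a->index >= 0 && a->scale != 1)
    a->has_disp = 1;

  /* %ebp/%r13 as base cannot be encoded without a displacement.  */
  if (!a->has_disp && a->base == REG_BP)
    a->has_disp = 1;
}

int
main (void)
{
  struct addr_parts a = { .base = 0, .index = REG_SP, .scale = 1,
                          .disp = 8, .has_disp = 1 };
  canonicalize (&a);
  printf ("base=%d index=%d scale=%d disp=%ld\n",
          a.base, a.index, a.scale, a.disp);   /* %esp moved to base */
  return 0;
}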
*/ -static int -ix86_address_cost (rtx x, machine_mode, addr_space_t, bool) -{ - struct ix86_address parts; - int cost = 1; - int ok = ix86_decompose_address (x, &parts); + case 'Z': + /* 387 opcodes don't get size suffixes if using Intel opcodes. */ + if (ASSEMBLER_DIALECT == ASM_INTEL) + return; - gcc_assert (ok); + if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT) + { + switch (GET_MODE_SIZE (GET_MODE (x))) + { + case 2: +#ifdef HAVE_AS_IX86_FILDS + putc ('s', file); +#endif + return; - if (parts.base && SUBREG_P (parts.base)) - parts.base = SUBREG_REG (parts.base); - if (parts.index && SUBREG_P (parts.index)) - parts.index = SUBREG_REG (parts.index); + case 4: + putc ('l', file); + return; - /* Attempt to minimize number of registers in the address by increasing - address cost for each used register. We don't increase address cost - for "pic_offset_table_rtx". When a memopt with "pic_offset_table_rtx" - is not invariant itself it most likely means that base or index is not - invariant. Therefore only "pic_offset_table_rtx" could be hoisted out, - which is not profitable for x86. */ - if (parts.base - && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER) - && (current_pass->type == GIMPLE_PASS - || !pic_offset_table_rtx - || !REG_P (parts.base) - || REGNO (pic_offset_table_rtx) != REGNO (parts.base))) - cost++; + case 8: +#ifdef HAVE_AS_IX86_FILDQ + putc ('q', file); +#else + fputs ("ll", file); +#endif + return; - if (parts.index - && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER) - && (current_pass->type == GIMPLE_PASS - || !pic_offset_table_rtx - || !REG_P (parts.index) - || REGNO (pic_offset_table_rtx) != REGNO (parts.index))) - cost++; + default: + break; + } + } + else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT) + { + /* 387 opcodes don't get size suffixes + if the operands are registers. */ + if (STACK_REG_P (x)) + return; - /* AMD-K6 don't like addresses with ModR/M set to 00_xxx_100b, - since it's predecode logic can't detect the length of instructions - and it degenerates to vector decoded. Increase cost of such - addresses here. The penalty is minimally 2 cycles. It may be worthwhile - to split such addresses or even refuse such addresses at all. + switch (GET_MODE_SIZE (GET_MODE (x))) + { + case 4: + putc ('s', file); + return; - Following addressing modes are affected: - [base+scale*index] - [scale*index+disp] - [base+index] + case 8: + putc ('l', file); + return; - The first and last case may be avoidable by explicitly coding the zero in - memory address, but I don't have AMD-K6 machine handy to check this - theory. */ + case 12: + case 16: + putc ('t', file); + return; - if (TARGET_K6 - && ((!parts.disp && parts.base && parts.index && parts.scale != 1) - || (parts.disp && !parts.base && parts.index && parts.scale != 1) - || (!parts.disp && parts.base && parts.index && parts.scale == 1))) - cost += 10; + default: + break; + } + } + else + { + output_operand_lossage ("invalid operand type used with " + "operand code 'Z'"); + return; + } - return cost; -} - -/* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as - this is used for to form addresses to local data when -fPIC is in - use. 
*/ + output_operand_lossage ("invalid operand size for operand code 'Z'"); + return; -static bool -darwin_local_data_pic (rtx disp) -{ - return (GET_CODE (disp) == UNSPEC - && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET); -} + case 'd': + case 'b': + case 'w': + case 'k': + case 'q': + case 'h': + case 't': + case 'g': + case 'y': + case 'x': + case 'X': + case 'P': + case 'p': + case 'V': + break; -/* True if operand X should be loaded from GOT. */ + case 's': + if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT) + { + ix86_print_operand (file, x, 0); + fputs (", ", file); + } + return; -bool -ix86_force_load_from_GOT_p (rtx x) -{ - return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X) - && !TARGET_PECOFF && !TARGET_MACHO - && !flag_pic - && ix86_cmodel != CM_LARGE - && GET_CODE (x) == SYMBOL_REF - && SYMBOL_REF_FUNCTION_P (x) - && (!flag_plt - || (SYMBOL_REF_DECL (x) - && lookup_attribute ("noplt", - DECL_ATTRIBUTES (SYMBOL_REF_DECL (x))))) - && !SYMBOL_REF_LOCAL_P (x)); -} - -/* Determine if a given RTX is a valid constant. We already know this - satisfies CONSTANT_P. */ - -static bool -ix86_legitimate_constant_p (machine_mode mode, rtx x) -{ - switch (GET_CODE (x)) - { - case CONST: - x = XEXP (x, 0); - - if (GET_CODE (x) == PLUS) - { - if (!CONST_INT_P (XEXP (x, 1))) - return false; - x = XEXP (x, 0); - } + case 'Y': + switch (GET_CODE (x)) + { + case NE: + fputs ("neq", file); + break; + case EQ: + fputs ("eq", file); + break; + case GE: + case GEU: + fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file); + break; + case GT: + case GTU: + fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file); + break; + case LE: + case LEU: + fputs ("le", file); + break; + case LT: + case LTU: + fputs ("lt", file); + break; + case UNORDERED: + fputs ("unord", file); + break; + case ORDERED: + fputs ("ord", file); + break; + case UNEQ: + fputs ("ueq", file); + break; + case UNGE: + fputs ("nlt", file); + break; + case UNGT: + fputs ("nle", file); + break; + case UNLE: + fputs ("ule", file); + break; + case UNLT: + fputs ("ult", file); + break; + case LTGT: + fputs ("une", file); + break; + default: + output_operand_lossage ("operand is not a condition code, " + "invalid operand code 'Y'"); + return; + } + return; - if (TARGET_MACHO && darwin_local_data_pic (x)) - return true; + case 'D': + /* Little bit of braindamage here. The SSE compare instructions + does use completely different names for the comparisons that the + fp conditional moves. 
*/ + switch (GET_CODE (x)) + { + case UNEQ: + if (TARGET_AVX) + { + fputs ("eq_us", file); + break; + } + /* FALLTHRU */ + case EQ: + fputs ("eq", file); + break; + case UNLT: + if (TARGET_AVX) + { + fputs ("nge", file); + break; + } + /* FALLTHRU */ + case LT: + fputs ("lt", file); + break; + case UNLE: + if (TARGET_AVX) + { + fputs ("ngt", file); + break; + } + /* FALLTHRU */ + case LE: + fputs ("le", file); + break; + case UNORDERED: + fputs ("unord", file); + break; + case LTGT: + if (TARGET_AVX) + { + fputs ("neq_oq", file); + break; + } + /* FALLTHRU */ + case NE: + fputs ("neq", file); + break; + case GE: + if (TARGET_AVX) + { + fputs ("ge", file); + break; + } + /* FALLTHRU */ + case UNGE: + fputs ("nlt", file); + break; + case GT: + if (TARGET_AVX) + { + fputs ("gt", file); + break; + } + /* FALLTHRU */ + case UNGT: + fputs ("nle", file); + break; + case ORDERED: + fputs ("ord", file); + break; + default: + output_operand_lossage ("operand is not a condition code, " + "invalid operand code 'D'"); + return; + } + return; - /* Only some unspecs are valid as "constants". */ - if (GET_CODE (x) == UNSPEC) - switch (XINT (x, 1)) - { - case UNSPEC_GOT: - case UNSPEC_GOTOFF: - case UNSPEC_PLTOFF: - return TARGET_64BIT; - case UNSPEC_TPOFF: - case UNSPEC_NTPOFF: - x = XVECEXP (x, 0, 0); - return (GET_CODE (x) == SYMBOL_REF - && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC); - case UNSPEC_DTPOFF: - x = XVECEXP (x, 0, 0); - return (GET_CODE (x) == SYMBOL_REF - && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC); - default: - return false; - } + case 'F': + case 'f': +#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('.', file); + gcc_fallthrough (); +#endif - /* We must have drilled down to a symbol. */ - if (GET_CODE (x) == LABEL_REF) - return true; - if (GET_CODE (x) != SYMBOL_REF) - return false; - /* FALLTHRU */ + case 'C': + case 'c': + if (!COMPARISON_P (x)) + { + output_operand_lossage ("operand is not a condition code, " + "invalid operand code '%c'", code); + return; + } + put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), + code == 'c' || code == 'f', + code == 'F' || code == 'f', + file); + return; - case SYMBOL_REF: - /* TLS symbols are never valid. */ - if (SYMBOL_REF_TLS_MODEL (x)) - return false; + case 'H': + if (!offsettable_memref_p (x)) + { + output_operand_lossage ("operand is not an offsettable memory " + "reference, invalid operand code 'H'"); + return; + } + /* It doesn't actually matter what mode we use here, as we're + only going to use this for printing. */ + x = adjust_address_nv (x, DImode, 8); + /* Output 'qword ptr' for intel assembler dialect. */ + if (ASSEMBLER_DIALECT == ASM_INTEL) + code = 'q'; + break; - /* DLLIMPORT symbols are never valid. */ - if (TARGET_DLLIMPORT_DECL_ATTRIBUTES - && SYMBOL_REF_DLLIMPORT_P (x)) - return false; + case 'K': + if (!CONST_INT_P (x)) + { + output_operand_lossage ("operand is not an integer, invalid " + "operand code 'K'"); + return; + } -#if TARGET_MACHO - /* mdynamic-no-pic */ - if (MACHO_DYNAMIC_NO_PIC_P) - return machopic_symbol_defined_p (x); + if (INTVAL (x) & IX86_HLE_ACQUIRE) +#ifdef HAVE_AS_IX86_HLE + fputs ("xacquire ", file); +#else + fputs ("\n" ASM_BYTE "0xf2\n\t", file); #endif + else if (INTVAL (x) & IX86_HLE_RELEASE) +#ifdef HAVE_AS_IX86_HLE + fputs ("xrelease ", file); +#else + fputs ("\n" ASM_BYTE "0xf3\n\t", file); +#endif + /* We do not want to print value of the operand. 
*/ + return; - /* External function address should be loaded - via the GOT slot to avoid PLT. */ - if (ix86_force_load_from_GOT_p (x)) - return false; - - break; + case 'N': + if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x))) + fputs ("{z}", file); + return; - CASE_CONST_SCALAR_INT: - switch (mode) - { - case E_TImode: - if (TARGET_64BIT) - return true; - /* FALLTHRU */ - case E_OImode: - case E_XImode: - if (!standard_sse_constant_p (x, mode)) - return false; - default: - break; - } - break; + case 'r': + if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE) + { + output_operand_lossage ("operand is not a specific integer, " + "invalid operand code 'r'"); + return; + } - case CONST_VECTOR: - if (!standard_sse_constant_p (x, mode)) - return false; + if (ASSEMBLER_DIALECT == ASM_INTEL) + fputs (", ", file); - default: - break; - } + fputs ("{sae}", file); - /* Otherwise we handle everything else in the move patterns. */ - return true; -} + if (ASSEMBLER_DIALECT == ASM_ATT) + fputs (", ", file); -/* Determine if it's legal to put X into the constant pool. This - is not possible for the address of thread-local symbols, which - is checked above. */ + return; -static bool -ix86_cannot_force_const_mem (machine_mode mode, rtx x) -{ - /* We can put any immediate constant in memory. */ - switch (GET_CODE (x)) - { - CASE_CONST_ANY: - return false; + case 'R': + if (!CONST_INT_P (x)) + { + output_operand_lossage ("operand is not an integer, invalid " + "operand code 'R'"); + return; + } - default: - break; - } + if (ASSEMBLER_DIALECT == ASM_INTEL) + fputs (", ", file); - return !ix86_legitimate_constant_p (mode, x); -} + switch (INTVAL (x)) + { + case ROUND_NEAREST_INT | ROUND_SAE: + fputs ("{rn-sae}", file); + break; + case ROUND_NEG_INF | ROUND_SAE: + fputs ("{rd-sae}", file); + break; + case ROUND_POS_INF | ROUND_SAE: + fputs ("{ru-sae}", file); + break; + case ROUND_ZERO | ROUND_SAE: + fputs ("{rz-sae}", file); + break; + default: + output_operand_lossage ("operand is not a specific integer, " + "invalid operand code 'R'"); + } -/* Nonzero if the symbol is marked as dllimport, or as stub-variable, - otherwise zero. */ + if (ASSEMBLER_DIALECT == ASM_ATT) + fputs (", ", file); -static bool -is_imported_p (rtx x) -{ - if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES - || GET_CODE (x) != SYMBOL_REF) - return false; + return; - return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x); -} + case '*': + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('*', file); + return; + case '&': + { + const char *name = get_some_local_dynamic_name (); + if (name == NULL) + output_operand_lossage ("'%%&' used without any " + "local dynamic TLS references"); + else + assemble_name (file, name); + return; + } -/* Nonzero if the constant value X is a legitimate general operand - when generating PIC code. It is given that flag_pic is on and - that X satisfies CONSTANT_P. */ + case '+': + { + rtx x; -bool -legitimate_pic_operand_p (rtx x) -{ - rtx inner; + if (!optimize + || optimize_function_for_size_p (cfun) + || !TARGET_BRANCH_PREDICTION_HINTS) + return; - switch (GET_CODE (x)) - { - case CONST: - inner = XEXP (x, 0); - if (GET_CODE (inner) == PLUS - && CONST_INT_P (XEXP (inner, 1))) - inner = XEXP (inner, 0); + x = find_reg_note (current_output_insn, REG_BR_PROB, 0); + if (x) + { + int pred_val = profile_probability::from_reg_br_prob_note + (XINT (x, 0)).to_reg_br_prob_base (); - /* Only some unspecs are valid as "constants". 
*/ - if (GET_CODE (inner) == UNSPEC) - switch (XINT (inner, 1)) - { - case UNSPEC_GOT: - case UNSPEC_GOTOFF: - case UNSPEC_PLTOFF: - return TARGET_64BIT; - case UNSPEC_TPOFF: - x = XVECEXP (inner, 0, 0); - return (GET_CODE (x) == SYMBOL_REF - && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC); - case UNSPEC_MACHOPIC_OFFSET: - return legitimate_pic_address_disp_p (x); - default: - return false; + if (pred_val < REG_BR_PROB_BASE * 45 / 100 + || pred_val > REG_BR_PROB_BASE * 55 / 100) + { + bool taken = pred_val > REG_BR_PROB_BASE / 2; + bool cputaken + = final_forward_branch_p (current_output_insn) == 0; + + /* Emit hints only in the case default branch prediction + heuristics would fail. */ + if (taken != cputaken) + { + /* We use 3e (DS) prefix for taken branches and + 2e (CS) prefix for not taken branches. */ + if (taken) + fputs ("ds ; ", file); + else + fputs ("cs ; ", file); + } + } + } + return; } - /* FALLTHRU */ - case SYMBOL_REF: - case LABEL_REF: - return legitimate_pic_address_disp_p (x); + case ';': +#ifndef HAVE_AS_IX86_REP_LOCK_PREFIX + putc (';', file); +#endif + return; - default: - return true; - } -} + case '~': + putc (TARGET_AVX2 ? 'i' : 'f', file); + return; -/* Determine if a given CONST RTX is a valid memory displacement - in PIC mode. */ + case 'M': + if (TARGET_X32) + { + /* NB: 32-bit indices in VSIB address are sign-extended + to 64 bits. In x32, if 32-bit address 0xf7fa3010 is + sign-extended to 0xfffffffff7fa3010 which is invalid + address. Add addr32 prefix if there is no base + register nor symbol. */ + bool ok; + struct ix86_address parts; + ok = ix86_decompose_address (x, &parts); + gcc_assert (ok && parts.index == NULL_RTX); + if (parts.base == NULL_RTX + && (parts.disp == NULL_RTX + || !symbolic_operand (parts.disp, + GET_MODE (parts.disp)))) + fputs ("addr32 ", file); + } + return; -bool -legitimate_pic_address_disp_p (rtx disp) -{ - bool saw_plus; + case '^': + if (TARGET_64BIT && Pmode != word_mode) + fputs ("addr32 ", file); + return; - /* In 64bit mode we can allow direct addresses of symbols and labels - when they are not dynamic symbols. */ - if (TARGET_64BIT) + case '!': + if (ix86_notrack_prefixed_insn_p (current_output_insn)) + fputs ("notrack ", file); + return; + + default: + output_operand_lossage ("invalid operand code '%c'", code); + } + } + + if (REG_P (x)) + print_reg (x, code, file); + + else if (MEM_P (x)) { - rtx op0 = disp, op1; + rtx addr = XEXP (x, 0); - switch (GET_CODE (disp)) + /* No `byte ptr' prefix for call instructions ... */ + if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P') { - case LABEL_REF: - return true; + machine_mode mode = GET_MODE (x); + const char *size; - case CONST: - if (GET_CODE (XEXP (disp, 0)) != PLUS) - break; - op0 = XEXP (XEXP (disp, 0), 0); - op1 = XEXP (XEXP (disp, 0), 1); - if (!CONST_INT_P (op1)) - break; - if (GET_CODE (op0) == UNSPEC - && (XINT (op0, 1) == UNSPEC_DTPOFF - || XINT (op0, 1) == UNSPEC_NTPOFF) - && trunc_int_for_mode (INTVAL (op1), SImode) == INTVAL (op1)) - return true; - if (INTVAL (op1) >= 16*1024*1024 - || INTVAL (op1) < -16*1024*1024) - break; - if (GET_CODE (op0) == LABEL_REF) - return true; - if (GET_CODE (op0) == CONST - && GET_CODE (XEXP (op0, 0)) == UNSPEC - && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL) - return true; - if (GET_CODE (op0) == UNSPEC - && XINT (op0, 1) == UNSPEC_PCREL) - return true; - if (GET_CODE (op0) != SYMBOL_REF) - break; - /* FALLTHRU */ - - case SYMBOL_REF: - /* TLS references should always be enclosed in UNSPEC. 
- The dllimported symbol needs always to be resolved. */ - if (SYMBOL_REF_TLS_MODEL (op0) - || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0))) - return false; - - if (TARGET_PECOFF) - { - if (is_imported_p (op0)) - return true; - - if (SYMBOL_REF_FAR_ADDR_P (op0) - || !SYMBOL_REF_LOCAL_P (op0)) + /* Check for explicit size override codes. */ + if (code == 'b') + size = "BYTE"; + else if (code == 'w') + size = "WORD"; + else if (code == 'k') + size = "DWORD"; + else if (code == 'q') + size = "QWORD"; + else if (code == 'x') + size = "XMMWORD"; + else if (code == 't') + size = "YMMWORD"; + else if (code == 'g') + size = "ZMMWORD"; + else if (mode == BLKmode) + /* ... or BLKmode operands, when not overridden. */ + size = NULL; + else + switch (GET_MODE_SIZE (mode)) + { + case 1: size = "BYTE"; break; + case 2: size = "WORD"; break; + case 4: size = "DWORD"; break; + case 8: size = "QWORD"; break; + case 12: size = "TBYTE"; break; + case 16: + if (mode == XFmode) + size = "TBYTE"; + else + size = "XMMWORD"; break; - - /* Function-symbols need to be resolved only for - large-model. - For the small-model we don't need to resolve anything - here. */ - if ((ix86_cmodel != CM_LARGE_PIC - && SYMBOL_REF_FUNCTION_P (op0)) - || ix86_cmodel == CM_SMALL_PIC) - return true; - /* Non-external symbols don't need to be resolved for - large, and medium-model. */ - if ((ix86_cmodel == CM_LARGE_PIC - || ix86_cmodel == CM_MEDIUM_PIC) - && !SYMBOL_REF_EXTERNAL_P (op0)) - return true; + case 32: size = "YMMWORD"; break; + case 64: size = "ZMMWORD"; break; + default: + gcc_unreachable (); + } + if (size) + { + fputs (size, file); + fputs (" PTR ", file); } - else if (!SYMBOL_REF_FAR_ADDR_P (op0) - && (SYMBOL_REF_LOCAL_P (op0) - || (HAVE_LD_PIE_COPYRELOC - && flag_pie - && !SYMBOL_REF_WEAK (op0) - && !SYMBOL_REF_FUNCTION_P (op0))) - && ix86_cmodel != CM_LARGE_PIC) - return true; - break; - - default: - break; } + + if (this_is_asm_operands && ! address_operand (addr, VOIDmode)) + output_operand_lossage ("invalid constraints for operand"); + else + ix86_print_operand_address_as + (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P'); } - if (GET_CODE (disp) != CONST) - return false; - disp = XEXP (disp, 0); - if (TARGET_64BIT) + else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode) { - /* We are unsafe to allow PLUS expressions. This limit allowed distance - of GOT tables. We should not need these anyway. */ - if (GET_CODE (disp) != UNSPEC - || (XINT (disp, 1) != UNSPEC_GOTPCREL - && XINT (disp, 1) != UNSPEC_GOTOFF - && XINT (disp, 1) != UNSPEC_PCREL - && XINT (disp, 1) != UNSPEC_PLTOFF)) - return false; + long l; - if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF - && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF) - return false; - return true; + REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l); + + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('$', file); + /* Sign extend 32bit SFmode immediate to 8 bytes. 
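+	 For example, -1.0f has the single precision bit pattern 0xbf800000
+	 and is printed as 0xffffffffbf800000 under the 'q' override, but
+	 as 0xbf800000 otherwise.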
*/ + if (code == 'q') + fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x", + (unsigned long long) (int) l); + else + fprintf (file, "0x%08x", (unsigned int) l); } - saw_plus = false; - if (GET_CODE (disp) == PLUS) + else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode) { - if (!CONST_INT_P (XEXP (disp, 1))) - return false; - disp = XEXP (disp, 0); - saw_plus = true; - } + long l[2]; - if (TARGET_MACHO && darwin_local_data_pic (disp)) - return true; + REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l); - if (GET_CODE (disp) != UNSPEC) - return false; + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('$', file); + fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff); + } - switch (XINT (disp, 1)) + /* These float cases don't actually occur as immediate operands. */ + else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode) { - case UNSPEC_GOT: - if (saw_plus) - return false; - /* We need to check for both symbols and labels because VxWorks loads - text labels with @GOT rather than @GOTOFF. See gotoff_operand for - details. */ - return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF - || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF); - case UNSPEC_GOTOFF: - /* Refuse GOTOFF in 64bit mode since it is always 64bit when used. - While ABI specify also 32bit relocation but we don't produce it in - small PIC model at all. */ - if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF - || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF) - && !TARGET_64BIT) - return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode); - return false; - case UNSPEC_GOTTPOFF: - case UNSPEC_GOTNTPOFF: - case UNSPEC_INDNTPOFF: - if (saw_plus) - return false; - disp = XVECEXP (disp, 0, 0); - return (GET_CODE (disp) == SYMBOL_REF - && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC); - case UNSPEC_NTPOFF: - disp = XVECEXP (disp, 0, 0); - return (GET_CODE (disp) == SYMBOL_REF - && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC); - case UNSPEC_DTPOFF: - disp = XVECEXP (disp, 0, 0); - return (GET_CODE (disp) == SYMBOL_REF - && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC); + char dstr[30]; + + real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1); + fputs (dstr, file); } - return false; -} + else + { + /* We have patterns that allow zero sets of memory, for instance. + In 64-bit mode, we should probably support all 8-byte vectors, + since we can in fact encode that into an immediate. */ + if (GET_CODE (x) == CONST_VECTOR) + { + if (x != CONST0_RTX (GET_MODE (x))) + output_operand_lossage ("invalid vector immediate"); + x = const0_rtx; + } -/* Determine if op is suitable RTX for an address register. - Return naked register if a register or a register subreg is - found, otherwise return NULL_RTX. 
*/ + if (code != 'P' && code != 'p') + { + if (CONST_INT_P (x)) + { + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('$', file); + } + else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF + || GET_CODE (x) == LABEL_REF) + { + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('$', file); + else + fputs ("OFFSET FLAT:", file); + } + } + if (CONST_INT_P (x)) + fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); + else if (flag_pic || MACHOPIC_INDIRECT) + output_pic_addr_const (file, x, code); + else + output_addr_const (file, x); + } +} -static rtx -ix86_validate_address_register (rtx op) +static bool +ix86_print_operand_punct_valid_p (unsigned char code) { - machine_mode mode = GET_MODE (op); + return (code == '*' || code == '+' || code == '&' || code == ';' + || code == '~' || code == '^' || code == '!'); +} + +/* Print a memory operand whose address is ADDR. */ - /* Only SImode or DImode registers can form the address. */ - if (mode != SImode && mode != DImode) - return NULL_RTX; +static void +ix86_print_operand_address_as (FILE *file, rtx addr, + addr_space_t as, bool no_rip) +{ + struct ix86_address parts; + rtx base, index, disp; + int scale; + int ok; + bool vsib = false; + int code = 0; - if (REG_P (op)) - return op; - else if (SUBREG_P (op)) + if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR) { - rtx reg = SUBREG_REG (op); - - if (!REG_P (reg)) - return NULL_RTX; - - mode = GET_MODE (reg); - - /* Don't allow SUBREGs that span more than a word. It can - lead to spill failures when the register is one word out - of a two word structure. */ - if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) - return NULL_RTX; - - /* Allow only SUBREGs of non-eliminable hard registers. */ - if (register_no_elim_operand (reg, mode)) - return reg; + ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts); + gcc_assert (parts.index == NULL_RTX); + parts.index = XVECEXP (addr, 0, 1); + parts.scale = INTVAL (XVECEXP (addr, 0, 2)); + addr = XVECEXP (addr, 0, 0); + vsib = true; } + else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR) + { + gcc_assert (TARGET_64BIT); + ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts); + code = 'q'; + } + else + ok = ix86_decompose_address (addr, &parts); - /* Op is not a register. */ - return NULL_RTX; -} - -/* Recognizes RTL expressions that are valid memory addresses for an - instruction. The MODE argument is the machine mode for the MEM - expression that wants to use this address. - - It only recognizes address in canonical form. LEGITIMIZE_ADDRESS should - convert common non-canonical forms to canonical form so that they will - be recognized. */ - -static bool -ix86_legitimate_address_p (machine_mode, rtx addr, bool strict) -{ - struct ix86_address parts; - rtx base, index, disp; - HOST_WIDE_INT scale; - addr_space_t seg; - - if (ix86_decompose_address (addr, &parts) <= 0) - /* Decomposition failed. */ - return false; + gcc_assert (ok); base = parts.base; index = parts.index; disp = parts.disp; scale = parts.scale; - seg = parts.seg; - /* Validate base register. */ - if (base) - { - rtx reg = ix86_validate_address_register (base); + if (ADDR_SPACE_GENERIC_P (as)) + as = parts.seg; + else + gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg)); - if (reg == NULL_RTX) - return false; + if (!ADDR_SPACE_GENERIC_P (as)) + { + if (ASSEMBLER_DIALECT == ASM_ATT) + putc ('%', file); - if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg)) - || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg))) - /* Base is not valid. 
*/ - return false; + switch (as) + { + case ADDR_SPACE_SEG_FS: + fputs ("fs:", file); + break; + case ADDR_SPACE_SEG_GS: + fputs ("gs:", file); + break; + default: + gcc_unreachable (); + } } - /* Validate index register. */ - if (index) + /* Use one byte shorter RIP relative addressing for 64bit mode. */ + if (TARGET_64BIT && !base && !index && !no_rip) { - rtx reg = ix86_validate_address_register (index); + rtx symbol = disp; - if (reg == NULL_RTX) - return false; + if (GET_CODE (disp) == CONST + && GET_CODE (XEXP (disp, 0)) == PLUS + && CONST_INT_P (XEXP (XEXP (disp, 0), 1))) + symbol = XEXP (XEXP (disp, 0), 0); - if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg)) - || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg))) - /* Index is not valid. */ - return false; + if (GET_CODE (symbol) == LABEL_REF + || (GET_CODE (symbol) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (symbol) == 0)) + base = pc_rtx; } - /* Index and base should have the same mode. */ - if (base && index - && GET_MODE (base) != GET_MODE (index)) - return false; - - /* Address override works only on the (%reg) part of %fs:(%reg). */ - if (seg != ADDR_SPACE_GENERIC - && ((base && GET_MODE (base) != word_mode) - || (index && GET_MODE (index) != word_mode))) - return false; - - /* Validate scale factor. */ - if (scale != 1) + if (!base && !index) { - if (!index) - /* Scale without index. */ - return false; - - if (scale != 2 && scale != 4 && scale != 8) - /* Scale is not a valid multiplier. */ - return false; + /* Displacement only requires special attention. */ + if (CONST_INT_P (disp)) + { + if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as)) + fputs ("ds:", file); + fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp)); + } + /* Load the external function address via the GOT slot to avoid PLT. */ + else if (GET_CODE (disp) == CONST + && GET_CODE (XEXP (disp, 0)) == UNSPEC + && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL + || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT) + && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0))) + output_pic_addr_const (file, disp, 0); + else if (flag_pic) + output_pic_addr_const (file, disp, 0); + else + output_addr_const (file, disp); } - - /* Validate displacement. */ - if (disp) + else { - if (GET_CODE (disp) == CONST - && GET_CODE (XEXP (disp, 0)) == UNSPEC - && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET) - switch (XINT (XEXP (disp, 0), 1)) - { - /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit - when used. While ABI specify also 32bit relocations, we - don't produce them at all and use IP relative instead. - Allow GOT in 32bit mode for both PIC and non-PIC if symbol - should be loaded via GOT. */ - case UNSPEC_GOT: - if (!TARGET_64BIT - && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0))) - goto is_legitimate_pic; - /* FALLTHRU */ - case UNSPEC_GOTOFF: - gcc_assert (flag_pic); - if (!TARGET_64BIT) - goto is_legitimate_pic; - - /* 64bit address unspec. */ - return false; - - case UNSPEC_GOTPCREL: - if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0))) - goto is_legitimate_pic; - /* FALLTHRU */ - case UNSPEC_PCREL: - gcc_assert (flag_pic); - goto is_legitimate_pic; - - case UNSPEC_GOTTPOFF: - case UNSPEC_GOTNTPOFF: - case UNSPEC_INDNTPOFF: - case UNSPEC_NTPOFF: - case UNSPEC_DTPOFF: - break; + /* Print SImode register names to force addr32 prefix. 
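+	     For example, a zero-extended or SImode subreg address in
+	     64-bit mode is printed with 32-bit register names (%eax
+	     rather than %rax in AT&T syntax), which makes the assembler
+	     emit the addr32 prefix.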
*/ + if (SImode_address_operand (addr, VOIDmode)) + { + if (flag_checking) + { + gcc_assert (TARGET_64BIT); + switch (GET_CODE (addr)) + { + case SUBREG: + gcc_assert (GET_MODE (addr) == SImode); + gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode); + break; + case ZERO_EXTEND: + case AND: + gcc_assert (GET_MODE (addr) == DImode); + break; + default: + gcc_unreachable (); + } + } + gcc_assert (!code); + code = 'k'; + } + else if (code == 0 + && TARGET_X32 + && disp + && CONST_INT_P (disp) + && INTVAL (disp) < -16*1024*1024) + { + /* X32 runs in 64-bit mode, where displacement, DISP, in + address DISP(%r64), is encoded as 32-bit immediate sign- + extended from 32-bit to 64-bit. For -0x40000300(%r64), + address is %r64 + 0xffffffffbffffd00. When %r64 < + 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64, + which is invalid for x32. The correct address is %r64 + - 0x40000300 == 0xf7ffdd64. To properly encode + -0x40000300(%r64) for x32, we zero-extend negative + displacement by forcing addr32 prefix which truncates + 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should + zero-extend all negative displacements, including -1(%rsp). + However, for small negative displacements, sign-extension + won't cause overflow. We only zero-extend negative + displacements if they < -16*1024*1024, which is also used + to check legitimate address displacements for PIC. */ + code = 'k'; + } - default: - /* Invalid address unspec. */ - return false; - } + /* Since the upper 32 bits of RSP are always zero for x32, + we can encode %esp as %rsp to avoid 0x67 prefix if + there is no index register. */ + if (TARGET_X32 && Pmode == SImode + && !index && base && REG_P (base) && REGNO (base) == SP_REG) + code = 'q'; - else if (SYMBOLIC_CONST (disp) - && (flag_pic - || (TARGET_MACHO -#if TARGET_MACHO - && MACHOPIC_INDIRECT - && !machopic_operand_p (disp) -#endif - ))) + if (ASSEMBLER_DIALECT == ASM_ATT) { - - is_legitimate_pic: - if (TARGET_64BIT && (index || base)) + if (disp) { - /* foo@dtpoff(%rX) is ok. */ - if (GET_CODE (disp) != CONST - || GET_CODE (XEXP (disp, 0)) != PLUS - || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC - || !CONST_INT_P (XEXP (XEXP (disp, 0), 1)) - || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF - && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF)) - /* Non-constant pic memory reference. */ - return false; + if (flag_pic) + output_pic_addr_const (file, disp, 0); + else if (GET_CODE (disp) == LABEL_REF) + output_asm_label (disp); + else + output_addr_const (file, disp); } - else if ((!TARGET_MACHO || flag_pic) - && ! legitimate_pic_address_disp_p (disp)) - /* Displacement is an invalid pic construct. */ - return false; -#if TARGET_MACHO - else if (MACHO_DYNAMIC_NO_PIC_P - && !ix86_legitimate_constant_p (Pmode, disp)) - /* displacment must be referenced via non_lazy_pointer */ - return false; -#endif - /* This code used to verify that a symbolic pic displacement - includes the pic_offset_table_rtx register. + putc ('(', file); + if (base) + print_reg (base, code, file); + if (index) + { + putc (',', file); + print_reg (index, vsib ? 0 : code, file); + if (scale != 1 || vsib) + fprintf (file, ",%d", scale); + } + putc (')', file); + } + else + { + rtx offset = NULL_RTX; - While this is good idea, unfortunately these constructs may - be created by "adds using lea" optimization for incorrect - code like: + if (disp) + { + /* Pull out the offset of a symbol; print any symbol itself. 
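+		 For example, a displacement of the form foo+4 with base
+		 register rbx comes out as foo[rbx+4] in this Intel syntax
+		 path.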
*/ + if (GET_CODE (disp) == CONST + && GET_CODE (XEXP (disp, 0)) == PLUS + && CONST_INT_P (XEXP (XEXP (disp, 0), 1))) + { + offset = XEXP (XEXP (disp, 0), 1); + disp = gen_rtx_CONST (VOIDmode, + XEXP (XEXP (disp, 0), 0)); + } - int a; - int foo(int i) - { - return *(&a+i); - } + if (flag_pic) + output_pic_addr_const (file, disp, 0); + else if (GET_CODE (disp) == LABEL_REF) + output_asm_label (disp); + else if (CONST_INT_P (disp)) + offset = disp; + else + output_addr_const (file, disp); + } - This code is nonsensical, but results in addressing - GOT table with pic_offset_table_rtx base. We can't - just refuse it easily, since it gets matched by - "addsi3" pattern, that later gets split to lea in the - case output register differs from input. While this - can be handled by separate addsi pattern for this case - that never results in lea, this seems to be easier and - correct fix for crash to disable this test. */ + putc ('[', file); + if (base) + { + print_reg (base, code, file); + if (offset) + { + if (INTVAL (offset) >= 0) + putc ('+', file); + fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset)); + } + } + else if (offset) + fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset)); + else + putc ('0', file); + + if (index) + { + putc ('+', file); + print_reg (index, vsib ? 0 : code, file); + if (scale != 1 || vsib) + fprintf (file, "*%d", scale); + } + putc (']', file); } - else if (GET_CODE (disp) != LABEL_REF - && !CONST_INT_P (disp) - && (GET_CODE (disp) != CONST - || !ix86_legitimate_constant_p (Pmode, disp)) - && (GET_CODE (disp) != SYMBOL_REF - || !ix86_legitimate_constant_p (Pmode, disp))) - /* Displacement is not constant. */ - return false; - else if (TARGET_64BIT - && !x86_64_immediate_operand (disp, VOIDmode)) - /* Displacement is out of range. */ - return false; - /* In x32 mode, constant addresses are sign extended to 64bit, so - we have to prevent addresses from 0x80000000 to 0xffffffff. */ - else if (TARGET_X32 && !(index || base) - && CONST_INT_P (disp) - && val_signbit_known_set_p (SImode, INTVAL (disp))) - return false; } - - /* Everything looks valid. */ - return true; } -/* Determine if a given RTX is a valid constant address. */ - -bool -constant_address_p (rtx x) +static void +ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr) { - return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1); + ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false); } - -/* Return a unique alias set for the GOT. */ -static alias_set_type -ix86_GOT_alias_set (void) +/* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */ + +static bool +i386_asm_output_addr_const_extra (FILE *file, rtx x) { - static alias_set_type set = -1; - if (set == -1) - set = new_alias_set (); - return set; -} + rtx op; -/* Return a legitimate reference for ORIG (an address) using the - register REG. If REG is 0, a new pseudo is generated. + if (GET_CODE (x) != UNSPEC) + return false; - There are two types of references that must be handled: + op = XVECEXP (x, 0, 0); + switch (XINT (x, 1)) + { + case UNSPEC_GOTOFF: + output_addr_const (file, op); + fputs ("@gotoff", file); + break; + case UNSPEC_GOTTPOFF: + output_addr_const (file, op); + /* FIXME: This might be @TPOFF in Sun ld. 
*/ + fputs ("@gottpoff", file); + break; + case UNSPEC_TPOFF: + output_addr_const (file, op); + fputs ("@tpoff", file); + break; + case UNSPEC_NTPOFF: + output_addr_const (file, op); + if (TARGET_64BIT) + fputs ("@tpoff", file); + else + fputs ("@ntpoff", file); + break; + case UNSPEC_DTPOFF: + output_addr_const (file, op); + fputs ("@dtpoff", file); + break; + case UNSPEC_GOTNTPOFF: + output_addr_const (file, op); + if (TARGET_64BIT) + fputs (ASSEMBLER_DIALECT == ASM_ATT ? + "@gottpoff(%rip)" : "@gottpoff[rip]", file); + else + fputs ("@gotntpoff", file); + break; + case UNSPEC_INDNTPOFF: + output_addr_const (file, op); + fputs ("@indntpoff", file); + break; +#if TARGET_MACHO + case UNSPEC_MACHOPIC_OFFSET: + output_addr_const (file, op); + putc ('-', file); + machopic_output_function_base_name (file); + break; +#endif - 1. Global data references must load the address from the GOT, via - the PIC reg. An insn is emitted to do this load, and the reg is - returned. + default: + return false; + } - 2. Static data references, constant pool addresses, and code labels - compute the address as an offset from the GOT, whose base is in - the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to - differentiate them from global data objects. The returned - address is the PIC reg + an unspec constant. + return true; +} + + +/* Output code to perform a 387 binary operation in INSN, one of PLUS, + MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3] + is the expression of the binary operation. The output may either be + emitted here, or returned to the caller, like all output_* functions. - TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC - reg also appears in the address. */ + There is no guarantee that the operands are the same mode, as they + might be within FLOAT or FLOAT_EXTEND expressions. */ -static rtx -legitimize_pic_address (rtx orig, rtx reg) +#ifndef SYSV386_COMPAT +/* Set to 1 for compatibility with brain-damaged assemblers. No-one + wants to fix the assemblers because that causes incompatibility + with gcc. No-one wants to fix gcc because that causes + incompatibility with assemblers... You can use the option of + -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */ +#define SYSV386_COMPAT 1 +#endif + +const char * +output_387_binary_op (rtx_insn *insn, rtx *operands) { - rtx addr = orig; - rtx new_rtx = orig; + static char buf[40]; + const char *p; + bool is_sse + = (SSE_REG_P (operands[0]) + || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2])); -#if TARGET_MACHO - if (TARGET_MACHO && !TARGET_64BIT) - { - if (reg == 0) - reg = gen_reg_rtx (Pmode); - /* Use the generic Mach-O PIC machinery. */ - return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg); - } -#endif + if (is_sse) + p = "%v"; + else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT + || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT) + p = "fi"; + else + p = "f"; - if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES) - { - rtx tmp = legitimize_pe_coff_symbol (addr, true); - if (tmp) - return tmp; - } + strcpy (buf, p); - if (TARGET_64BIT && legitimate_pic_address_disp_p (addr)) - new_rtx = addr; - else if ((!TARGET_64BIT - || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC) - && !TARGET_PECOFF - && gotoff_operand (addr, Pmode)) + switch (GET_CODE (operands[3])) { - /* This symbol may be referenced via a displacement - from the PIC base address (@GOTOFF). 
*/ - if (GET_CODE (addr) == CONST) - addr = XEXP (addr, 0); + case PLUS: + p = "add"; break; + case MINUS: + p = "sub"; break; + case MULT: + p = "mul"; break; + case DIV: + p = "div"; break; + default: + gcc_unreachable (); + } - if (GET_CODE (addr) == PLUS) - { - new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)), - UNSPEC_GOTOFF); - new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1)); - } - else - new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF); + strcat (buf, p); - new_rtx = gen_rtx_CONST (Pmode, new_rtx); + if (is_sse) + { + p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd"; + strcat (buf, p); - if (TARGET_64BIT) - new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode); + if (TARGET_AVX) + p = "\t{%2, %1, %0|%0, %1, %2}"; + else + p = "\t{%2, %0|%0, %2}"; - if (reg != 0) - { - gcc_assert (REG_P (reg)); - new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx, - new_rtx, reg, 1, OPTAB_DIRECT); - } + strcat (buf, p); + return buf; + } + + /* Even if we do not want to check the inputs, this documents input + constraints. Which helps in understanding the following code. */ + if (flag_checking) + { + if (STACK_REG_P (operands[0]) + && ((REG_P (operands[1]) + && REGNO (operands[0]) == REGNO (operands[1]) + && (STACK_REG_P (operands[2]) || MEM_P (operands[2]))) + || (REG_P (operands[2]) + && REGNO (operands[0]) == REGNO (operands[2]) + && (STACK_REG_P (operands[1]) || MEM_P (operands[1])))) + && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2]))) + ; /* ok */ else - new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx); + gcc_unreachable (); } - else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0) - /* We can't use @GOTOFF for text labels - on VxWorks, see gotoff_operand. */ - || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF)) + + switch (GET_CODE (operands[3])) { - rtx tmp = legitimize_pe_coff_symbol (addr, true); - if (tmp) - return tmp; + case MULT: + case PLUS: + if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2])) + std::swap (operands[1], operands[2]); - /* For x64 PE-COFF there is no GOT table, - so we use address directly. */ - if (TARGET_64BIT && TARGET_PECOFF) + /* know operands[0] == operands[1]. */ + + if (MEM_P (operands[2])) { - new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL); - new_rtx = gen_rtx_CONST (Pmode, new_rtx); + p = "%Z2\t%2"; + break; } - else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC) + + if (find_regno_note (insn, REG_DEAD, REGNO (operands[2]))) { - new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), - UNSPEC_GOTPCREL); - new_rtx = gen_rtx_CONST (Pmode, new_rtx); - new_rtx = gen_const_mem (Pmode, new_rtx); - set_mem_alias_set (new_rtx, ix86_GOT_alias_set ()); + if (STACK_TOP_P (operands[0])) + /* How is it that we are storing to a dead operand[2]? + Well, presumably operands[1] is dead too. We can't + store the result to st(0) as st(0) gets popped on this + instruction. Instead store to operands[2] (which I + think has to be st(1)). st(1) will be popped later. + gcc <= 2.8.1 didn't have this check and generated + assembly code that the Unixware assembler rejected. 
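+	       The popping form chosen below, e.g. "faddp %st, %st(1)" in
+	       AT&T syntax, therefore stores into st(1) and pops st(0).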
*/ + p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */ + else + p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */ + break; } + + if (STACK_TOP_P (operands[0])) + p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */ else + p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */ + break; + + case MINUS: + case DIV: + if (MEM_P (operands[1])) { - /* This symbol must be referenced via a load - from the Global Offset Table (@GOT). */ - new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT); - new_rtx = gen_rtx_CONST (Pmode, new_rtx); - if (TARGET_64BIT) - new_rtx = force_reg (Pmode, new_rtx); - new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx); - new_rtx = gen_const_mem (Pmode, new_rtx); - set_mem_alias_set (new_rtx, ix86_GOT_alias_set ()); + p = "r%Z1\t%1"; + break; } - new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode); - } - else - { - if (CONST_INT_P (addr) - && !x86_64_immediate_operand (addr, VOIDmode)) - new_rtx = copy_to_suggested_reg (addr, reg, Pmode); - else if (GET_CODE (addr) == CONST) + if (MEM_P (operands[2])) { - addr = XEXP (addr, 0); - - /* We must match stuff we generate before. Assume the only - unspecs that can get here are ours. Not that we could do - anything with them anyway.... */ - if (GET_CODE (addr) == UNSPEC - || (GET_CODE (addr) == PLUS - && GET_CODE (XEXP (addr, 0)) == UNSPEC)) - return orig; - gcc_assert (GET_CODE (addr) == PLUS); + p = "%Z2\t%2"; + break; } - if (GET_CODE (addr) == PLUS) + if (find_regno_note (insn, REG_DEAD, REGNO (operands[2]))) { - rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1); - - /* Check first to see if this is a constant - offset from a @GOTOFF symbol reference. */ - if (!TARGET_PECOFF - && gotoff_operand (op0, Pmode) - && CONST_INT_P (op1)) - { - if (!TARGET_64BIT) - { - new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0), - UNSPEC_GOTOFF); - new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1); - new_rtx = gen_rtx_CONST (Pmode, new_rtx); +#if SYSV386_COMPAT + /* The SystemV/386 SVR3.2 assembler, and probably all AT&T + derived assemblers, confusingly reverse the direction of + the operation for fsub{r} and fdiv{r} when the + destination register is not st(0). The Intel assembler + doesn't have this brain damage. Read !SYSV386_COMPAT to + figure out what the hardware really does. */ + if (STACK_TOP_P (operands[0])) + p = "{p\t%0, %2|rp\t%2, %0}"; + else + p = "{rp\t%2, %0|p\t%0, %2}"; +#else + if (STACK_TOP_P (operands[0])) + /* As above for fmul/fadd, we can't store to st(0). 
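+	       The popping reverse form (e.g. fsubrp or fdivrp) is emitted
+	       instead: st(1) = st(0) op st(1), and st(0) is popped.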
*/ + p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */ + else + p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */ +#endif + break; + } - if (reg != 0) - { - gcc_assert (REG_P (reg)); - new_rtx = expand_simple_binop (Pmode, PLUS, - pic_offset_table_rtx, - new_rtx, reg, 1, - OPTAB_DIRECT); - } - else - new_rtx - = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx); - } - else - { - if (INTVAL (op1) < -16*1024*1024 - || INTVAL (op1) >= 16*1024*1024) - { - if (!x86_64_immediate_operand (op1, Pmode)) - op1 = force_reg (Pmode, op1); + if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) + { +#if SYSV386_COMPAT + if (STACK_TOP_P (operands[0])) + p = "{rp\t%0, %1|p\t%1, %0}"; + else + p = "{p\t%1, %0|rp\t%0, %1}"; +#else + if (STACK_TOP_P (operands[0])) + p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */ + else + p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */ +#endif + break; + } - new_rtx - = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1); - } - } - } + if (STACK_TOP_P (operands[0])) + { + if (STACK_TOP_P (operands[1])) + p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */ else - { - rtx base = legitimize_pic_address (op0, reg); - machine_mode mode = GET_MODE (base); - new_rtx - = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg); + p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */ + break; + } + else if (STACK_TOP_P (operands[1])) + { +#if SYSV386_COMPAT + p = "{\t%1, %0|r\t%0, %1}"; +#else + p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */ +#endif + } + else + { +#if SYSV386_COMPAT + p = "{r\t%2, %0|\t%0, %2}"; +#else + p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */ +#endif + } + break; - if (CONST_INT_P (new_rtx)) - { - if (INTVAL (new_rtx) < -16*1024*1024 - || INTVAL (new_rtx) >= 16*1024*1024) - { - if (!x86_64_immediate_operand (new_rtx, mode)) - new_rtx = force_reg (mode, new_rtx); - - new_rtx - = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx); - } - else - new_rtx = plus_constant (mode, base, INTVAL (new_rtx)); - } - else - { - /* For %rip addressing, we have to use - just disp32, not base nor index. */ - if (TARGET_64BIT - && (GET_CODE (base) == SYMBOL_REF - || GET_CODE (base) == LABEL_REF)) - base = force_reg (mode, base); - if (GET_CODE (new_rtx) == PLUS - && CONSTANT_P (XEXP (new_rtx, 1))) - { - base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0)); - new_rtx = XEXP (new_rtx, 1); - } - new_rtx = gen_rtx_PLUS (mode, base, new_rtx); - } - } - } - } - return new_rtx; -} - -/* Load the thread pointer. If TO_REG is true, force it into a register. */ - -static rtx -get_thread_pointer (machine_mode tp_mode, bool to_reg) -{ - rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP); - - if (GET_MODE (tp) != tp_mode) - { - gcc_assert (GET_MODE (tp) == SImode); - gcc_assert (tp_mode == DImode); - - tp = gen_rtx_ZERO_EXTEND (tp_mode, tp); + default: + gcc_unreachable (); } - if (to_reg) - tp = copy_to_mode_reg (tp_mode, tp); - - return tp; + strcat (buf, p); + return buf; } -/* Construct the SYMBOL_REF for the tls_get_addr function. */ - -static GTY(()) rtx ix86_tls_symbol; +/* Return needed mode for entity in optimize_mode_switching pass. */ -static rtx -ix86_tls_get_addr (void) +static int +ix86_dirflag_mode_needed (rtx_insn *insn) { - if (!ix86_tls_symbol) + if (CALL_P (insn)) { - const char *sym - = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT) - ? 
"___tls_get_addr" : "__tls_get_addr"); - - ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym); + if (cfun->machine->func_type == TYPE_NORMAL) + return X86_DIRFLAG_ANY; + else + /* No need to emit CLD in interrupt handler for TARGET_CLD. */ + return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET; } - if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF) + if (recog_memoized (insn) < 0) + return X86_DIRFLAG_ANY; + + if (get_attr_type (insn) == TYPE_STR) { - rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol), - UNSPEC_PLTOFF); - return gen_rtx_PLUS (Pmode, pic_offset_table_rtx, - gen_rtx_CONST (Pmode, unspec)); + /* Emit cld instruction if stringops are used in the function. */ + if (cfun->machine->func_type == TYPE_NORMAL) + return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY; + else + return X86_DIRFLAG_RESET; } - return ix86_tls_symbol; + return X86_DIRFLAG_ANY; } -/* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */ - -static GTY(()) rtx ix86_tls_module_base_symbol; +/* Check if a 256bit or 512 bit AVX register is referenced inside of EXP. */ -rtx -ix86_tls_module_base (void) +static bool +ix86_check_avx_upper_register (const_rtx exp) { - if (!ix86_tls_module_base_symbol) - { - ix86_tls_module_base_symbol - = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_"); - - SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol) - |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT; - } - - return ix86_tls_module_base_symbol; + return SSE_REG_P (exp) && GET_MODE_BITSIZE (GET_MODE (exp)) > 128; } -/* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is - false if we expect this to be used for a memory address and true if - we expect to load the address into a register. */ +/* Return needed mode for entity in optimize_mode_switching pass. */ -static rtx -legitimize_tls_address (rtx x, enum tls_model model, bool for_mov) +static int +ix86_avx_u128_mode_needed (rtx_insn *insn) { - rtx dest, base, off; - rtx pic = NULL_RTX, tp = NULL_RTX; - machine_mode tp_mode = Pmode; - int type; - - /* Fall back to global dynamic model if tool chain cannot support local - dynamic. */ - if (TARGET_SUN_TLS && !TARGET_64BIT - && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM - && model == TLS_MODEL_LOCAL_DYNAMIC) - model = TLS_MODEL_GLOBAL_DYNAMIC; - - switch (model) + if (CALL_P (insn)) { - case TLS_MODEL_GLOBAL_DYNAMIC: - dest = gen_reg_rtx (Pmode); + rtx link; - if (!TARGET_64BIT) + /* Needed mode is set to AVX_U128_CLEAN if there are + no 256bit or 512bit modes used in function arguments. */ + for (link = CALL_INSN_FUNCTION_USAGE (insn); + link; + link = XEXP (link, 1)) { - if (flag_pic && !TARGET_PECOFF) - pic = pic_offset_table_rtx; - else + if (GET_CODE (XEXP (link, 0)) == USE) { - pic = gen_reg_rtx (Pmode); - emit_insn (gen_set_got (pic)); + rtx arg = XEXP (XEXP (link, 0), 0); + + if (ix86_check_avx_upper_register (arg)) + return AVX_U128_DIRTY; } } - if (TARGET_GNU2_TLS) - { - if (TARGET_64BIT) - emit_insn (gen_tls_dynamic_gnu2_64 (dest, x)); - else - emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic)); + return AVX_U128_CLEAN; + } - tp = get_thread_pointer (Pmode, true); - dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest)); + /* Require DIRTY mode if a 256bit or 512bit AVX register is referenced. + Hardware changes state only when a 256bit register is written to, + but we need to prevent the compiler from moving optimal insertion + point above eventual read from 256bit or 512 bit register. 
*/ + subrtx_iterator::array_type array; + FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST) + if (ix86_check_avx_upper_register (*iter)) + return AVX_U128_DIRTY; - if (GET_MODE (x) != Pmode) - x = gen_rtx_ZERO_EXTEND (Pmode, x); + return AVX_U128_ANY; +} - set_unique_reg_note (get_last_insn (), REG_EQUAL, x); - } - else - { - rtx caddr = ix86_tls_get_addr (); +/* Return mode that i387 must be switched into + prior to the execution of insn. */ - if (TARGET_64BIT) - { - rtx rax = gen_rtx_REG (Pmode, AX_REG); - rtx_insn *insns; +static int +ix86_i387_mode_needed (int entity, rtx_insn *insn) +{ + enum attr_i387_cw mode; - start_sequence (); - emit_call_insn - (ix86_gen_tls_global_dynamic_64 (rax, x, caddr)); - insns = get_insns (); - end_sequence (); + /* The mode UNINITIALIZED is used to store control word after a + function call or ASM pattern. The mode ANY specify that function + has no requirements on the control word and make no changes in the + bits we are interested in. */ - if (GET_MODE (x) != Pmode) - x = gen_rtx_ZERO_EXTEND (Pmode, x); + if (CALL_P (insn) + || (NONJUMP_INSN_P (insn) + && (asm_noperands (PATTERN (insn)) >= 0 + || GET_CODE (PATTERN (insn)) == ASM_INPUT))) + return I387_CW_UNINITIALIZED; - RTL_CONST_CALL_P (insns) = 1; - emit_libcall_block (insns, dest, rax, x); - } - else - emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr)); - } + if (recog_memoized (insn) < 0) + return I387_CW_ANY; + + mode = get_attr_i387_cw (insn); + + switch (entity) + { + case I387_TRUNC: + if (mode == I387_CW_TRUNC) + return mode; break; - case TLS_MODEL_LOCAL_DYNAMIC: - base = gen_reg_rtx (Pmode); + case I387_FLOOR: + if (mode == I387_CW_FLOOR) + return mode; + break; - if (!TARGET_64BIT) - { - if (flag_pic) - pic = pic_offset_table_rtx; - else - { - pic = gen_reg_rtx (Pmode); - emit_insn (gen_set_got (pic)); - } - } + case I387_CEIL: + if (mode == I387_CW_CEIL) + return mode; + break; - if (TARGET_GNU2_TLS) - { - rtx tmp = ix86_tls_module_base (); + default: + gcc_unreachable (); + } - if (TARGET_64BIT) - emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp)); - else - emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic)); + return I387_CW_ANY; +} - tp = get_thread_pointer (Pmode, true); - set_unique_reg_note (get_last_insn (), REG_EQUAL, - gen_rtx_MINUS (Pmode, tmp, tp)); - } - else - { - rtx caddr = ix86_tls_get_addr (); - - if (TARGET_64BIT) - { - rtx rax = gen_rtx_REG (Pmode, AX_REG); - rtx_insn *insns; - rtx eqv; - - start_sequence (); - emit_call_insn - (ix86_gen_tls_local_dynamic_base_64 (rax, caddr)); - insns = get_insns (); - end_sequence (); - - /* Attach a unique REG_EQUAL, to allow the RTL optimizers to - share the LD_BASE result with other LD model accesses. */ - eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), - UNSPEC_TLS_LD_BASE); - - RTL_CONST_CALL_P (insns) = 1; - emit_libcall_block (insns, base, rax, eqv); - } - else - emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr)); - } - - off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF); - off = gen_rtx_CONST (Pmode, off); - - dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off)); - - if (TARGET_GNU2_TLS) - { - dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp)); +/* Return mode that entity must be switched into + prior to the execution of insn. 
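+   ENTITY is one of X86_DIRFLAG, AVX_U128 and the I387_TRUNC, I387_FLOOR
+   and I387_CEIL control word entities, dispatched to the helpers above.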
*/ - if (GET_MODE (x) != Pmode) - x = gen_rtx_ZERO_EXTEND (Pmode, x); +static int +ix86_mode_needed (int entity, rtx_insn *insn) +{ + switch (entity) + { + case X86_DIRFLAG: + return ix86_dirflag_mode_needed (insn); + case AVX_U128: + return ix86_avx_u128_mode_needed (insn); + case I387_TRUNC: + case I387_FLOOR: + case I387_CEIL: + return ix86_i387_mode_needed (entity, insn); + default: + gcc_unreachable (); + } + return 0; +} - set_unique_reg_note (get_last_insn (), REG_EQUAL, x); - } - break; +/* Check if a 256bit or 512bit AVX register is referenced in stores. */ + +static void +ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data) + { + if (ix86_check_avx_upper_register (dest)) + { + bool *used = (bool *) data; + *used = true; + } + } - case TLS_MODEL_INITIAL_EXEC: - if (TARGET_64BIT) - { - if (TARGET_SUN_TLS && !TARGET_X32) - { - /* The Sun linker took the AMD64 TLS spec literally - and can only handle %rax as destination of the - initial executable code sequence. */ +/* Calculate mode of upper 128bit AVX registers after the insn. */ - dest = gen_reg_rtx (DImode); - emit_insn (gen_tls_initial_exec_64_sun (dest, x)); - return dest; - } +static int +ix86_avx_u128_mode_after (int mode, rtx_insn *insn) +{ + rtx pat = PATTERN (insn); - /* Generate DImode references to avoid %fs:(%reg32) - problems and linker IE->LE relaxation bug. */ - tp_mode = DImode; - pic = NULL; - type = UNSPEC_GOTNTPOFF; - } - else if (flag_pic) - { - pic = pic_offset_table_rtx; - type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF; - } - else if (!TARGET_ANY_GNU_TLS) - { - pic = gen_reg_rtx (Pmode); - emit_insn (gen_set_got (pic)); - type = UNSPEC_GOTTPOFF; - } - else - { - pic = NULL; - type = UNSPEC_INDNTPOFF; - } + if (vzeroupper_pattern (pat, VOIDmode) + || vzeroall_pattern (pat, VOIDmode)) + return AVX_U128_CLEAN; - off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type); - off = gen_rtx_CONST (tp_mode, off); - if (pic) - off = gen_rtx_PLUS (tp_mode, pic, off); - off = gen_const_mem (tp_mode, off); - set_mem_alias_set (off, ix86_GOT_alias_set ()); + /* We know that state is clean after CALL insn if there are no + 256bit or 512bit registers used in the function return register. */ + if (CALL_P (insn)) + { + bool avx_upper_reg_found = false; + note_stores (pat, ix86_check_avx_upper_stores, &avx_upper_reg_found); - if (TARGET_64BIT || TARGET_ANY_GNU_TLS) - { - base = get_thread_pointer (tp_mode, - for_mov || !TARGET_TLS_DIRECT_SEG_REFS); - off = force_reg (tp_mode, off); - dest = gen_rtx_PLUS (tp_mode, base, off); - if (tp_mode != Pmode) - dest = convert_to_mode (Pmode, dest, 1); - } - else - { - base = get_thread_pointer (Pmode, true); - dest = gen_reg_rtx (Pmode); - emit_insn (ix86_gen_sub3 (dest, base, off)); - } - break; + return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN; + } - case TLS_MODEL_LOCAL_EXEC: - off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), - (TARGET_64BIT || TARGET_ANY_GNU_TLS) - ? UNSPEC_NTPOFF : UNSPEC_TPOFF); - off = gen_rtx_CONST (Pmode, off); + /* Otherwise, return current mode. Remember that if insn + references AVX 256bit or 512bit registers, the mode was already + changed to DIRTY from MODE_NEEDED. 
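+     Only a vzeroupper/vzeroall pattern, or a call that leaves no 256bit
+     or 512bit register in its return value, switches the state back to
+     AVX_U128_CLEAN here.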
*/ + return mode; +} - if (TARGET_64BIT || TARGET_ANY_GNU_TLS) - { - base = get_thread_pointer (Pmode, - for_mov || !TARGET_TLS_DIRECT_SEG_REFS); - return gen_rtx_PLUS (Pmode, base, off); - } - else - { - base = get_thread_pointer (Pmode, true); - dest = gen_reg_rtx (Pmode); - emit_insn (ix86_gen_sub3 (dest, base, off)); - } - break; +/* Return the mode that an insn results in. */ +static int +ix86_mode_after (int entity, int mode, rtx_insn *insn) +{ + switch (entity) + { + case X86_DIRFLAG: + return mode; + case AVX_U128: + return ix86_avx_u128_mode_after (mode, insn); + case I387_TRUNC: + case I387_FLOOR: + case I387_CEIL: + return mode; default: gcc_unreachable (); } +} - return dest; +static int +ix86_dirflag_mode_entry (void) +{ + /* For TARGET_CLD or in the interrupt handler we can't assume + direction flag state at function entry. */ + if (TARGET_CLD + || cfun->machine->func_type != TYPE_NORMAL) + return X86_DIRFLAG_ANY; + + return X86_DIRFLAG_RESET; } -/* Return true if OP refers to a TLS address. */ -bool -ix86_tls_address_pattern_p (rtx op) +static int +ix86_avx_u128_mode_entry (void) { - subrtx_var_iterator::array_type array; - FOR_EACH_SUBRTX_VAR (iter, array, op, ALL) + tree arg; + + /* Entry mode is set to AVX_U128_DIRTY if there are + 256bit or 512bit modes used in function arguments. */ + for (arg = DECL_ARGUMENTS (current_function_decl); arg; + arg = TREE_CHAIN (arg)) { - rtx op = *iter; - if (MEM_P (op)) - { - rtx *x = &XEXP (op, 0); - while (GET_CODE (*x) == PLUS) - { - int i; - for (i = 0; i < 2; i++) - { - rtx u = XEXP (*x, i); - if (GET_CODE (u) == ZERO_EXTEND) - u = XEXP (u, 0); - if (GET_CODE (u) == UNSPEC - && XINT (u, 1) == UNSPEC_TP) - return true; - } - x = &XEXP (*x, 0); - } + rtx incoming = DECL_INCOMING_RTL (arg); - iter.skip_subrtxes (); - } + if (incoming && ix86_check_avx_upper_register (incoming)) + return AVX_U128_DIRTY; } - return false; + return AVX_U128_CLEAN; } -/* Rewrite *LOC so that it refers to a default TLS address space. */ -void -ix86_rewrite_tls_address_1 (rtx *loc) +/* Return a mode that ENTITY is assumed to be + switched to at function entry. */ + +static int +ix86_mode_entry (int entity) { - subrtx_ptr_iterator::array_type array; - FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL) + switch (entity) { - rtx *loc = *iter; - if (MEM_P (*loc)) - { - rtx addr = XEXP (*loc, 0); - rtx *x = &addr; - while (GET_CODE (*x) == PLUS) - { - int i; - for (i = 0; i < 2; i++) - { - rtx u = XEXP (*x, i); - if (GET_CODE (u) == ZERO_EXTEND) - u = XEXP (u, 0); - if (GET_CODE (u) == UNSPEC - && XINT (u, 1) == UNSPEC_TP) - { - addr_space_t as = DEFAULT_TLS_SEG_REG; + case X86_DIRFLAG: + return ix86_dirflag_mode_entry (); + case AVX_U128: + return ix86_avx_u128_mode_entry (); + case I387_TRUNC: + case I387_FLOOR: + case I387_CEIL: + return I387_CW_ANY; + default: + gcc_unreachable (); + } +} - *x = XEXP (*x, 1 - i); +static int +ix86_avx_u128_mode_exit (void) +{ + rtx reg = crtl->return_rtx; - *loc = replace_equiv_address_nv (*loc, addr, true); - set_mem_addr_space (*loc, as); - return; - } - } - x = &XEXP (*x, 0); - } + /* Exit mode is set to AVX_U128_DIRTY if there are 256bit + or 512 bit modes used in the function return register. */ + if (reg && ix86_check_avx_upper_register (reg)) + return AVX_U128_DIRTY; - iter.skip_subrtxes (); - } + /* Exit mode is set to AVX_U128_DIRTY if there are 256bit or 512bit + modes used in function arguments, otherwise return AVX_U128_CLEAN. 
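+     This is the same condition that is checked at function entry, hence
+     the reuse of ix86_avx_u128_mode_entry below.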
+ */ + return ix86_avx_u128_mode_entry (); +} + +/* Return a mode that ENTITY is assumed to be + switched to at function exit. */ + +static int +ix86_mode_exit (int entity) +{ + switch (entity) + { + case X86_DIRFLAG: + return X86_DIRFLAG_ANY; + case AVX_U128: + return ix86_avx_u128_mode_exit (); + case I387_TRUNC: + case I387_FLOOR: + case I387_CEIL: + return I387_CW_ANY; + default: + gcc_unreachable (); } } -/* Rewrite instruction pattern involvning TLS address - so that it refers to a default TLS address space. */ -rtx -ix86_rewrite_tls_address (rtx pattern) +static int +ix86_mode_priority (int, int n) { - pattern = copy_insn (pattern); - ix86_rewrite_tls_address_1 (&pattern); - return pattern; + return n; } -/* Create or return the unique __imp_DECL dllimport symbol corresponding - to symbol DECL if BEIMPORT is true. Otherwise create or return the - unique refptr-DECL symbol corresponding to symbol DECL. */ +/* Output code to initialize control word copies used by trunc?f?i and + rounding patterns. CURRENT_MODE is set to current control word, + while NEW_MODE is set to new control word. */ -struct dllimport_hasher : ggc_cache_ptr_hash +static void +emit_i387_cw_initialization (int mode) { - static inline hashval_t hash (tree_map *m) { return m->hash; } - static inline bool - equal (tree_map *a, tree_map *b) - { - return a->base.from == b->base.from; - } + rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED); + rtx new_mode; - static int - keep_cache_entry (tree_map *&m) - { - return ggc_marked_p (m->base.from); - } -}; + enum ix86_stack_slot slot; -static GTY((cache)) hash_table *dllimport_map; + rtx reg = gen_reg_rtx (HImode); -static tree -get_dllimport_decl (tree decl, bool beimport) -{ - struct tree_map *h, in; - const char *name; - const char *prefix; - size_t namelen, prefixlen; - char *imp_name; - tree to; - rtx rtl; + emit_insn (gen_x86_fnstcw_1 (stored_mode)); + emit_move_insn (reg, copy_rtx (stored_mode)); - if (!dllimport_map) - dllimport_map = hash_table::create_ggc (512); + switch (mode) + { + case I387_CW_TRUNC: + /* round toward zero (truncate) */ + emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00))); + slot = SLOT_CW_TRUNC; + break; - in.hash = htab_hash_pointer (decl); - in.base.from = decl; - tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT); - h = *loc; - if (h) - return h->to; + case I387_CW_FLOOR: + /* round down toward -oo */ + emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00))); + emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400))); + slot = SLOT_CW_FLOOR; + break; - *loc = h = ggc_alloc (); - h->hash = in.hash; - h->base.from = decl; - h->to = to = build_decl (DECL_SOURCE_LOCATION (decl), - VAR_DECL, NULL, ptr_type_node); - DECL_ARTIFICIAL (to) = 1; - DECL_IGNORED_P (to) = 1; - DECL_EXTERNAL (to) = 1; - TREE_READONLY (to) = 1; + case I387_CW_CEIL: + /* round up toward +oo */ + emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00))); + emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800))); + slot = SLOT_CW_CEIL; + break; - name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)); - name = targetm.strip_name_encoding (name); - if (beimport) - prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0 - ? "*__imp_" : "*__imp__"; - else - prefix = user_label_prefix[0] == 0 ? "*.refptr." 
: "*refptr."; - namelen = strlen (name); - prefixlen = strlen (prefix); - imp_name = (char *) alloca (namelen + prefixlen + 1); - memcpy (imp_name, prefix, prefixlen); - memcpy (imp_name + prefixlen, name, namelen + 1); + default: + gcc_unreachable (); + } - name = ggc_alloc_string (imp_name, namelen + prefixlen); - rtl = gen_rtx_SYMBOL_REF (Pmode, name); - SET_SYMBOL_REF_DECL (rtl, to); - SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR; - if (!beimport) - { - SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL; -#ifdef SUB_TARGET_RECORD_STUB - SUB_TARGET_RECORD_STUB (name); -#endif - } + gcc_assert (slot < MAX_386_STACK_LOCALS); - rtl = gen_const_mem (Pmode, rtl); - set_mem_alias_set (rtl, ix86_GOT_alias_set ()); + new_mode = assign_386_stack_local (HImode, slot); + emit_move_insn (new_mode, reg); +} - SET_DECL_RTL (to, rtl); - SET_DECL_ASSEMBLER_NAME (to, get_identifier (name)); +/* Generate one or more insns to set ENTITY to MODE. */ - return to; +static void +ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED, + HARD_REG_SET regs_live ATTRIBUTE_UNUSED) +{ + switch (entity) + { + case X86_DIRFLAG: + if (mode == X86_DIRFLAG_RESET) + emit_insn (gen_cld ()); + break; + case AVX_U128: + if (mode == AVX_U128_CLEAN) + emit_insn (gen_avx_vzeroupper ()); + break; + case I387_TRUNC: + case I387_FLOOR: + case I387_CEIL: + if (mode != I387_CW_ANY + && mode != I387_CW_UNINITIALIZED) + emit_i387_cw_initialization (mode); + break; + default: + gcc_unreachable (); + } } -/* Expand SYMBOL into its corresponding far-address symbol. - WANT_REG is true if we require the result be a register. */ +/* Output code for INSN to convert a float to a signed int. OPERANDS + are the insn operands. The output may be [HSD]Imode and the input + operand may be [SDX]Fmode. */ -static rtx -legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg) +const char * +output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp) { - tree imp_decl; - rtx x; + bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG); + bool dimode_p = GET_MODE (operands[0]) == DImode; + int round_mode = get_attr_i387_cw (insn); - gcc_assert (SYMBOL_REF_DECL (symbol)); - imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false); + static char buf[40]; + const char *p; - x = DECL_RTL (imp_decl); - if (want_reg) - x = force_reg (Pmode, x); - return x; -} + /* Jump through a hoop or two for DImode, since the hardware has no + non-popping instruction. We used to do this a different way, but + that was somewhat fragile and broke with post-reload splitters. */ + if ((dimode_p || fisttp) && !stack_top_dies) + output_asm_insn ("fld\t%y1", operands); -/* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is - true if we require the result be a register. */ + gcc_assert (STACK_TOP_P (operands[1])); + gcc_assert (MEM_P (operands[0])); + gcc_assert (GET_MODE (operands[1]) != TFmode); -static rtx -legitimize_dllimport_symbol (rtx symbol, bool want_reg) -{ - tree imp_decl; - rtx x; + if (fisttp) + return "fisttp%Z0\t%0"; - gcc_assert (SYMBOL_REF_DECL (symbol)); - imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true); + strcpy (buf, "fist"); - x = DECL_RTL (imp_decl); - if (want_reg) - x = force_reg (Pmode, x); - return x; -} + if (round_mode != I387_CW_ANY) + output_asm_insn ("fldcw\t%3", operands); -/* Expand SYMBOL into its corresponding dllimport or refptr symbol. WANT_REG - is true if we require the result be a register. 
*/ + p = "p%Z0\t%0"; + strcat (buf, p + !(stack_top_dies || dimode_p)); -static rtx -legitimize_pe_coff_symbol (rtx addr, bool inreg) -{ - if (!TARGET_PECOFF) - return NULL_RTX; + output_asm_insn (buf, operands); - if (TARGET_DLLIMPORT_DECL_ATTRIBUTES) - { - if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr)) - return legitimize_dllimport_symbol (addr, inreg); - if (GET_CODE (addr) == CONST - && GET_CODE (XEXP (addr, 0)) == PLUS - && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF - && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0))) - { - rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg); - return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1)); - } - } + if (round_mode != I387_CW_ANY) + output_asm_insn ("fldcw\t%2", operands); - if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC) - return NULL_RTX; - if (GET_CODE (addr) == SYMBOL_REF - && !is_imported_p (addr) - && SYMBOL_REF_EXTERNAL_P (addr) - && SYMBOL_REF_DECL (addr)) - return legitimize_pe_coff_extern_decl (addr, inreg); + return ""; +} - if (GET_CODE (addr) == CONST - && GET_CODE (XEXP (addr, 0)) == PLUS - && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF - && !is_imported_p (XEXP (XEXP (addr, 0), 0)) - && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0)) - && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0))) +/* Output code for x87 ffreep insn. The OPNO argument, which may only + have the values zero or one, indicates the ffreep insn's operand + from the OPERANDS array. */ + +static const char * +output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno) +{ + if (TARGET_USE_FFREEP) +#ifdef HAVE_AS_IX86_FFREEP + return opno ? "ffreep\t%y1" : "ffreep\t%y0"; +#else { - rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg); - return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1)); + static char retval[32]; + int regno = REGNO (operands[opno]); + + gcc_assert (STACK_REGNO_P (regno)); + + regno -= FIRST_STACK_REG; + + snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno); + return retval; } - return NULL_RTX; +#endif + + return opno ? "fstp\t%y1" : "fstp\t%y0"; } -/* Try machine-dependent ways of modifying an illegitimate address - to be legitimate. If we find one, return the new, valid address. - This macro is used in only one place: `memory_address' in explow.c. - OLDX is the address as it was before break_out_memory_refs was called. - In some cases it is useful to look at this to decide what needs to be done. +/* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi + should be used. UNORDERED_P is true when fucom should be used. */ - It is always safe for this macro to do nothing. It exists to recognize - opportunities to optimize the output. +const char * +output_fp_compare (rtx_insn *insn, rtx *operands, + bool eflags_p, bool unordered_p) +{ + rtx *xops = eflags_p ? &operands[0] : &operands[1]; + bool stack_top_dies; - For the 80386, we handle X+REG by loading X into a register R and - using R+REG. R will go in a general reg and indexing will be used. - However, if REG is a broken-out memory address or multiplication, - nothing needs to be done because REG can certainly go in a general reg. + static char buf[40]; + const char *p; - When -fpic is used, special handling is needed for symbolic references. - See comments by legitimize_pic_address in i386.c for details. 
*/ + gcc_assert (STACK_TOP_P (xops[0])); -static rtx -ix86_legitimize_address (rtx x, rtx, machine_mode mode) -{ - bool changed = false; - unsigned log; + stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG); - log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0; - if (log) - return legitimize_tls_address (x, (enum tls_model) log, false); - if (GET_CODE (x) == CONST - && GET_CODE (XEXP (x, 0)) == PLUS - && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF - && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0)))) + if (eflags_p) { - rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0), - (enum tls_model) log, false); - return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1)); - } + p = unordered_p ? "fucomi" : "fcomi"; + strcpy (buf, p); - if (TARGET_DLLIMPORT_DECL_ATTRIBUTES) - { - rtx tmp = legitimize_pe_coff_symbol (x, true); - if (tmp) - return tmp; - } + p = "p\t{%y1, %0|%0, %y1}"; + strcat (buf, p + !stack_top_dies); - if (flag_pic && SYMBOLIC_CONST (x)) - return legitimize_pic_address (x, 0); + return buf; + } -#if TARGET_MACHO - if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x)) - return machopic_indirect_data_reference (x, 0); -#endif + if (STACK_REG_P (xops[1]) + && stack_top_dies + && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1)) + { + gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1); - /* Canonicalize shifts by 0, 1, 2, 3 into multiply */ - if (GET_CODE (x) == ASHIFT - && CONST_INT_P (XEXP (x, 1)) - && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4) + /* If both the top of the 387 stack die, and the other operand + is also a stack register that dies, then this must be a + `fcompp' float compare. */ + p = unordered_p ? "fucompp" : "fcompp"; + strcpy (buf, p); + } + else if (const0_operand (xops[1], VOIDmode)) { - changed = true; - log = INTVAL (XEXP (x, 1)); - x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)), - GEN_INT (1 << log)); + gcc_assert (!unordered_p); + strcpy (buf, "ftst"); } - - if (GET_CODE (x) == PLUS) + else { - /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */ - - if (GET_CODE (XEXP (x, 0)) == ASHIFT - && CONST_INT_P (XEXP (XEXP (x, 0), 1)) - && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4) - { - changed = true; - log = INTVAL (XEXP (XEXP (x, 0), 1)); - XEXP (x, 0) = gen_rtx_MULT (Pmode, - force_reg (Pmode, XEXP (XEXP (x, 0), 0)), - GEN_INT (1 << log)); - } - - if (GET_CODE (XEXP (x, 1)) == ASHIFT - && CONST_INT_P (XEXP (XEXP (x, 1), 1)) - && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4) + if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT) { - changed = true; - log = INTVAL (XEXP (XEXP (x, 1), 1)); - XEXP (x, 1) = gen_rtx_MULT (Pmode, - force_reg (Pmode, XEXP (XEXP (x, 1), 0)), - GEN_INT (1 << log)); + gcc_assert (!unordered_p); + p = "ficom"; } + else + p = unordered_p ? "fucom" : "fcom"; - /* Put multiply first if it isn't already. */ - if (GET_CODE (XEXP (x, 1)) == MULT) - { - std::swap (XEXP (x, 0), XEXP (x, 1)); - changed = true; - } + strcpy (buf, p); - /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const))) - into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be - created by virtual register instantiation, register elimination, and - similar optimizations. 
*/ - if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS) - { - changed = true; - x = gen_rtx_PLUS (Pmode, - gen_rtx_PLUS (Pmode, XEXP (x, 0), - XEXP (XEXP (x, 1), 0)), - XEXP (XEXP (x, 1), 1)); - } + p = "p%Z2\t%y2"; + strcat (buf, p + !stack_top_dies); + } - /* Canonicalize - (plus (plus (mult (reg) (const)) (plus (reg) (const))) const) - into (plus (plus (mult (reg) (const)) (reg)) (const)). */ - else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS - && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT - && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS - && CONSTANT_P (XEXP (x, 1))) - { - rtx constant; - rtx other = NULL_RTX; + output_asm_insn (buf, operands); + return "fnstsw\t%0"; +} - if (CONST_INT_P (XEXP (x, 1))) - { - constant = XEXP (x, 1); - other = XEXP (XEXP (XEXP (x, 0), 1), 1); - } - else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1))) - { - constant = XEXP (XEXP (XEXP (x, 0), 1), 1); - other = XEXP (x, 1); - } - else - constant = 0; +void +ix86_output_addr_vec_elt (FILE *file, int value) +{ + const char *directive = ASM_LONG; - if (constant) - { - changed = true; - x = gen_rtx_PLUS (Pmode, - gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0), - XEXP (XEXP (XEXP (x, 0), 1), 0)), - plus_constant (Pmode, other, - INTVAL (constant))); - } - } +#ifdef ASM_QUAD + if (TARGET_LP64) + directive = ASM_QUAD; +#else + gcc_assert (!TARGET_64BIT); +#endif - if (changed && ix86_legitimate_address_p (mode, x, false)) - return x; + fprintf (file, "%s%s%d\n", directive, LPREFIX, value); +} - if (GET_CODE (XEXP (x, 0)) == MULT) - { - changed = true; - XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0)); - } +void +ix86_output_addr_diff_elt (FILE *file, int value, int rel) +{ + const char *directive = ASM_LONG; - if (GET_CODE (XEXP (x, 1)) == MULT) - { - changed = true; - XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1)); - } +#ifdef ASM_QUAD + if (TARGET_64BIT && CASE_VECTOR_MODE == DImode) + directive = ASM_QUAD; +#else + gcc_assert (!TARGET_64BIT); +#endif + /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */ + if (TARGET_64BIT || TARGET_VXWORKS_RTP) + fprintf (file, "%s%s%d-%s%d\n", + directive, LPREFIX, value, LPREFIX, rel); +#if TARGET_MACHO + else if (TARGET_MACHO) + { + fprintf (file, ASM_LONG "%s%d-", LPREFIX, value); + machopic_output_function_base_name (file); + putc ('\n', file); + } +#endif + else if (HAVE_AS_GOTOFF_IN_DATA) + fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value); + else + asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n", + GOT_SYMBOL_NAME, LPREFIX, value); +} + +#define LEA_MAX_STALL (3) +#define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1) - if (changed - && REG_P (XEXP (x, 1)) - && REG_P (XEXP (x, 0))) - return x; +/* Increase given DISTANCE in half-cycles according to + dependencies between PREV and NEXT instructions. + Add 1 half-cycle if there is no dependency and + go to next cycle if there is some dependecy. 
*/ - if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1))) - { - changed = true; - x = legitimize_pic_address (x, 0); - } +static unsigned int +increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance) +{ + df_ref def, use; - if (changed && ix86_legitimate_address_p (mode, x, false)) - return x; + if (!prev || !next) + return distance + (distance & 1) + 2; - if (REG_P (XEXP (x, 0))) - { - rtx temp = gen_reg_rtx (Pmode); - rtx val = force_operand (XEXP (x, 1), temp); - if (val != temp) - { - val = convert_to_mode (Pmode, val, 1); - emit_move_insn (temp, val); - } + if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev)) + return distance + 1; - XEXP (x, 1) = temp; - return x; - } + FOR_EACH_INSN_USE (use, next) + FOR_EACH_INSN_DEF (def, prev) + if (!DF_REF_IS_ARTIFICIAL (def) + && DF_REF_REGNO (use) == DF_REF_REGNO (def)) + return distance + (distance & 1) + 2; - else if (REG_P (XEXP (x, 1))) + return distance + 1; +} + +/* Function checks if instruction INSN defines register number + REGNO1 or REGNO2. */ + +bool +insn_defines_reg (unsigned int regno1, unsigned int regno2, + rtx_insn *insn) +{ + df_ref def; + + FOR_EACH_INSN_DEF (def, insn) + if (DF_REF_REG_DEF_P (def) + && !DF_REF_IS_ARTIFICIAL (def) + && (regno1 == DF_REF_REGNO (def) + || regno2 == DF_REF_REGNO (def))) + return true; + + return false; +} + +/* Function checks if instruction INSN uses register number + REGNO as a part of address expression. */ + +static bool +insn_uses_reg_mem (unsigned int regno, rtx insn) +{ + df_ref use; + + FOR_EACH_INSN_USE (use, insn) + if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use)) + return true; + + return false; +} + +/* Search backward for non-agu definition of register number REGNO1 + or register number REGNO2 in basic block starting from instruction + START up to head of basic block or instruction INSN. + + Function puts true value into *FOUND var if definition was found + and false otherwise. + + Distance in half-cycles between START and found instruction or head + of BB is added to DISTANCE and returned. */ + +static int +distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2, + rtx_insn *insn, int distance, + rtx_insn *start, bool *found) +{ + basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL; + rtx_insn *prev = start; + rtx_insn *next = NULL; + + *found = false; + + while (prev + && prev != insn + && distance < LEA_SEARCH_THRESHOLD) + { + if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev)) { - rtx temp = gen_reg_rtx (Pmode); - rtx val = force_operand (XEXP (x, 0), temp); - if (val != temp) + distance = increase_distance (prev, next, distance); + if (insn_defines_reg (regno1, regno2, prev)) { - val = convert_to_mode (Pmode, val, 1); - emit_move_insn (temp, val); + if (recog_memoized (prev) < 0 + || get_attr_type (prev) != TYPE_LEA) + { + *found = true; + return distance; + } } - XEXP (x, 0) = temp; - return x; + next = prev; } + if (prev == BB_HEAD (bb)) + break; + + prev = PREV_INSN (prev); } - return x; + return distance; } - -/* Print an integer constant expression in assembler syntax. Addition - and subtraction are the only arithmetic that may appear in these - expressions. FILE is the stdio stream to write to, X is the rtx, and - CODE is the operand print code from the output string. */ -static void -output_pic_addr_const (FILE *file, rtx x, int code) +/* Search backward for non-agu definition of register number REGNO1 + or register number REGNO2 in INSN's basic block until + 1. Pass LEA_SEARCH_THRESHOLD instructions, or + 2. 
Reach neighbor BBs boundary, or + 3. Reach agu definition. + Returns the distance between the non-agu definition point and INSN. + If no definition point, returns -1. */ + +static int +distance_non_agu_define (unsigned int regno1, unsigned int regno2, + rtx_insn *insn) { - char buf[256]; + basic_block bb = BLOCK_FOR_INSN (insn); + int distance = 0; + bool found = false; - switch (GET_CODE (x)) + if (insn != BB_HEAD (bb)) + distance = distance_non_agu_define_in_bb (regno1, regno2, insn, + distance, PREV_INSN (insn), + &found); + + if (!found && distance < LEA_SEARCH_THRESHOLD) { - case PC: - gcc_assert (flag_pic); - putc ('.', file); - break; + edge e; + edge_iterator ei; + bool simple_loop = false; - case SYMBOL_REF: - if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS) - output_addr_const (file, x); + FOR_EACH_EDGE (e, ei, bb->preds) + if (e->src == bb) + { + simple_loop = true; + break; + } + + if (simple_loop) + distance = distance_non_agu_define_in_bb (regno1, regno2, + insn, distance, + BB_END (bb), &found); else { - const char *name = XSTR (x, 0); + int shortest_dist = -1; + bool found_in_bb = false; - /* Mark the decl as referenced so that cgraph will - output the function. */ - if (SYMBOL_REF_DECL (x)) - mark_decl_referenced (SYMBOL_REF_DECL (x)); + FOR_EACH_EDGE (e, ei, bb->preds) + { + int bb_dist + = distance_non_agu_define_in_bb (regno1, regno2, + insn, distance, + BB_END (e->src), + &found_in_bb); + if (found_in_bb) + { + if (shortest_dist < 0) + shortest_dist = bb_dist; + else if (bb_dist > 0) + shortest_dist = MIN (bb_dist, shortest_dist); -#if TARGET_MACHO - if (MACHOPIC_INDIRECT - && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION) - name = machopic_indirection_name (x, /*stub_p=*/true); -#endif - assemble_name (file, name); + found = true; + } + } + + distance = shortest_dist; } - if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF) - && code == 'P' && ! SYMBOL_REF_LOCAL_P (x)) - fputs ("@PLT", file); - break; + } - case LABEL_REF: - x = XEXP (x, 0); - /* FALLTHRU */ - case CODE_LABEL: - ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x)); - assemble_name (asm_out_file, buf); - break; + /* get_attr_type may modify recog data. We want to make sure + that recog data is valid for instruction INSN, on which + distance_non_agu_define is called. INSN is unchanged here. */ + extract_insn_cached (insn); - case CONST_INT: - fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); - break; + if (!found) + return -1; - case CONST: - /* This used to output parentheses around the expression, - but that does not work on the 386 (either ATT or BSD assembler). */ - output_pic_addr_const (file, XEXP (x, 0), code); - break; + return distance >> 1; +} - case CONST_DOUBLE: - /* We can't handle floating point constants; - TARGET_PRINT_OPERAND must handle them. */ - output_operand_lossage ("floating constant misused"); - break; +/* Return the distance in half-cycles between INSN and the next + insn that uses register number REGNO in memory address added + to DISTANCE. Return -1 if REGNO0 is set. - case PLUS: - /* Some assemblers need integer constants to appear first. 
*/ - if (CONST_INT_P (XEXP (x, 0))) - { - output_pic_addr_const (file, XEXP (x, 0), code); - putc ('+', file); - output_pic_addr_const (file, XEXP (x, 1), code); - } - else - { - gcc_assert (CONST_INT_P (XEXP (x, 1))); - output_pic_addr_const (file, XEXP (x, 1), code); - putc ('+', file); - output_pic_addr_const (file, XEXP (x, 0), code); - } - break; + Put true value into *FOUND if register usage was found and + false otherwise. + Put true value into *REDEFINED if register redefinition was + found and false otherwise. */ - case MINUS: - if (!TARGET_MACHO) - putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file); - output_pic_addr_const (file, XEXP (x, 0), code); - putc ('-', file); - output_pic_addr_const (file, XEXP (x, 1), code); - if (!TARGET_MACHO) - putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file); - break; +static int +distance_agu_use_in_bb (unsigned int regno, + rtx_insn *insn, int distance, rtx_insn *start, + bool *found, bool *redefined) +{ + basic_block bb = NULL; + rtx_insn *next = start; + rtx_insn *prev = NULL; - case UNSPEC: - gcc_assert (XVECLEN (x, 0) == 1); - output_pic_addr_const (file, XVECEXP (x, 0, 0), code); - switch (XINT (x, 1)) - { - case UNSPEC_GOT: - fputs ("@GOT", file); - break; - case UNSPEC_GOTOFF: - fputs ("@GOTOFF", file); - break; - case UNSPEC_PLTOFF: - fputs ("@PLTOFF", file); - break; - case UNSPEC_PCREL: - fputs (ASSEMBLER_DIALECT == ASM_ATT ? - "(%rip)" : "[rip]", file); - break; - case UNSPEC_GOTPCREL: - fputs (ASSEMBLER_DIALECT == ASM_ATT ? - "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file); - break; - case UNSPEC_GOTTPOFF: - /* FIXME: This might be @TPOFF in Sun ld too. */ - fputs ("@gottpoff", file); - break; - case UNSPEC_TPOFF: - fputs ("@tpoff", file); - break; - case UNSPEC_NTPOFF: - if (TARGET_64BIT) - fputs ("@tpoff", file); - else - fputs ("@ntpoff", file); - break; - case UNSPEC_DTPOFF: - fputs ("@dtpoff", file); - break; - case UNSPEC_GOTNTPOFF: - if (TARGET_64BIT) - fputs (ASSEMBLER_DIALECT == ASM_ATT ? - "@gottpoff(%rip)": "@gottpoff[rip]", file); - else - fputs ("@gotntpoff", file); - break; - case UNSPEC_INDNTPOFF: - fputs ("@indntpoff", file); - break; -#if TARGET_MACHO - case UNSPEC_MACHOPIC_OFFSET: - putc ('-', file); - machopic_output_function_base_name (file); - break; -#endif - default: - output_operand_lossage ("invalid UNSPEC as operand"); - break; - } - break; + *found = false; + *redefined = false; - default: - output_operand_lossage ("invalid expression as operand"); + if (start != NULL_RTX) + { + bb = BLOCK_FOR_INSN (start); + if (start != BB_HEAD (bb)) + /* If insn and start belong to the same bb, set prev to insn, + so the call to increase_distance will increase the distance + between insns by 1. */ + prev = insn; } -} -/* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL. - We need to emit DTP-relative relocations. */ - -static void ATTRIBUTE_UNUSED -i386_output_dwarf_dtprel (FILE *file, int size, rtx x) -{ - fputs (ASM_LONG, file); - output_addr_const (file, x); - fputs ("@dtpoff", file); - switch (size) + while (next + && next != insn + && distance < LEA_SEARCH_THRESHOLD) { - case 4: - break; - case 8: - fputs (", 0", file); - break; - default: - gcc_unreachable (); - } -} + if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next)) + { + distance = increase_distance(prev, next, distance); + if (insn_uses_reg_mem (regno, next)) + { + /* Return DISTANCE if OP0 is used in memory + address in NEXT. */ + *found = true; + return distance; + } -/* Return true if X is a representation of the PIC register. 
This copes - with calls from ix86_find_base_term, where the register might have - been replaced by a cselib value. */ + if (insn_defines_reg (regno, INVALID_REGNUM, next)) + { + /* Return -1 if OP0 is set in NEXT. */ + *redefined = true; + return -1; + } -static bool -ix86_pic_register_p (rtx x) -{ - if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x)) - return (pic_offset_table_rtx - && rtx_equal_for_cselib_p (x, pic_offset_table_rtx)); - else if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SET_GOT) - return true; - else if (!REG_P (x)) - return false; - else if (pic_offset_table_rtx) - { - if (REGNO (x) == REGNO (pic_offset_table_rtx)) - return true; - if (HARD_REGISTER_P (x) - && !HARD_REGISTER_P (pic_offset_table_rtx) - && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx)) - return true; - return false; + prev = next; + } + + if (next == BB_END (bb)) + break; + + next = NEXT_INSN (next); } - else - return REGNO (x) == PIC_OFFSET_TABLE_REGNUM; + + return distance; } -/* Helper function for ix86_delegitimize_address. - Attempt to delegitimize TLS local-exec accesses. */ +/* Return the distance between INSN and the next insn that uses + register number REGNO0 in memory address. Return -1 if no such + a use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */ -static rtx -ix86_delegitimize_tls_address (rtx orig_x) +static int +distance_agu_use (unsigned int regno0, rtx_insn *insn) { - rtx x = orig_x, unspec; - struct ix86_address addr; + basic_block bb = BLOCK_FOR_INSN (insn); + int distance = 0; + bool found = false; + bool redefined = false; - if (!TARGET_TLS_DIRECT_SEG_REFS) - return orig_x; - if (MEM_P (x)) - x = XEXP (x, 0); - if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode) - return orig_x; - if (ix86_decompose_address (x, &addr) == 0 - || addr.seg != DEFAULT_TLS_SEG_REG - || addr.disp == NULL_RTX - || GET_CODE (addr.disp) != CONST) - return orig_x; - unspec = XEXP (addr.disp, 0); - if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1))) - unspec = XEXP (unspec, 0); - if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF) - return orig_x; - x = XVECEXP (unspec, 0, 0); - gcc_assert (GET_CODE (x) == SYMBOL_REF); - if (unspec != XEXP (addr.disp, 0)) - x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1)); - if (addr.index) - { - rtx idx = addr.index; - if (addr.scale != 1) - idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale)); - x = gen_rtx_PLUS (Pmode, idx, x); - } - if (addr.base) - x = gen_rtx_PLUS (Pmode, addr.base, x); - if (MEM_P (orig_x)) - x = replace_equiv_address_nv (orig_x, x); - return x; -} + if (insn != BB_END (bb)) + distance = distance_agu_use_in_bb (regno0, insn, distance, + NEXT_INSN (insn), + &found, &redefined); -/* In the name of slightly smaller debug output, and to cater to - general assembler lossage, recognize PIC+GOTOFF and turn it back - into a direct symbol reference. + if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD) + { + edge e; + edge_iterator ei; + bool simple_loop = false; - On Darwin, this is necessary to avoid a crash, because Darwin - has a different PIC label for each routine but the DWARF debugging - information is not associated with any particular routine, so it's - necessary to remove references to the PIC label from RTL stored by - the DWARF output code. - - This helper is used in the normal ix86_delegitimize_address - entrypoint (e.g. used in the target delegitimization hook) and - in ix86_find_base_term. 
As compile time memory optimization, we - avoid allocating rtxes that will not change anything on the outcome - of the callers (find_base_value and find_base_term). */ + FOR_EACH_EDGE (e, ei, bb->succs) + if (e->dest == bb) + { + simple_loop = true; + break; + } -static inline rtx -ix86_delegitimize_address_1 (rtx x, bool base_term_p) -{ - rtx orig_x = delegitimize_mem_from_attrs (x); - /* addend is NULL or some rtx if x is something+GOTOFF where - something doesn't include the PIC register. */ - rtx addend = NULL_RTX; - /* reg_addend is NULL or a multiple of some register. */ - rtx reg_addend = NULL_RTX; - /* const_addend is NULL or a const_int. */ - rtx const_addend = NULL_RTX; - /* This is the result, or NULL. */ - rtx result = NULL_RTX; + if (simple_loop) + distance = distance_agu_use_in_bb (regno0, insn, + distance, BB_HEAD (bb), + &found, &redefined); + else + { + int shortest_dist = -1; + bool found_in_bb = false; + bool redefined_in_bb = false; - x = orig_x; + FOR_EACH_EDGE (e, ei, bb->succs) + { + int bb_dist + = distance_agu_use_in_bb (regno0, insn, + distance, BB_HEAD (e->dest), + &found_in_bb, &redefined_in_bb); + if (found_in_bb) + { + if (shortest_dist < 0) + shortest_dist = bb_dist; + else if (bb_dist > 0) + shortest_dist = MIN (bb_dist, shortest_dist); - if (MEM_P (x)) - x = XEXP (x, 0); + found = true; + } + } - if (TARGET_64BIT) - { - if (GET_CODE (x) == CONST - && GET_CODE (XEXP (x, 0)) == PLUS - && GET_MODE (XEXP (x, 0)) == Pmode - && CONST_INT_P (XEXP (XEXP (x, 0), 1)) - && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC - && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL) - { - /* find_base_{value,term} only care about MEMs with arg_pointer_rtx - base. A CONST can't be arg_pointer_rtx based. */ - if (base_term_p && MEM_P (orig_x)) - return orig_x; - rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0); - x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2); - if (MEM_P (orig_x)) - x = replace_equiv_address_nv (orig_x, x); - return x; + distance = shortest_dist; } + } - if (GET_CODE (x) == CONST - && GET_CODE (XEXP (x, 0)) == UNSPEC - && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL - || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL) - && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)) - { - x = XVECEXP (XEXP (x, 0), 0, 0); - if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x)) - { - x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x)); - if (x == NULL_RTX) - return orig_x; - } - return x; - } + if (!found || redefined) + return -1; - if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC) - return ix86_delegitimize_tls_address (orig_x); + return distance >> 1; +} - /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic - and -mcmodel=medium -fpic. */ - } +/* Define this macro to tune LEA priority vs ADD, it take effect when + there is a dilemma of choicing LEA or ADD + Negative value: ADD is more preferred than LEA + Zero: Netrual + Positive value: LEA is more preferred than ADD*/ +#define IX86_LEA_PRIORITY 0 - if (GET_CODE (x) != PLUS - || GET_CODE (XEXP (x, 1)) != CONST) - return ix86_delegitimize_tls_address (orig_x); +/* Return true if usage of lea INSN has performance advantage + over a sequence of instructions. Instructions sequence has + SPLIT_COST cycles higher latency than lea latency. 
*/ - if (ix86_pic_register_p (XEXP (x, 0))) - /* %ebx + GOT/GOTOFF */ - ; - else if (GET_CODE (XEXP (x, 0)) == PLUS) - { - /* %ebx + %reg * scale + GOT/GOTOFF */ - reg_addend = XEXP (x, 0); - if (ix86_pic_register_p (XEXP (reg_addend, 0))) - reg_addend = XEXP (reg_addend, 1); - else if (ix86_pic_register_p (XEXP (reg_addend, 1))) - reg_addend = XEXP (reg_addend, 0); - else - { - reg_addend = NULL_RTX; - addend = XEXP (x, 0); - } - } - else - addend = XEXP (x, 0); +static bool +ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1, + unsigned int regno2, int split_cost, bool has_scale) +{ + int dist_define, dist_use; - x = XEXP (XEXP (x, 1), 0); - if (GET_CODE (x) == PLUS - && CONST_INT_P (XEXP (x, 1))) + /* For Silvermont if using a 2-source or 3-source LEA for + non-destructive destination purposes, or due to wanting + ability to use SCALE, the use of LEA is justified. */ + if (TARGET_SILVERMONT || TARGET_GOLDMONT || TARGET_GOLDMONT_PLUS + || TARGET_TREMONT || TARGET_INTEL) { - const_addend = XEXP (x, 1); - x = XEXP (x, 0); + if (has_scale) + return true; + if (split_cost < 1) + return false; + if (regno0 == regno1 || regno0 == regno2) + return false; + return true; } - if (GET_CODE (x) == UNSPEC - && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend) - || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x)) - || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC - && !MEM_P (orig_x) && !addend))) - result = XVECEXP (x, 0, 0); - - if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x) - && !MEM_P (orig_x)) - result = XVECEXP (x, 0, 0); - - if (! result) - return ix86_delegitimize_tls_address (orig_x); + dist_define = distance_non_agu_define (regno1, regno2, insn); + dist_use = distance_agu_use (regno0, insn); - /* For (PLUS something CONST_INT) both find_base_{value,term} just - recurse on the first operand. */ - if (const_addend && !base_term_p) - result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend)); - if (reg_addend) - result = gen_rtx_PLUS (Pmode, reg_addend, result); - if (addend) + if (dist_define < 0 || dist_define >= LEA_MAX_STALL) { - /* If the rest of original X doesn't involve the PIC register, add - addend and subtract pic_offset_table_rtx. This can happen e.g. - for code like: - leal (%ebx, %ecx, 4), %ecx - ... - movl foo@GOTOFF(%ecx), %edx - in which case we return (%ecx - %ebx) + foo - or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg - and reload has completed. Don't do the latter for debug, - as _GLOBAL_OFFSET_TABLE_ can't be expressed in the assembly. */ - if (pic_offset_table_rtx - && (!reload_completed || !ix86_use_pseudo_pic_reg ())) - result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend), - pic_offset_table_rtx), - result); - else if (base_term_p - && pic_offset_table_rtx - && !TARGET_MACHO - && !TARGET_VXWORKS_RTP) - { - rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME); - tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp); - result = gen_rtx_PLUS (Pmode, tmp, result); - } + /* If there is no non AGU operand definition, no AGU + operand usage and split cost is 0 then both lea + and non lea variants have same priority. Currently + we prefer lea for 64 bit code and non lea on 32 bit + code. 
*/ + if (dist_use < 0 && split_cost == 0) + return TARGET_64BIT || IX86_LEA_PRIORITY; else - return orig_x; - } - if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x)) - { - result = lowpart_subreg (GET_MODE (orig_x), result, Pmode); - if (result == NULL_RTX) - return orig_x; + return true; } - return result; -} -/* The normal instantiation of the above template. */ + /* With longer definitions distance lea is more preferable. + Here we change it to take into account splitting cost and + lea priority. */ + dist_define += split_cost + IX86_LEA_PRIORITY; -static rtx -ix86_delegitimize_address (rtx x) -{ - return ix86_delegitimize_address_1 (x, false); + /* If there is no use in memory addess then we just check + that split cost exceeds AGU stall. */ + if (dist_use < 0) + return dist_define > LEA_MAX_STALL; + + /* If this insn has both backward non-agu dependence and forward + agu dependence, the one with short distance takes effect. */ + return dist_define >= dist_use; } -/* If X is a machine specific address (i.e. a symbol or label being - referenced as a displacement from the GOT implemented using an - UNSPEC), then return the base term. Otherwise return X. */ +/* Return true if it is legal to clobber flags by INSN and + false otherwise. */ -rtx -ix86_find_base_term (rtx x) +static bool +ix86_ok_to_clobber_flags (rtx_insn *insn) { - rtx term; + basic_block bb = BLOCK_FOR_INSN (insn); + df_ref use; + bitmap live; - if (TARGET_64BIT) + while (insn) { - if (GET_CODE (x) != CONST) - return x; - term = XEXP (x, 0); - if (GET_CODE (term) == PLUS - && CONST_INT_P (XEXP (term, 1))) - term = XEXP (term, 0); - if (GET_CODE (term) != UNSPEC - || (XINT (term, 1) != UNSPEC_GOTPCREL - && XINT (term, 1) != UNSPEC_PCREL)) - return x; + if (NONDEBUG_INSN_P (insn)) + { + FOR_EACH_INSN_USE (use, insn) + if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG) + return false; - return XVECEXP (term, 0, 0); + if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn)) + return true; + } + + if (insn == BB_END (bb)) + break; + + insn = NEXT_INSN (insn); } - return ix86_delegitimize_address_1 (x, true); + live = df_get_live_out(bb); + return !REGNO_REG_SET_P (live, FLAGS_REG); } -/* Return true if X shouldn't be emitted into the debug info. - Disallow UNSPECs other than @gotoff - we can't emit _GLOBAL_OFFSET_TABLE_ - symbol easily into the .debug_info section, so we need not to - delegitimize, but instead assemble as @gotoff. - Disallow _GLOBAL_OFFSET_TABLE_ SYMBOL_REF - the assembler magically - assembles that as _GLOBAL_OFFSET_TABLE_-. expression. */ +/* Return true if we need to split op0 = op1 + op2 into a sequence of + move and add to avoid AGU stalls. */ -static bool -ix86_const_not_ok_for_debug_p (rtx x) +bool +ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[]) { - if (GET_CODE (x) == UNSPEC && XINT (x, 1) != UNSPEC_GOTOFF) - return true; + unsigned int regno0, regno1, regno2; - if (SYMBOL_REF_P (x) && strcmp (XSTR (x, 0), GOT_SYMBOL_NAME) == 0) - return true; + /* Check if we need to optimize. */ + if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) + return false; - return false; + /* Check it is correct to split here. */ + if (!ix86_ok_to_clobber_flags(insn)) + return false; + + regno0 = true_regnum (operands[0]); + regno1 = true_regnum (operands[1]); + regno2 = true_regnum (operands[2]); + + /* We need to split only adds with non destructive + destination operand. 
*/ + if (regno0 == regno1 || regno0 == regno2) + return false; + else + return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false); } - -static void -put_condition_code (enum rtx_code code, machine_mode mode, bool reverse, - bool fp, FILE *file) -{ - const char *suffix; - if (mode == CCFPmode) - { - code = ix86_fp_compare_code_to_integer (code); - mode = CCmode; - } - if (reverse) - code = reverse_condition (code); +/* Return true if we should emit lea instruction instead of mov + instruction. */ - switch (code) - { - case EQ: - gcc_assert (mode != CCGZmode); - switch (mode) - { - case E_CCAmode: - suffix = "a"; - break; - case E_CCCmode: - suffix = "c"; - break; - case E_CCOmode: - suffix = "o"; - break; - case E_CCPmode: - suffix = "p"; - break; - case E_CCSmode: - suffix = "s"; - break; - default: - suffix = "e"; - break; - } - break; - case NE: - gcc_assert (mode != CCGZmode); - switch (mode) - { - case E_CCAmode: - suffix = "na"; - break; - case E_CCCmode: - suffix = "nc"; - break; - case E_CCOmode: - suffix = "no"; - break; - case E_CCPmode: - suffix = "np"; - break; - case E_CCSmode: - suffix = "ns"; - break; - default: - suffix = "ne"; - break; - } - break; - case GT: - gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode); - suffix = "g"; - break; - case GTU: - /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers. - Those same assemblers have the same but opposite lossage on cmov. */ - if (mode == CCmode) - suffix = fp ? "nbe" : "a"; - else - gcc_unreachable (); - break; - case LT: - switch (mode) - { - case E_CCNOmode: - case E_CCGOCmode: - suffix = "s"; - break; +bool +ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[]) +{ + unsigned int regno0, regno1; - case E_CCmode: - case E_CCGCmode: - case E_CCGZmode: - suffix = "l"; - break; + /* Check if we need to optimize. */ + if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) + return false; - default: - gcc_unreachable (); - } - break; - case LTU: - if (mode == CCmode || mode == CCGZmode) - suffix = "b"; - else if (mode == CCCmode) - suffix = fp ? "b" : "c"; - else - gcc_unreachable (); - break; - case GE: - switch (mode) - { - case E_CCNOmode: - case E_CCGOCmode: - suffix = "ns"; - break; + /* Use lea for reg to reg moves only. */ + if (!REG_P (operands[0]) || !REG_P (operands[1])) + return false; - case E_CCmode: - case E_CCGCmode: - case E_CCGZmode: - suffix = "ge"; - break; + regno0 = true_regnum (operands[0]); + regno1 = true_regnum (operands[1]); - default: - gcc_unreachable (); - } - break; - case GEU: - if (mode == CCmode || mode == CCGZmode) - suffix = "nb"; - else if (mode == CCCmode) - suffix = fp ? "nb" : "nc"; - else - gcc_unreachable (); - break; - case LE: - gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode); - suffix = "le"; - break; - case LEU: - if (mode == CCmode) - suffix = "be"; - else - gcc_unreachable (); - break; - case UNORDERED: - suffix = fp ? "u" : "p"; - break; - case ORDERED: - suffix = fp ? "nu" : "np"; - break; - default: - gcc_unreachable (); - } - fputs (suffix, file); + return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false); } -/* Print the name of register X to FILE based on its machine mode and number. - If CODE is 'w', pretend the mode is HImode. - If CODE is 'b', pretend the mode is QImode. - If CODE is 'k', pretend the mode is SImode. - If CODE is 'q', pretend the mode is DImode. - If CODE is 'x', pretend the mode is V4SFmode. - If CODE is 't', pretend the mode is V8SFmode. 
- If CODE is 'g', pretend the mode is V16SFmode. - If CODE is 'h', pretend the reg is the 'high' byte register. - If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op. - If CODE is 'd', duplicate the operand for AVX instruction. - If CODE is 'V', print naked full integer register name without %. - */ +/* Return true if we need to split lea into a sequence of + instructions to avoid AGU stalls. */ -void -print_reg (rtx x, int code, FILE *file) +bool +ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[]) { - const char *reg; - int msize; - unsigned int regno; - bool duplicated; - - if (ASSEMBLER_DIALECT == ASM_ATT && code != 'V') - putc ('%', file); + unsigned int regno0, regno1, regno2; + int split_cost; + struct ix86_address parts; + int ok; - if (x == pc_rtx) - { - gcc_assert (TARGET_64BIT); - fputs ("rip", file); - return; - } + /* Check we need to optimize. */ + if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun)) + return false; - if (code == 'y' && STACK_TOP_P (x)) - { - fputs ("st(0)", file); - return; - } + /* The "at least two components" test below might not catch simple + move or zero extension insns if parts.base is non-NULL and parts.disp + is const0_rtx as the only components in the address, e.g. if the + register is %rbp or %r13. As this test is much cheaper and moves or + zero extensions are the common case, do this check first. */ + if (REG_P (operands[1]) + || (SImode_address_operand (operands[1], VOIDmode) + && REG_P (XEXP (operands[1], 0)))) + return false; - if (code == 'w') - msize = 2; - else if (code == 'b') - msize = 1; - else if (code == 'k') - msize = 4; - else if (code == 'q') - msize = 8; - else if (code == 'h') - msize = 0; - else if (code == 'x') - msize = 16; - else if (code == 't') - msize = 32; - else if (code == 'g') - msize = 64; - else - msize = GET_MODE_SIZE (GET_MODE (x)); + /* Check if it is OK to split here. */ + if (!ix86_ok_to_clobber_flags (insn)) + return false; - regno = REGNO (x); + ok = ix86_decompose_address (operands[1], &parts); + gcc_assert (ok); - if (regno == ARG_POINTER_REGNUM - || regno == FRAME_POINTER_REGNUM - || regno == FPSR_REG) - { - output_operand_lossage - ("invalid use of register '%s'", reg_names[regno]); - return; - } - else if (regno == FLAGS_REG) - { - output_operand_lossage ("invalid use of asm flag output"); - return; - } + /* There should be at least two components in the address. */ + if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX) + + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2) + return false; - if (code == 'V') + /* We should not split into add if non legitimate pic + operand is used as displacement. */ + if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp)) + return false; + + regno0 = true_regnum (operands[0]) ; + regno1 = INVALID_REGNUM; + regno2 = INVALID_REGNUM; + + if (parts.base) + regno1 = true_regnum (parts.base); + if (parts.index) + regno2 = true_regnum (parts.index); + + split_cost = 0; + + /* Compute how many cycles we will add to execution time + if split lea into a sequence of instructions. */ + if (parts.base || parts.index) { - if (GENERAL_REGNO_P (regno)) - msize = GET_MODE_SIZE (word_mode); - else - error ("% modifier on non-integer register"); + /* Have to use mov instruction if non desctructive + destination form is used. */ + if (regno1 != regno0 && regno2 != regno0) + split_cost += 1; + + /* Have to add index to base if both exist. 
*/ + if (parts.base && parts.index) + split_cost += 1; + + /* Have to use shift and adds if scale is 2 or greater. */ + if (parts.scale > 1) + { + if (regno0 != regno1) + split_cost += 1; + else if (regno2 == regno0) + split_cost += 4; + else + split_cost += parts.scale; + } + + /* Have to use add instruction with immediate if + disp is non zero. */ + if (parts.disp && parts.disp != const0_rtx) + split_cost += 1; + + /* Subtract the price of lea. */ + split_cost -= 1; } - duplicated = code == 'd' && TARGET_AVX; + return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost, + parts.scale > 1); +} - switch (msize) +/* Return true if it is ok to optimize an ADD operation to LEA + operation to avoid flag register consumation. For most processors, + ADD is faster than LEA. For the processors like BONNELL, if the + destination register of LEA holds an actual address which will be + used soon, LEA is better and otherwise ADD is better. */ + +bool +ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[]) +{ + unsigned int regno0 = true_regnum (operands[0]); + unsigned int regno1 = true_regnum (operands[1]); + unsigned int regno2 = true_regnum (operands[2]); + + /* If a = b + c, (a!=b && a!=c), must use lea form. */ + if (regno0 != regno1 && regno0 != regno2) + return true; + + if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) + return false; + + return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false); +} + +/* Return true if destination reg of SET_BODY is shift count of + USE_BODY. */ + +static bool +ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body) +{ + rtx set_dest; + rtx shift_rtx; + int i; + + /* Retrieve destination of SET_BODY. */ + switch (GET_CODE (set_body)) { - case 16: - case 12: - case 8: - if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode)) - warning (0, "unsupported size for integer register"); - /* FALLTHRU */ - case 4: - if (LEGACY_INT_REGNO_P (regno)) - putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file); - /* FALLTHRU */ - case 2: - normal: - reg = hi_reg_name[regno]; - break; - case 1: - if (regno >= ARRAY_SIZE (qi_reg_name)) - goto normal; - if (!ANY_QI_REGNO_P (regno)) - error ("unsupported size for integer register"); - reg = qi_reg_name[regno]; - break; - case 0: - if (regno >= ARRAY_SIZE (qi_high_reg_name)) - goto normal; - reg = qi_high_reg_name[regno]; + case SET: + set_dest = SET_DEST (set_body); + if (!set_dest || !REG_P (set_dest)) + return false; break; - case 32: - case 64: - if (SSE_REGNO_P (regno)) - { - gcc_assert (!duplicated); - putc (msize == 32 ? 'y' : 'z', file); - reg = hi_reg_name[regno] + 1; - break; - } - goto normal; + case PARALLEL: + for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--) + if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i), + use_body)) + return true; + /* FALLTHROUGH */ default: - gcc_unreachable (); + return false; } - fputs (reg, file); + /* Retrieve shift count of USE_BODY. 
*/ + switch (GET_CODE (use_body)) + { + case SET: + shift_rtx = XEXP (use_body, 1); + break; + case PARALLEL: + for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--) + if (ix86_dep_by_shift_count_body (set_body, + XVECEXP (use_body, 0, i))) + return true; + /* FALLTHROUGH */ + default: + return false; + } - /* Irritatingly, AMD extended registers use - different naming convention: "r%d[bwd]" */ - if (REX_INT_REGNO_P (regno)) + if (shift_rtx + && (GET_CODE (shift_rtx) == ASHIFT + || GET_CODE (shift_rtx) == LSHIFTRT + || GET_CODE (shift_rtx) == ASHIFTRT + || GET_CODE (shift_rtx) == ROTATE + || GET_CODE (shift_rtx) == ROTATERT)) { - gcc_assert (TARGET_64BIT); - switch (msize) + rtx shift_count = XEXP (shift_rtx, 1); + + /* Return true if shift count is dest of SET_BODY. */ + if (REG_P (shift_count)) { - case 0: - error ("extended registers have no high halves"); - break; - case 1: - putc ('b', file); - break; - case 2: - putc ('w', file); - break; - case 4: - putc ('d', file); - break; - case 8: - /* no suffix */ - break; - default: - error ("unsupported operand size for extended register"); - break; + /* Add check since it can be invoked before register + allocation in pre-reload schedule. */ + if (reload_completed + && true_regnum (set_dest) == true_regnum (shift_count)) + return true; + else if (REGNO(set_dest) == REGNO(shift_count)) + return true; } - return; } - if (duplicated) - { - if (ASSEMBLER_DIALECT == ASM_ATT) - fprintf (file, ", %%%s", reg); - else - fprintf (file, ", %s", reg); - } + return false; } -/* Meaning of CODE: - L,W,B,Q,S,T -- print the opcode suffix for specified size of operand. - C -- print opcode suffix for set/cmov insn. - c -- like C, but print reversed condition - F,f -- likewise, but for floating-point. - O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.", - otherwise nothing - R -- print embedded rounding and sae. - r -- print only sae. - z -- print the opcode suffix for the size of the current operand. - Z -- likewise, with special suffixes for x87 instructions. - * -- print a star (in certain assembler syntax) - A -- print an absolute memory reference. - E -- print address with DImode register names if TARGET_64BIT. - w -- print the operand as if it's a "word" (HImode) even if it isn't. - s -- print a shift double count, followed by the assemblers argument - delimiter. - b -- print the QImode name of the register for the indicated operand. - %b0 would print %al if operands[0] is reg 0. - w -- likewise, print the HImode name of the register. - k -- likewise, print the SImode name of the register. - q -- likewise, print the DImode name of the register. - x -- likewise, print the V4SFmode name of the register. - t -- likewise, print the V8SFmode name of the register. - g -- likewise, print the V16SFmode name of the register. - h -- print the QImode name for a "high" register, either ah, bh, ch or dh. - y -- print "st(0)" instead of "st" as a register. - d -- print duplicated register operand for AVX instruction. - D -- print condition for SSE cmp instruction. - P -- if PIC, print an @PLT suffix. - p -- print raw symbol name. - X -- don't print any sort of PIC '@' suffix for a symbol. - & -- print some in-use local-dynamic symbol name. - H -- print a memory address offset by 8; used for sse high-parts - Y -- print condition for XOP pcom* instruction. - V -- print naked full integer register name without %. - + -- print a branch hint as 'cs' or 'ds' prefix - ; -- print a semicolon (after prefixes due to bug in older gas). 
- ~ -- print "i" if TARGET_AVX2, "f" otherwise. - ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode - M -- print addr32 prefix for TARGET_X32 with VSIB address. - ! -- print NOTRACK prefix for jxx/call/ret instructions if required. - */ +/* Return true if destination reg of SET_INSN is shift count of + USE_INSN. */ -void -ix86_print_operand (FILE *file, rtx x, int code) +bool +ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn) { - if (code) - { - switch (code) - { - case 'A': - switch (ASSEMBLER_DIALECT) - { - case ASM_ATT: - putc ('*', file); - break; - - case ASM_INTEL: - /* Intel syntax. For absolute addresses, registers should not - be surrounded by braces. */ - if (!REG_P (x)) - { - putc ('[', file); - ix86_print_operand (file, x, 0); - putc (']', file); - return; - } - break; + return ix86_dep_by_shift_count_body (PATTERN (set_insn), + PATTERN (use_insn)); +} - default: - gcc_unreachable (); - } +/* Return TRUE or FALSE depending on whether the unary operator meets the + appropriate constraints. */ - ix86_print_operand (file, x, 0); - return; +bool +ix86_unary_operator_ok (enum rtx_code, + machine_mode, + rtx operands[2]) +{ + /* If one of operands is memory, source and destination must match. */ + if ((MEM_P (operands[0]) + || MEM_P (operands[1])) + && ! rtx_equal_p (operands[0], operands[1])) + return false; + return true; +} - case 'E': - /* Wrap address in an UNSPEC to declare special handling. */ - if (TARGET_64BIT) - x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR); +/* Return TRUE if the operands to a vec_interleave_{high,low}v2df + are ok, keeping in mind the possible movddup alternative. */ - output_address (VOIDmode, x); - return; +bool +ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high) +{ + if (MEM_P (operands[0])) + return rtx_equal_p (operands[0], operands[1 + high]); + if (MEM_P (operands[1]) && MEM_P (operands[2])) + return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]); + return true; +} - case 'L': - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('l', file); - return; +/* A subroutine of ix86_build_signbit_mask. If VECT is true, + then replicate the value for all elements of the vector + register. */ - case 'W': - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('w', file); - return; +rtx +ix86_build_const_vector (machine_mode mode, bool vect, rtx value) +{ + int i, n_elt; + rtvec v; + machine_mode scalar_mode; - case 'B': - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('b', file); - return; + switch (mode) + { + case E_V64QImode: + case E_V32QImode: + case E_V16QImode: + case E_V32HImode: + case E_V16HImode: + case E_V8HImode: + case E_V16SImode: + case E_V8SImode: + case E_V4SImode: + case E_V8DImode: + case E_V4DImode: + case E_V2DImode: + gcc_assert (vect); + /* FALLTHRU */ + case E_V16SFmode: + case E_V8SFmode: + case E_V4SFmode: + case E_V8DFmode: + case E_V4DFmode: + case E_V2DFmode: + n_elt = GET_MODE_NUNITS (mode); + v = rtvec_alloc (n_elt); + scalar_mode = GET_MODE_INNER (mode); - case 'Q': - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('l', file); - return; + RTVEC_ELT (v, 0) = value; - case 'S': - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('s', file); - return; + for (i = 1; i < n_elt; ++i) + RTVEC_ELT (v, i) = vect ? 
value : CONST0_RTX (scalar_mode); - case 'T': - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('t', file); - return; + return gen_rtx_CONST_VECTOR (mode, v); - case 'O': -#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX - if (ASSEMBLER_DIALECT != ASM_ATT) - return; + default: + gcc_unreachable (); + } +} - switch (GET_MODE_SIZE (GET_MODE (x))) - { - case 2: - putc ('w', file); - break; - - case 4: - putc ('l', file); - break; +/* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders + and ix86_expand_int_vcond. Create a mask for the sign bit in MODE + for an SSE register. If VECT is true, then replicate the mask for + all elements of the vector register. If INVERT is true, then create + a mask excluding the sign bit. */ - case 8: - putc ('q', file); - break; +rtx +ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert) +{ + machine_mode vec_mode, imode; + wide_int w; + rtx mask, v; - default: - output_operand_lossage ("invalid operand size for operand " - "code 'O'"); - return; - } + switch (mode) + { + case E_V16SImode: + case E_V16SFmode: + case E_V8SImode: + case E_V4SImode: + case E_V8SFmode: + case E_V4SFmode: + vec_mode = mode; + imode = SImode; + break; - putc ('.', file); -#endif - return; + case E_V8DImode: + case E_V4DImode: + case E_V2DImode: + case E_V8DFmode: + case E_V4DFmode: + case E_V2DFmode: + vec_mode = mode; + imode = DImode; + break; - case 'z': - if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT) - { - /* Opcodes don't get size suffixes if using Intel opcodes. */ - if (ASSEMBLER_DIALECT == ASM_INTEL) - return; + case E_TImode: + case E_TFmode: + vec_mode = VOIDmode; + imode = TImode; + break; - switch (GET_MODE_SIZE (GET_MODE (x))) - { - case 1: - putc ('b', file); - return; + default: + gcc_unreachable (); + } - case 2: - putc ('w', file); - return; + machine_mode inner_mode = GET_MODE_INNER (mode); + w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1, + GET_MODE_BITSIZE (inner_mode)); + if (invert) + w = wi::bit_not (w); - case 4: - putc ('l', file); - return; + /* Force this value into the low part of a fp vector constant. */ + mask = immed_wide_int_const (w, imode); + mask = gen_lowpart (inner_mode, mask); - case 8: - putc ('q', file); - return; + if (vec_mode == VOIDmode) + return force_reg (inner_mode, mask); - default: - output_operand_lossage ("invalid operand size for operand " - "code 'z'"); - return; - } - } + v = ix86_build_const_vector (vec_mode, vect, mask); + return force_reg (vec_mode, v); +} - if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT) - warning (0, "non-integer operand used with operand code %"); - /* FALLTHRU */ +/* Return TRUE or FALSE depending on whether the first SET in INSN + has source and destination with matching CC modes, and that the + CC mode is at least as constrained as REQ_MODE. */ - case 'Z': - /* 387 opcodes don't get size suffixes if using Intel opcodes. 
*/ - if (ASSEMBLER_DIALECT == ASM_INTEL) - return; +bool +ix86_match_ccmode (rtx insn, machine_mode req_mode) +{ + rtx set; + machine_mode set_mode; - if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT) - { - switch (GET_MODE_SIZE (GET_MODE (x))) - { - case 2: -#ifdef HAVE_AS_IX86_FILDS - putc ('s', file); -#endif - return; + set = PATTERN (insn); + if (GET_CODE (set) == PARALLEL) + set = XVECEXP (set, 0, 0); + gcc_assert (GET_CODE (set) == SET); + gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE); - case 4: - putc ('l', file); - return; + set_mode = GET_MODE (SET_DEST (set)); + switch (set_mode) + { + case E_CCNOmode: + if (req_mode != CCNOmode + && (req_mode != CCmode + || XEXP (SET_SRC (set), 1) != const0_rtx)) + return false; + break; + case E_CCmode: + if (req_mode == CCGCmode) + return false; + /* FALLTHRU */ + case E_CCGCmode: + if (req_mode == CCGOCmode || req_mode == CCNOmode) + return false; + /* FALLTHRU */ + case E_CCGOCmode: + if (req_mode == CCZmode) + return false; + /* FALLTHRU */ + case E_CCZmode: + break; - case 8: -#ifdef HAVE_AS_IX86_FILDQ - putc ('q', file); -#else - fputs ("ll", file); -#endif - return; + case E_CCGZmode: - default: - break; - } - } - else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT) - { - /* 387 opcodes don't get size suffixes - if the operands are registers. */ - if (STACK_REG_P (x)) - return; + case E_CCAmode: + case E_CCCmode: + case E_CCOmode: + case E_CCPmode: + case E_CCSmode: + if (set_mode != req_mode) + return false; + break; - switch (GET_MODE_SIZE (GET_MODE (x))) - { - case 4: - putc ('s', file); - return; + default: + gcc_unreachable (); + } - case 8: - putc ('l', file); - return; + return GET_MODE (SET_SRC (set)) == set_mode; +} - case 12: - case 16: - putc ('t', file); - return; +machine_mode +ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1) +{ + machine_mode mode = GET_MODE (op0); - default: - break; - } - } - else - { - output_operand_lossage ("invalid operand type used with " - "operand code 'Z'"); - return; - } + if (SCALAR_FLOAT_MODE_P (mode)) + { + gcc_assert (!DECIMAL_FLOAT_MODE_P (mode)); + return CCFPmode; + } - output_operand_lossage ("invalid operand size for operand code 'Z'"); - return; + switch (code) + { + /* Only zero flag is needed. */ + case EQ: /* ZF=0 */ + case NE: /* ZF!=0 */ + return CCZmode; + /* Codes needing carry flag. */ + case GEU: /* CF=0 */ + case LTU: /* CF=1 */ + /* Detect overflow checks. They need just the carry flag. */ + if (GET_CODE (op0) == PLUS + && (rtx_equal_p (op1, XEXP (op0, 0)) + || rtx_equal_p (op1, XEXP (op0, 1)))) + return CCCmode; + else + return CCmode; + case GTU: /* CF=0 & ZF=0 */ + case LEU: /* CF=1 | ZF=1 */ + return CCmode; + /* Codes possibly doable only with sign flag when + comparing against zero. */ + case GE: /* SF=OF or SF=0 */ + case LT: /* SF<>OF or SF=1 */ + if (op1 == const0_rtx) + return CCGOCmode; + else + /* For other cases Carry flag is not required. */ + return CCGCmode; + /* Codes doable only with sign flag when comparing + against zero, but we miss jump instruction for it + so we need to use relational tests against overflow + that thus needs to be zero. */ + case GT: /* ZF=0 & SF=OF */ + case LE: /* ZF=1 | SF<>OF */ + if (op1 == const0_rtx) + return CCNOmode; + else + return CCGCmode; + /* strcmp pattern do (use flags) and combine may ask us for proper + mode. 
*/ + case USE: + return CCmode; + default: + gcc_unreachable (); + } +} - case 'd': - case 'b': - case 'w': - case 'k': - case 'q': - case 'h': - case 't': - case 'g': - case 'y': - case 'x': - case 'X': - case 'P': - case 'p': - case 'V': - break; +/* Return the fixed registers used for condition codes. */ - case 's': - if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT) - { - ix86_print_operand (file, x, 0); - fputs (", ", file); - } - return; +static bool +ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2) +{ + *p1 = FLAGS_REG; + *p2 = INVALID_REGNUM; + return true; +} - case 'Y': - switch (GET_CODE (x)) - { - case NE: - fputs ("neq", file); - break; - case EQ: - fputs ("eq", file); - break; - case GE: - case GEU: - fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file); - break; - case GT: - case GTU: - fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file); - break; - case LE: - case LEU: - fputs ("le", file); - break; - case LT: - case LTU: - fputs ("lt", file); - break; - case UNORDERED: - fputs ("unord", file); - break; - case ORDERED: - fputs ("ord", file); - break; - case UNEQ: - fputs ("ueq", file); - break; - case UNGE: - fputs ("nlt", file); - break; - case UNGT: - fputs ("nle", file); - break; - case UNLE: - fputs ("ule", file); - break; - case UNLT: - fputs ("ult", file); - break; - case LTGT: - fputs ("une", file); - break; - default: - output_operand_lossage ("operand is not a condition code, " - "invalid operand code 'Y'"); - return; - } - return; +/* If two condition code modes are compatible, return a condition code + mode which is compatible with both. Otherwise, return + VOIDmode. */ - case 'D': - /* Little bit of braindamage here. The SSE compare instructions - does use completely different names for the comparisons that the - fp conditional moves. 
*/ - switch (GET_CODE (x)) - { - case UNEQ: - if (TARGET_AVX) - { - fputs ("eq_us", file); - break; - } - /* FALLTHRU */ - case EQ: - fputs ("eq", file); - break; - case UNLT: - if (TARGET_AVX) - { - fputs ("nge", file); - break; - } - /* FALLTHRU */ - case LT: - fputs ("lt", file); - break; - case UNLE: - if (TARGET_AVX) - { - fputs ("ngt", file); - break; - } - /* FALLTHRU */ - case LE: - fputs ("le", file); - break; - case UNORDERED: - fputs ("unord", file); - break; - case LTGT: - if (TARGET_AVX) - { - fputs ("neq_oq", file); - break; - } - /* FALLTHRU */ - case NE: - fputs ("neq", file); - break; - case GE: - if (TARGET_AVX) - { - fputs ("ge", file); - break; - } - /* FALLTHRU */ - case UNGE: - fputs ("nlt", file); - break; - case GT: - if (TARGET_AVX) - { - fputs ("gt", file); - break; - } - /* FALLTHRU */ - case UNGT: - fputs ("nle", file); - break; - case ORDERED: - fputs ("ord", file); - break; - default: - output_operand_lossage ("operand is not a condition code, " - "invalid operand code 'D'"); - return; - } - return; +static machine_mode +ix86_cc_modes_compatible (machine_mode m1, machine_mode m2) +{ + if (m1 == m2) + return m1; - case 'F': - case 'f': -#ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('.', file); - gcc_fallthrough (); -#endif + if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC) + return VOIDmode; - case 'C': - case 'c': - if (!COMPARISON_P (x)) - { - output_operand_lossage ("operand is not a condition code, " - "invalid operand code '%c'", code); - return; - } - put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), - code == 'c' || code == 'f', - code == 'F' || code == 'f', - file); - return; + if ((m1 == CCGCmode && m2 == CCGOCmode) + || (m1 == CCGOCmode && m2 == CCGCmode)) + return CCGCmode; - case 'H': - if (!offsettable_memref_p (x)) - { - output_operand_lossage ("operand is not an offsettable memory " - "reference, invalid operand code 'H'"); - return; - } - /* It doesn't actually matter what mode we use here, as we're - only going to use this for printing. */ - x = adjust_address_nv (x, DImode, 8); - /* Output 'qword ptr' for intel assembler dialect. */ - if (ASSEMBLER_DIALECT == ASM_INTEL) - code = 'q'; - break; + if ((m1 == CCNOmode && m2 == CCGOCmode) + || (m1 == CCGOCmode && m2 == CCNOmode)) + return CCNOmode; - case 'K': - if (!CONST_INT_P (x)) - { - output_operand_lossage ("operand is not an integer, invalid " - "operand code 'K'"); - return; - } + if (m1 == CCZmode + && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode)) + return m2; + else if (m2 == CCZmode + && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode)) + return m1; - if (INTVAL (x) & IX86_HLE_ACQUIRE) -#ifdef HAVE_AS_IX86_HLE - fputs ("xacquire ", file); -#else - fputs ("\n" ASM_BYTE "0xf2\n\t", file); -#endif - else if (INTVAL (x) & IX86_HLE_RELEASE) -#ifdef HAVE_AS_IX86_HLE - fputs ("xrelease ", file); -#else - fputs ("\n" ASM_BYTE "0xf3\n\t", file); -#endif - /* We do not want to print value of the operand. 
*/ - return; + switch (m1) + { + default: + gcc_unreachable (); - case 'N': - if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x))) - fputs ("{z}", file); - return; + case E_CCmode: + case E_CCGCmode: + case E_CCGOCmode: + case E_CCNOmode: + case E_CCAmode: + case E_CCCmode: + case E_CCOmode: + case E_CCPmode: + case E_CCSmode: + case E_CCZmode: + switch (m2) + { + default: + return VOIDmode; - case 'r': - if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE) - { - output_operand_lossage ("operand is not a specific integer, " - "invalid operand code 'r'"); - return; - } + case E_CCmode: + case E_CCGCmode: + case E_CCGOCmode: + case E_CCNOmode: + case E_CCAmode: + case E_CCCmode: + case E_CCOmode: + case E_CCPmode: + case E_CCSmode: + case E_CCZmode: + return CCmode; + } - if (ASSEMBLER_DIALECT == ASM_INTEL) - fputs (", ", file); + case E_CCFPmode: + /* These are only compatible with themselves, which we already + checked above. */ + return VOIDmode; + } +} - fputs ("{sae}", file); +/* Return strategy to use for floating-point. We assume that fcomi is always + preferrable where available, since that is also true when looking at size + (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */ - if (ASSEMBLER_DIALECT == ASM_ATT) - fputs (", ", file); +enum ix86_fpcmp_strategy +ix86_fp_comparison_strategy (enum rtx_code) +{ + /* Do fcomi/sahf based test when profitable. */ - return; + if (TARGET_CMOVE) + return IX86_FPCMP_COMI; - case 'R': - if (!CONST_INT_P (x)) - { - output_operand_lossage ("operand is not an integer, invalid " - "operand code 'R'"); - return; - } + if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ())) + return IX86_FPCMP_SAHF; - if (ASSEMBLER_DIALECT == ASM_INTEL) - fputs (", ", file); + return IX86_FPCMP_ARITH; +} - switch (INTVAL (x)) - { - case ROUND_NEAREST_INT | ROUND_SAE: - fputs ("{rn-sae}", file); - break; - case ROUND_NEG_INF | ROUND_SAE: - fputs ("{rd-sae}", file); - break; - case ROUND_POS_INF | ROUND_SAE: - fputs ("{ru-sae}", file); - break; - case ROUND_ZERO | ROUND_SAE: - fputs ("{rz-sae}", file); - break; - default: - output_operand_lossage ("operand is not a specific integer, " - "invalid operand code 'R'"); - } - - if (ASSEMBLER_DIALECT == ASM_ATT) - fputs (", ", file); - - return; - - case '*': - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('*', file); - return; - - case '&': - { - const char *name = get_some_local_dynamic_name (); - if (name == NULL) - output_operand_lossage ("'%%&' used without any " - "local dynamic TLS references"); - else - assemble_name (file, name); - return; - } +/* Convert comparison codes we use to represent FP comparison to integer + code that will result in proper branch. Return UNKNOWN if no such code + is available. */ - case '+': - { - rtx x; +enum rtx_code +ix86_fp_compare_code_to_integer (enum rtx_code code) +{ + switch (code) + { + case GT: + return GTU; + case GE: + return GEU; + case ORDERED: + case UNORDERED: + return code; + case UNEQ: + return EQ; + case UNLT: + return LTU; + case UNLE: + return LEU; + case LTGT: + return NE; + default: + return UNKNOWN; + } +} - if (!optimize - || optimize_function_for_size_p (cfun) - || !TARGET_BRANCH_PREDICTION_HINTS) - return; +/* Zero extend possibly SImode EXP to Pmode register. 
*/ +rtx +ix86_zero_extend_to_Pmode (rtx exp) +{ + return force_reg (Pmode, convert_to_mode (Pmode, exp, 1)); +} - x = find_reg_note (current_output_insn, REG_BR_PROB, 0); - if (x) - { - int pred_val = profile_probability::from_reg_br_prob_note - (XINT (x, 0)).to_reg_br_prob_base (); +/* Return true if the function being called was marked with attribute + "noplt" or using -fno-plt and we are compiling for non-PIC. We need + to handle the non-PIC case in the backend because there is no easy + interface for the front-end to force non-PLT calls to use the GOT. + This is currently used only with 64-bit or 32-bit GOT32X ELF targets + to call the function marked "noplt" indirectly. */ - if (pred_val < REG_BR_PROB_BASE * 45 / 100 - || pred_val > REG_BR_PROB_BASE * 55 / 100) - { - bool taken = pred_val > REG_BR_PROB_BASE / 2; - bool cputaken - = final_forward_branch_p (current_output_insn) == 0; +static bool +ix86_nopic_noplt_attribute_p (rtx call_op) +{ + if (flag_pic || ix86_cmodel == CM_LARGE + || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X) + || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF + || SYMBOL_REF_LOCAL_P (call_op)) + return false; - /* Emit hints only in the case default branch prediction - heuristics would fail. */ - if (taken != cputaken) - { - /* We use 3e (DS) prefix for taken branches and - 2e (CS) prefix for not taken branches. */ - if (taken) - fputs ("ds ; ", file); - else - fputs ("cs ; ", file); - } - } - } - return; - } + tree symbol_decl = SYMBOL_REF_DECL (call_op); - case ';': -#ifndef HAVE_AS_IX86_REP_LOCK_PREFIX - putc (';', file); -#endif - return; + if (!flag_plt + || (symbol_decl != NULL_TREE + && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl)))) + return true; - case '~': - putc (TARGET_AVX2 ? 'i' : 'f', file); - return; + return false; +} - case 'M': - if (TARGET_X32) - { - /* NB: 32-bit indices in VSIB address are sign-extended - to 64 bits. In x32, if 32-bit address 0xf7fa3010 is - sign-extended to 0xfffffffff7fa3010 which is invalid - address. Add addr32 prefix if there is no base - register nor symbol. */ - bool ok; - struct ix86_address parts; - ok = ix86_decompose_address (x, &parts); - gcc_assert (ok && parts.index == NULL_RTX); - if (parts.base == NULL_RTX - && (parts.disp == NULL_RTX - || !symbolic_operand (parts.disp, - GET_MODE (parts.disp)))) - fputs ("addr32 ", file); - } - return; +/* Output indirect branch via a call and return thunk. CALL_OP is a + register which contains the branch target. XASM is the assembly + template for CALL_OP. Branch is a tail call if SIBCALL_P is true. + A normal call is converted to: - case '^': - if (TARGET_64BIT && Pmode != word_mode) - fputs ("addr32 ", file); - return; + call __x86_indirect_thunk_reg - case '!': - if (ix86_notrack_prefixed_insn_p (current_output_insn)) - fputs ("notrack ", file); - return; + and a tail call is converted to: - default: - output_operand_lossage ("invalid operand code '%c'", code); - } - } + jmp __x86_indirect_thunk_reg + */ - if (REG_P (x)) - print_reg (x, code, file); +static void +ix86_output_indirect_branch_via_reg (rtx call_op, bool sibcall_p) +{ + char thunk_name_buf[32]; + char *thunk_name; + enum indirect_thunk_prefix need_prefix + = indirect_thunk_need_prefix (current_output_insn); + int regno = REGNO (call_op); - else if (MEM_P (x)) + if (cfun->machine->indirect_branch_type + != indirect_branch_thunk_inline) { - rtx addr = XEXP (x, 0); - - /* No `byte ptr' prefix for call instructions ... 
*/ - if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P') + if (cfun->machine->indirect_branch_type == indirect_branch_thunk) { - machine_mode mode = GET_MODE (x); - const char *size; - - /* Check for explicit size override codes. */ - if (code == 'b') - size = "BYTE"; - else if (code == 'w') - size = "WORD"; - else if (code == 'k') - size = "DWORD"; - else if (code == 'q') - size = "QWORD"; - else if (code == 'x') - size = "XMMWORD"; - else if (code == 't') - size = "YMMWORD"; - else if (code == 'g') - size = "ZMMWORD"; - else if (mode == BLKmode) - /* ... or BLKmode operands, when not overridden. */ - size = NULL; - else - switch (GET_MODE_SIZE (mode)) - { - case 1: size = "BYTE"; break; - case 2: size = "WORD"; break; - case 4: size = "DWORD"; break; - case 8: size = "QWORD"; break; - case 12: size = "TBYTE"; break; - case 16: - if (mode == XFmode) - size = "TBYTE"; - else - size = "XMMWORD"; - break; - case 32: size = "YMMWORD"; break; - case 64: size = "ZMMWORD"; break; - default: - gcc_unreachable (); - } - if (size) - { - fputs (size, file); - fputs (" PTR ", file); - } + int i = regno; + if (i >= FIRST_REX_INT_REG) + i -= (FIRST_REX_INT_REG - LAST_INT_REG - 1); + indirect_thunks_used |= 1 << i; } + indirect_thunk_name (thunk_name_buf, regno, need_prefix, false); + thunk_name = thunk_name_buf; + } + else + thunk_name = NULL; - if (this_is_asm_operands && ! address_operand (addr, VOIDmode)) - output_operand_lossage ("invalid constraints for operand"); + if (sibcall_p) + { + if (thunk_name != NULL) + fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name); else - ix86_print_operand_address_as - (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P'); + output_indirect_thunk (regno); } - - else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode) + else { - long l; + if (thunk_name != NULL) + { + fprintf (asm_out_file, "\tcall\t%s\n", thunk_name); + return; + } - REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l); + char indirectlabel1[32]; + char indirectlabel2[32]; - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('$', file); - /* Sign extend 32bit SFmode immediate to 8 bytes. */ - if (code == 'q') - fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x", - (unsigned long long) (int) l); + ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, + INDIRECT_LABEL, + indirectlabelno++); + ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, + INDIRECT_LABEL, + indirectlabelno++); + + /* Jump. */ + fputs ("\tjmp\t", asm_out_file); + assemble_name_raw (asm_out_file, indirectlabel2); + fputc ('\n', asm_out_file); + + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1); + + if (thunk_name != NULL) + fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name); else - fprintf (file, "0x%08x", (unsigned int) l); + output_indirect_thunk (regno); + + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2); + + /* Call. */ + fputs ("\tcall\t", asm_out_file); + assemble_name_raw (asm_out_file, indirectlabel1); + fputc ('\n', asm_out_file); } +} - else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode) - { - long l[2]; +/* Output indirect branch via a call and return thunk. CALL_OP is + the branch target. XASM is the assembly template for CALL_OP. + Branch is a tail call if SIBCALL_P is true. 
A normal call is + converted to: - REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l); + jmp L2 + L1: + push CALL_OP + jmp __x86_indirect_thunk + L2: + call L1 - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('$', file); - fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff); - } + and a tail call is converted to: - /* These float cases don't actually occur as immediate operands. */ - else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode) - { - char dstr[30]; + push CALL_OP + jmp __x86_indirect_thunk + */ - real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1); - fputs (dstr, file); +static void +ix86_output_indirect_branch_via_push (rtx call_op, const char *xasm, + bool sibcall_p) +{ + char thunk_name_buf[32]; + char *thunk_name; + char push_buf[64]; + enum indirect_thunk_prefix need_prefix + = indirect_thunk_need_prefix (current_output_insn); + int regno = -1; + + if (cfun->machine->indirect_branch_type + != indirect_branch_thunk_inline) + { + if (cfun->machine->indirect_branch_type == indirect_branch_thunk) + indirect_thunk_needed = true; + indirect_thunk_name (thunk_name_buf, regno, need_prefix, false); + thunk_name = thunk_name_buf; } + else + thunk_name = NULL; + + snprintf (push_buf, sizeof (push_buf), "push{%c}\t%s", + TARGET_64BIT ? 'q' : 'l', xasm); + if (sibcall_p) + { + output_asm_insn (push_buf, &call_op); + if (thunk_name != NULL) + fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name); + else + output_indirect_thunk (regno); + } else { - /* We have patterns that allow zero sets of memory, for instance. - In 64-bit mode, we should probably support all 8-byte vectors, - since we can in fact encode that into an immediate. */ - if (GET_CODE (x) == CONST_VECTOR) - { - if (x != CONST0_RTX (GET_MODE (x))) - output_operand_lossage ("invalid vector immediate"); - x = const0_rtx; - } + char indirectlabel1[32]; + char indirectlabel2[32]; - if (code != 'P' && code != 'p') + ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, + INDIRECT_LABEL, + indirectlabelno++); + ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, + INDIRECT_LABEL, + indirectlabelno++); + + /* Jump. */ + fputs ("\tjmp\t", asm_out_file); + assemble_name_raw (asm_out_file, indirectlabel2); + fputc ('\n', asm_out_file); + + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1); + + /* An external function may be called via GOT, instead of PLT. */ + if (MEM_P (call_op)) { - if (CONST_INT_P (x)) - { - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('$', file); - } - else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF - || GET_CODE (x) == LABEL_REF) + struct ix86_address parts; + rtx addr = XEXP (call_op, 0); + if (ix86_decompose_address (addr, &parts) + && parts.base == stack_pointer_rtx) { - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('$', file); + /* Since call will adjust stack by -UNITS_PER_WORD, + we must convert "disp(stack, index, scale)" to + "disp+UNITS_PER_WORD(stack, index, scale)". 
*/ + if (parts.index) + { + addr = gen_rtx_MULT (Pmode, parts.index, + GEN_INT (parts.scale)); + addr = gen_rtx_PLUS (Pmode, stack_pointer_rtx, + addr); + } else - fputs ("OFFSET FLAT:", file); + addr = stack_pointer_rtx; + + rtx disp; + if (parts.disp != NULL_RTX) + disp = plus_constant (Pmode, parts.disp, + UNITS_PER_WORD); + else + disp = GEN_INT (UNITS_PER_WORD); + + addr = gen_rtx_PLUS (Pmode, addr, disp); + call_op = gen_rtx_MEM (GET_MODE (call_op), addr); } } - if (CONST_INT_P (x)) - fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); - else if (flag_pic || MACHOPIC_INDIRECT) - output_pic_addr_const (file, x, code); + + output_asm_insn (push_buf, &call_op); + + if (thunk_name != NULL) + fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name); else - output_addr_const (file, x); + output_indirect_thunk (regno); + + ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2); + + /* Call. */ + fputs ("\tcall\t", asm_out_file); + assemble_name_raw (asm_out_file, indirectlabel1); + fputc ('\n', asm_out_file); } } -static bool -ix86_print_operand_punct_valid_p (unsigned char code) -{ - return (code == '*' || code == '+' || code == '&' || code == ';' - || code == '~' || code == '^' || code == '!'); -} - -/* Print a memory operand whose address is ADDR. */ +/* Output indirect branch via a call and return thunk. CALL_OP is + the branch target. XASM is the assembly template for CALL_OP. + Branch is a tail call if SIBCALL_P is true. */ static void -ix86_print_operand_address_as (FILE *file, rtx addr, - addr_space_t as, bool no_rip) +ix86_output_indirect_branch (rtx call_op, const char *xasm, + bool sibcall_p) { - struct ix86_address parts; - rtx base, index, disp; - int scale; - int ok; - bool vsib = false; - int code = 0; - - if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR) - { - ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts); - gcc_assert (parts.index == NULL_RTX); - parts.index = XVECEXP (addr, 0, 1); - parts.scale = INTVAL (XVECEXP (addr, 0, 2)); - addr = XVECEXP (addr, 0, 0); - vsib = true; - } - else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR) - { - gcc_assert (TARGET_64BIT); - ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts); - code = 'q'; - } + if (REG_P (call_op)) + ix86_output_indirect_branch_via_reg (call_op, sibcall_p); else - ok = ix86_decompose_address (addr, &parts); + ix86_output_indirect_branch_via_push (call_op, xasm, sibcall_p); +} - gcc_assert (ok); +/* Output indirect jump. CALL_OP is the jump target. */ - base = parts.base; - index = parts.index; - disp = parts.disp; - scale = parts.scale; +const char * +ix86_output_indirect_jmp (rtx call_op) +{ + if (cfun->machine->indirect_branch_type != indirect_branch_keep) + { + /* We can't have red-zone since "call" in the indirect thunk + pushes the return address onto stack, destroying red-zone. */ + if (ix86_red_zone_size != 0) + gcc_unreachable (); - if (ADDR_SPACE_GENERIC_P (as)) - as = parts.seg; + ix86_output_indirect_branch (call_op, "%0", true); + return ""; + } else - gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg)); + return "%!jmp\t%A0"; +} - if (!ADDR_SPACE_GENERIC_P (as)) - { - if (ASSEMBLER_DIALECT == ASM_ATT) - putc ('%', file); +/* Output return instrumentation for current function if needed. 
*/ - switch (as) +static void +output_return_instrumentation (void) +{ + if (ix86_instrument_return != instrument_return_none + && flag_fentry + && !DECL_NO_INSTRUMENT_FUNCTION_ENTRY_EXIT (cfun->decl)) + { + if (ix86_flag_record_return) + fprintf (asm_out_file, "1:\n"); + switch (ix86_instrument_return) { - case ADDR_SPACE_SEG_FS: - fputs ("fs:", file); + case instrument_return_call: + fprintf (asm_out_file, "\tcall\t__return__\n"); break; - case ADDR_SPACE_SEG_GS: - fputs ("gs:", file); + case instrument_return_nop5: + /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */ + fprintf (asm_out_file, ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n"); + break; + case instrument_return_none: break; - default: - gcc_unreachable (); + } + + if (ix86_flag_record_return) + { + fprintf (asm_out_file, "\t.section __return_loc, \"a\",@progbits\n"); + fprintf (asm_out_file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long"); + fprintf (asm_out_file, "\t.previous\n"); } } +} - /* Use one byte shorter RIP relative addressing for 64bit mode. */ - if (TARGET_64BIT && !base && !index && !no_rip) +/* Output function return. CALL_OP is the jump target. Add a REP + prefix to RET if LONG_P is true and function return is kept. */ + +const char * +ix86_output_function_return (bool long_p) +{ + output_return_instrumentation (); + + if (cfun->machine->function_return_type != indirect_branch_keep) { - rtx symbol = disp; + char thunk_name[32]; + enum indirect_thunk_prefix need_prefix + = indirect_thunk_need_prefix (current_output_insn); - if (GET_CODE (disp) == CONST - && GET_CODE (XEXP (disp, 0)) == PLUS - && CONST_INT_P (XEXP (XEXP (disp, 0), 1))) - symbol = XEXP (XEXP (disp, 0), 0); + if (cfun->machine->function_return_type + != indirect_branch_thunk_inline) + { + bool need_thunk = (cfun->machine->function_return_type + == indirect_branch_thunk); + indirect_thunk_name (thunk_name, INVALID_REGNUM, need_prefix, + true); + indirect_return_needed |= need_thunk; + fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name); + } + else + output_indirect_thunk (INVALID_REGNUM); - if (GET_CODE (symbol) == LABEL_REF - || (GET_CODE (symbol) == SYMBOL_REF - && SYMBOL_REF_TLS_MODEL (symbol) == 0)) - base = pc_rtx; + return ""; } - if (!base && !index) + if (!long_p) + return "%!ret"; + + return "rep%; ret"; +} + +/* Output indirect function return. RET_OP is the function return + target. */ + +const char * +ix86_output_indirect_function_return (rtx ret_op) +{ + if (cfun->machine->function_return_type != indirect_branch_keep) { - /* Displacement only requires special attention. */ - if (CONST_INT_P (disp)) + char thunk_name[32]; + enum indirect_thunk_prefix need_prefix + = indirect_thunk_need_prefix (current_output_insn); + unsigned int regno = REGNO (ret_op); + gcc_assert (regno == CX_REG); + + if (cfun->machine->function_return_type + != indirect_branch_thunk_inline) { - if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as)) - fputs ("ds:", file); - fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp)); + bool need_thunk = (cfun->machine->function_return_type + == indirect_branch_thunk); + indirect_thunk_name (thunk_name, regno, need_prefix, true); + + if (need_thunk) + { + indirect_return_via_cx = true; + indirect_thunks_used |= 1 << CX_REG; + } + fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name); } - /* Load the external function address via the GOT slot to avoid PLT. 
*/ - else if (GET_CODE (disp) == CONST - && GET_CODE (XEXP (disp, 0)) == UNSPEC - && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL - || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT) - && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0))) - output_pic_addr_const (file, disp, 0); - else if (flag_pic) - output_pic_addr_const (file, disp, 0); else - output_addr_const (file, disp); + output_indirect_thunk (regno); + + return ""; } else + return "%!jmp\t%A0"; +} + +/* Output the assembly for a call instruction. */ + +const char * +ix86_output_call_insn (rtx_insn *insn, rtx call_op) +{ + bool direct_p = constant_call_address_operand (call_op, VOIDmode); + bool output_indirect_p + = (!TARGET_SEH + && cfun->machine->indirect_branch_type != indirect_branch_keep); + bool seh_nop_p = false; + const char *xasm; + + if (SIBLING_CALL_P (insn)) { - /* Print SImode register names to force addr32 prefix. */ - if (SImode_address_operand (addr, VOIDmode)) + output_return_instrumentation (); + if (direct_p) { - if (flag_checking) + if (ix86_nopic_noplt_attribute_p (call_op)) { - gcc_assert (TARGET_64BIT); - switch (GET_CODE (addr)) + direct_p = false; + if (TARGET_64BIT) { - case SUBREG: - gcc_assert (GET_MODE (addr) == SImode); - gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode); - break; - case ZERO_EXTEND: - case AND: - gcc_assert (GET_MODE (addr) == DImode); - break; - default: - gcc_unreachable (); + if (output_indirect_p) + xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; + else + xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; + } + else + { + if (output_indirect_p) + xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}"; + else + xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}"; } } - gcc_assert (!code); - code = 'k'; + else + xasm = "%!jmp\t%P0"; } - else if (code == 0 - && TARGET_X32 - && disp - && CONST_INT_P (disp) - && INTVAL (disp) < -16*1024*1024) + /* SEH epilogue detection requires the indirect branch case + to include REX.W. */ + else if (TARGET_SEH) + xasm = "%!rex.W jmp\t%A0"; + else { - /* X32 runs in 64-bit mode, where displacement, DISP, in - address DISP(%r64), is encoded as 32-bit immediate sign- - extended from 32-bit to 64-bit. For -0x40000300(%r64), - address is %r64 + 0xffffffffbffffd00. When %r64 < - 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64, - which is invalid for x32. The correct address is %r64 - - 0x40000300 == 0xf7ffdd64. To properly encode - -0x40000300(%r64) for x32, we zero-extend negative - displacement by forcing addr32 prefix which truncates - 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should - zero-extend all negative displacements, including -1(%rsp). - However, for small negative displacements, sign-extension - won't cause overflow. We only zero-extend negative - displacements if they < -16*1024*1024, which is also used - to check legitimate address displacements for PIC. */ - code = 'k'; + if (output_indirect_p) + xasm = "%0"; + else + xasm = "%!jmp\t%A0"; } - /* Since the upper 32 bits of RSP are always zero for x32, - we can encode %esp as %rsp to avoid 0x67 prefix if - there is no index register. */ - if (TARGET_X32 && Pmode == SImode - && !index && base && REG_P (base) && REGNO (base) == SP_REG) - code = 'q'; + if (output_indirect_p && !direct_p) + ix86_output_indirect_branch (call_op, xasm, true); + else + output_asm_insn (xasm, &call_op); + return ""; + } - if (ASSEMBLER_DIALECT == ASM_ATT) + /* SEH unwinding can require an extra nop to be emitted in several + circumstances. Determine if we have one of those. 
*/ + if (TARGET_SEH) + { + rtx_insn *i; + + for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i)) { - if (disp) + /* Prevent a catch region from being adjacent to a jump that would + be interpreted as an epilogue sequence by the unwinder. */ + if (JUMP_P(i) && CROSSING_JUMP_P (i)) { - if (flag_pic) - output_pic_addr_const (file, disp, 0); - else if (GET_CODE (disp) == LABEL_REF) - output_asm_label (disp); - else - output_addr_const (file, disp); + seh_nop_p = true; + break; } + + /* If we get to another real insn, we don't need the nop. */ + if (INSN_P (i)) + break; - putc ('(', file); - if (base) - print_reg (base, code, file); - if (index) + /* If we get to the epilogue note, prevent a catch region from + being adjacent to the standard epilogue sequence. If non- + call-exceptions, we'll have done this during epilogue emission. */ + if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG + && !flag_non_call_exceptions + && !can_throw_internal (insn)) { - putc (',', file); - print_reg (index, vsib ? 0 : code, file); - if (scale != 1 || vsib) - fprintf (file, ",%d", scale); + seh_nop_p = true; + break; } - putc (')', file); } - else - { - rtx offset = NULL_RTX; - if (disp) - { - /* Pull out the offset of a symbol; print any symbol itself. */ - if (GET_CODE (disp) == CONST - && GET_CODE (XEXP (disp, 0)) == PLUS - && CONST_INT_P (XEXP (XEXP (disp, 0), 1))) - { - offset = XEXP (XEXP (disp, 0), 1); - disp = gen_rtx_CONST (VOIDmode, - XEXP (XEXP (disp, 0), 0)); - } + /* If we didn't find a real insn following the call, prevent the + unwinder from looking into the next function. */ + if (i == NULL) + seh_nop_p = true; + } - if (flag_pic) - output_pic_addr_const (file, disp, 0); - else if (GET_CODE (disp) == LABEL_REF) - output_asm_label (disp); - else if (CONST_INT_P (disp)) - offset = disp; - else - output_addr_const (file, disp); - } - - putc ('[', file); - if (base) + if (direct_p) + { + if (ix86_nopic_noplt_attribute_p (call_op)) + { + direct_p = false; + if (TARGET_64BIT) { - print_reg (base, code, file); - if (offset) - { - if (INTVAL (offset) >= 0) - putc ('+', file); - fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset)); - } + if (output_indirect_p) + xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; + else + xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; } - else if (offset) - fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset)); else - putc ('0', file); - - if (index) { - putc ('+', file); - print_reg (index, vsib ? 0 : code, file); - if (scale != 1 || vsib) - fprintf (file, "*%d", scale); + if (output_indirect_p) + xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}"; + else + xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}"; } - putc (']', file); } + else + xasm = "%!call\t%P0"; + } + else + { + if (output_indirect_p) + xasm = "%0"; + else + xasm = "%!call\t%A0"; } -} -static void -ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr) -{ - ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false); + if (output_indirect_p && !direct_p) + ix86_output_indirect_branch (call_op, xasm, false); + else + output_asm_insn (xasm, &call_op); + + if (seh_nop_p) + return "nop"; + + return ""; } + +/* Return a MEM corresponding to a stack slot with mode MODE. + Allocate a new slot if necessary. -/* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */ + The RTL for a function can have several slots available: N is + which slot to use. 
*/ -static bool -i386_asm_output_addr_const_extra (FILE *file, rtx x) +rtx +assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n) { - rtx op; + struct stack_local_entry *s; - if (GET_CODE (x) != UNSPEC) - return false; + gcc_assert (n < MAX_386_STACK_LOCALS); - op = XVECEXP (x, 0, 0); - switch (XINT (x, 1)) - { - case UNSPEC_GOTOFF: - output_addr_const (file, op); - fputs ("@gotoff", file); - break; - case UNSPEC_GOTTPOFF: - output_addr_const (file, op); - /* FIXME: This might be @TPOFF in Sun ld. */ - fputs ("@gottpoff", file); - break; - case UNSPEC_TPOFF: - output_addr_const (file, op); - fputs ("@tpoff", file); - break; - case UNSPEC_NTPOFF: - output_addr_const (file, op); - if (TARGET_64BIT) - fputs ("@tpoff", file); - else - fputs ("@ntpoff", file); - break; - case UNSPEC_DTPOFF: - output_addr_const (file, op); - fputs ("@dtpoff", file); - break; - case UNSPEC_GOTNTPOFF: - output_addr_const (file, op); - if (TARGET_64BIT) - fputs (ASSEMBLER_DIALECT == ASM_ATT ? - "@gottpoff(%rip)" : "@gottpoff[rip]", file); - else - fputs ("@gotntpoff", file); - break; - case UNSPEC_INDNTPOFF: - output_addr_const (file, op); - fputs ("@indntpoff", file); - break; -#if TARGET_MACHO - case UNSPEC_MACHOPIC_OFFSET: - output_addr_const (file, op); - putc ('-', file); - machopic_output_function_base_name (file); - break; -#endif + for (s = ix86_stack_locals; s; s = s->next) + if (s->mode == mode && s->n == n) + return validize_mem (copy_rtx (s->rtl)); - default: - return false; - } + s = ggc_alloc (); + s->n = n; + s->mode = mode; + s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0); - return true; + s->next = ix86_stack_locals; + ix86_stack_locals = s; + return validize_mem (copy_rtx (s->rtl)); } - -/* Split one or more double-mode RTL references into pairs of half-mode - references. The RTL can be REG, offsettable MEM, integer constant, or - CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to - split and "num" is its length. lo_half and hi_half are output arrays - that parallel "operands". */ -void -split_double_mode (machine_mode mode, rtx operands[], - int num, rtx lo_half[], rtx hi_half[]) +static void +ix86_instantiate_decls (void) { - machine_mode half_mode; - unsigned int byte; + struct stack_local_entry *s; - switch (mode) - { - case E_TImode: - half_mode = DImode; - break; - case E_DImode: - half_mode = SImode; - break; - default: - gcc_unreachable (); - } + for (s = ix86_stack_locals; s; s = s->next) + if (s->rtl != NULL_RTX) + instantiate_decl_rtl (s->rtl); +} + +/* Check whether x86 address PARTS is a pc-relative address. */ - byte = GET_MODE_SIZE (half_mode); +bool +ix86_rip_relative_addr_p (struct ix86_address *parts) +{ + rtx base, index, disp; - while (num--) - { - rtx op = operands[num]; + base = parts->base; + index = parts->index; + disp = parts->disp; - /* simplify_subreg refuse to split volatile memory addresses, - but we still have to handle it. */ - if (MEM_P (op)) - { - lo_half[num] = adjust_address (op, half_mode, 0); - hi_half[num] = adjust_address (op, half_mode, byte); - } - else + if (disp && !base && !index) + { + if (TARGET_64BIT) { - lo_half[num] = simplify_gen_subreg (half_mode, op, - GET_MODE (op) == VOIDmode - ? mode : GET_MODE (op), 0); - hi_half[num] = simplify_gen_subreg (half_mode, op, - GET_MODE (op) == VOIDmode - ? 
mode : GET_MODE (op), byte); + rtx symbol = disp; + + if (GET_CODE (disp) == CONST) + symbol = XEXP (disp, 0); + if (GET_CODE (symbol) == PLUS + && CONST_INT_P (XEXP (symbol, 1))) + symbol = XEXP (symbol, 0); + + if (GET_CODE (symbol) == LABEL_REF + || (GET_CODE (symbol) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (symbol) == 0) + || (GET_CODE (symbol) == UNSPEC + && (XINT (symbol, 1) == UNSPEC_GOTPCREL + || XINT (symbol, 1) == UNSPEC_PCREL + || XINT (symbol, 1) == UNSPEC_GOTNTPOFF))) + return true; } } + return false; } - -/* Output code to perform a 387 binary operation in INSN, one of PLUS, - MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3] - is the expression of the binary operation. The output may either be - emitted here, or returned to the caller, like all output_* functions. - - There is no guarantee that the operands are the same mode, as they - might be within FLOAT or FLOAT_EXTEND expressions. */ -#ifndef SYSV386_COMPAT -/* Set to 1 for compatibility with brain-damaged assemblers. No-one - wants to fix the assemblers because that causes incompatibility - with gcc. No-one wants to fix gcc because that causes - incompatibility with assemblers... You can use the option of - -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */ -#define SYSV386_COMPAT 1 -#endif +/* Calculate the length of the memory address in the instruction encoding. + Includes addr32 prefix, does not include the one-byte modrm, opcode, + or other prefixes. We never generate addr32 prefix for LEA insn. */ -const char * -output_387_binary_op (rtx_insn *insn, rtx *operands) +int +memory_address_length (rtx addr, bool lea) { - static char buf[40]; - const char *p; - bool is_sse - = (SSE_REG_P (operands[0]) - || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2])); - - if (is_sse) - p = "%v"; - else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT - || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT) - p = "fi"; - else - p = "f"; + struct ix86_address parts; + rtx base, index, disp; + int len; + int ok; - strcpy (buf, p); + if (GET_CODE (addr) == PRE_DEC + || GET_CODE (addr) == POST_INC + || GET_CODE (addr) == PRE_MODIFY + || GET_CODE (addr) == POST_MODIFY) + return 0; - switch (GET_CODE (operands[3])) - { - case PLUS: - p = "add"; break; - case MINUS: - p = "sub"; break; - case MULT: - p = "mul"; break; - case DIV: - p = "div"; break; - default: - gcc_unreachable (); - } + ok = ix86_decompose_address (addr, &parts); + gcc_assert (ok); - strcat (buf, p); + len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1; - if (is_sse) - { - p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd"; - strcat (buf, p); + /* If this is not LEA instruction, add the length of addr32 prefix. */ + if (TARGET_64BIT && !lea + && (SImode_address_operand (addr, VOIDmode) + || (parts.base && GET_MODE (parts.base) == SImode) + || (parts.index && GET_MODE (parts.index) == SImode))) + len++; - if (TARGET_AVX) - p = "\t{%2, %1, %0|%0, %1, %2}"; - else - p = "\t{%2, %0|%0, %2}"; + base = parts.base; + index = parts.index; + disp = parts.disp; - strcat (buf, p); - return buf; - } + if (base && SUBREG_P (base)) + base = SUBREG_REG (base); + if (index && SUBREG_P (index)) + index = SUBREG_REG (index); - /* Even if we do not want to check the inputs, this documents input - constraints. Which helps in understanding the following code. 
*/ - if (flag_checking) + gcc_assert (base == NULL_RTX || REG_P (base)); + gcc_assert (index == NULL_RTX || REG_P (index)); + + /* Rule of thumb: + - esp as the base always wants an index, + - ebp as the base always wants a displacement, + - r12 as the base always wants an index, + - r13 as the base always wants a displacement. */ + + /* Register Indirect. */ + if (base && !index && !disp) { - if (STACK_REG_P (operands[0]) - && ((REG_P (operands[1]) - && REGNO (operands[0]) == REGNO (operands[1]) - && (STACK_REG_P (operands[2]) || MEM_P (operands[2]))) - || (REG_P (operands[2]) - && REGNO (operands[0]) == REGNO (operands[2]) - && (STACK_REG_P (operands[1]) || MEM_P (operands[1])))) - && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2]))) - ; /* ok */ - else - gcc_unreachable (); + /* esp (for its index) and ebp (for its displacement) need + the two-byte modrm form. Similarly for r12 and r13 in 64-bit + code. */ + if (base == arg_pointer_rtx + || base == frame_pointer_rtx + || REGNO (base) == SP_REG + || REGNO (base) == BP_REG + || REGNO (base) == R12_REG + || REGNO (base) == R13_REG) + len++; } - switch (GET_CODE (operands[3])) + /* Direct Addressing. In 64-bit mode mod 00 r/m 5 + is not disp32, but disp32(%rip), so for disp32 + SIB byte is needed, unless print_operand_address + optimizes it into disp32(%rip) or (%rip) is implied + by UNSPEC. */ + else if (disp && !base && !index) { - case MULT: - case PLUS: - if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2])) - std::swap (operands[1], operands[2]); - - /* know operands[0] == operands[1]. */ - - if (MEM_P (operands[2])) - { - p = "%Z2\t%2"; - break; - } - - if (find_regno_note (insn, REG_DEAD, REGNO (operands[2]))) + len += 4; + if (!ix86_rip_relative_addr_p (&parts)) + len++; + } + else + { + /* Find the length of the displacement constant. */ + if (disp) { - if (STACK_TOP_P (operands[0])) - /* How is it that we are storing to a dead operand[2]? - Well, presumably operands[1] is dead too. We can't - store the result to st(0) as st(0) gets popped on this - instruction. Instead store to operands[2] (which I - think has to be st(1)). st(1) will be popped later. - gcc <= 2.8.1 didn't have this check and generated - assembly code that the Unixware assembler rejected. */ - p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */ + if (base && satisfies_constraint_K (disp)) + len += 1; else - p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */ - break; + len += 4; } + /* ebp always wants a displacement. Similarly r13. */ + else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG)) + len++; - if (STACK_TOP_P (operands[0])) - p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */ - else - p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */ - break; + /* An index requires the two-byte modrm form.... */ + if (index + /* ...like esp (or r12), which always wants an index. */ + || base == arg_pointer_rtx + || base == frame_pointer_rtx + || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG))) + len++; + } - case MINUS: - case DIV: - if (MEM_P (operands[1])) - { - p = "r%Z1\t%1"; - break; - } + return len; +} - if (MEM_P (operands[2])) - { - p = "%Z2\t%2"; - break; - } +/* Compute default value for "length_immediate" attribute. When SHORTFORM + is set, expect that insn have 8bit immediate alternative. 
*/ +int +ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform) +{ + int len = 0; + int i; + extract_insn_cached (insn); + for (i = recog_data.n_operands - 1; i >= 0; --i) + if (CONSTANT_P (recog_data.operand[i])) + { + enum attr_mode mode = get_attr_mode (insn); - if (find_regno_note (insn, REG_DEAD, REGNO (operands[2]))) - { -#if SYSV386_COMPAT - /* The SystemV/386 SVR3.2 assembler, and probably all AT&T - derived assemblers, confusingly reverse the direction of - the operation for fsub{r} and fdiv{r} when the - destination register is not st(0). The Intel assembler - doesn't have this brain damage. Read !SYSV386_COMPAT to - figure out what the hardware really does. */ - if (STACK_TOP_P (operands[0])) - p = "{p\t%0, %2|rp\t%2, %0}"; - else - p = "{rp\t%2, %0|p\t%0, %2}"; -#else - if (STACK_TOP_P (operands[0])) - /* As above for fmul/fadd, we can't store to st(0). */ - p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */ - else - p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */ -#endif - break; + gcc_assert (!len); + if (shortform && CONST_INT_P (recog_data.operand[i])) + { + HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]); + switch (mode) + { + case MODE_QI: + len = 1; + continue; + case MODE_HI: + ival = trunc_int_for_mode (ival, HImode); + break; + case MODE_SI: + ival = trunc_int_for_mode (ival, SImode); + break; + default: + break; + } + if (IN_RANGE (ival, -128, 127)) + { + len = 1; + continue; + } + } + switch (mode) + { + case MODE_QI: + len = 1; + break; + case MODE_HI: + len = 2; + break; + case MODE_SI: + len = 4; + break; + /* Immediates for DImode instructions are encoded + as 32bit sign extended values. */ + case MODE_DI: + len = 4; + break; + default: + fatal_insn ("unknown insn mode", insn); } + } + return len; +} - if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) - { -#if SYSV386_COMPAT - if (STACK_TOP_P (operands[0])) - p = "{rp\t%0, %1|p\t%1, %0}"; - else - p = "{p\t%1, %0|rp\t%0, %1}"; -#else - if (STACK_TOP_P (operands[0])) - p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */ - else - p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */ -#endif - break; - } +/* Compute default value for "length_address" attribute. 
*/ +int +ix86_attr_length_address_default (rtx_insn *insn) +{ + int i; - if (STACK_TOP_P (operands[0])) + if (get_attr_type (insn) == TYPE_LEA) + { + rtx set = PATTERN (insn), addr; + + if (GET_CODE (set) == PARALLEL) + set = XVECEXP (set, 0, 0); + + gcc_assert (GET_CODE (set) == SET); + + addr = SET_SRC (set); + + return memory_address_length (addr, true); + } + + extract_insn_cached (insn); + for (i = recog_data.n_operands - 1; i >= 0; --i) + { + rtx op = recog_data.operand[i]; + if (MEM_P (op)) { - if (STACK_TOP_P (operands[1])) - p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */ - else - p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */ - break; - } - else if (STACK_TOP_P (operands[1])) - { -#if SYSV386_COMPAT - p = "{\t%1, %0|r\t%0, %1}"; -#else - p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */ -#endif - } - else - { -#if SYSV386_COMPAT - p = "{r\t%2, %0|\t%0, %2}"; -#else - p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */ -#endif - } - break; + constrain_operands_cached (insn, reload_completed); + if (which_alternative != -1) + { + const char *constraints = recog_data.constraints[i]; + int alt = which_alternative; - default: - gcc_unreachable (); + while (*constraints == '=' || *constraints == '+') + constraints++; + while (alt-- > 0) + while (*constraints++ != ',') + ; + /* Skip ignored operands. */ + if (*constraints == 'X') + continue; + } + + int len = memory_address_length (XEXP (op, 0), false); + + /* Account for segment prefix for non-default addr spaces. */ + if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op))) + len++; + + return len; + } } + return 0; +} - strcat (buf, p); - return buf; +/* Compute default value for "length_vex" attribute. It includes + 2 or 3 byte VEX prefix and 1 opcode byte. */ + +int +ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode, + bool has_vex_w) +{ + int i; + + /* Only 0f opcode can use 2 byte VEX prefix and VEX W bit uses 3 + byte VEX prefix. */ + if (!has_0f_opcode || has_vex_w) + return 3 + 1; + + /* We can always use 2 byte VEX prefix in 32bit. */ + if (!TARGET_64BIT) + return 2 + 1; + + extract_insn_cached (insn); + + for (i = recog_data.n_operands - 1; i >= 0; --i) + if (REG_P (recog_data.operand[i])) + { + /* REX.W bit uses 3 byte VEX prefix. */ + if (GET_MODE (recog_data.operand[i]) == DImode + && GENERAL_REG_P (recog_data.operand[i])) + return 3 + 1; + } + else + { + /* REX.X or REX.B bits use 3 byte VEX prefix. */ + if (MEM_P (recog_data.operand[i]) + && x86_extended_reg_mentioned_p (recog_data.operand[i])) + return 3 + 1; + } + + return 2 + 1; } + -/* Return needed mode for entity in optimize_mode_switching pass. */ +static bool +ix86_class_likely_spilled_p (reg_class_t); -static int -ix86_dirflag_mode_needed (rtx_insn *insn) +/* Returns true if lhs of insn is HW function argument register and set up + is_spilled to true if it is likely spilled HW register. */ +static bool +insn_is_function_arg (rtx insn, bool* is_spilled) { + rtx dst; + + if (!NONDEBUG_INSN_P (insn)) + return false; + /* Call instructions are not movable, ignore it. */ if (CALL_P (insn)) + return false; + insn = PATTERN (insn); + if (GET_CODE (insn) == PARALLEL) + insn = XVECEXP (insn, 0, 0); + if (GET_CODE (insn) != SET) + return false; + dst = SET_DEST (insn); + if (REG_P (dst) && HARD_REGISTER_P (dst) + && ix86_function_arg_regno_p (REGNO (dst))) { - if (cfun->machine->func_type == TYPE_NORMAL) - return X86_DIRFLAG_ANY; - else - /* No need to emit CLD in interrupt handler for TARGET_CLD. */ - return TARGET_CLD ? 
X86_DIRFLAG_ANY : X86_DIRFLAG_RESET; + /* Is it likely spilled HW register? */ + if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst)) + && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst)))) + *is_spilled = true; + return true; } + return false; +} - if (recog_memoized (insn) < 0) - return X86_DIRFLAG_ANY; +/* Add output dependencies for chain of function adjacent arguments if only + there is a move to likely spilled HW register. Return first argument + if at least one dependence was added or NULL otherwise. */ +static rtx_insn * +add_parameter_dependencies (rtx_insn *call, rtx_insn *head) +{ + rtx_insn *insn; + rtx_insn *last = call; + rtx_insn *first_arg = NULL; + bool is_spilled = false; - if (get_attr_type (insn) == TYPE_STR) + head = PREV_INSN (head); + + /* Find nearest to call argument passing instruction. */ + while (true) { - /* Emit cld instruction if stringops are used in the function. */ - if (cfun->machine->func_type == TYPE_NORMAL) - return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY; - else - return X86_DIRFLAG_RESET; + last = PREV_INSN (last); + if (last == head) + return NULL; + if (!NONDEBUG_INSN_P (last)) + continue; + if (insn_is_function_arg (last, &is_spilled)) + break; + return NULL; } - return X86_DIRFLAG_ANY; + first_arg = last; + while (true) + { + insn = PREV_INSN (last); + if (!INSN_P (insn)) + break; + if (insn == head) + break; + if (!NONDEBUG_INSN_P (insn)) + { + last = insn; + continue; + } + if (insn_is_function_arg (insn, &is_spilled)) + { + /* Add output depdendence between two function arguments if chain + of output arguments contains likely spilled HW registers. */ + if (is_spilled) + add_dependence (first_arg, insn, REG_DEP_OUTPUT); + first_arg = last = insn; + } + else + break; + } + if (!is_spilled) + return NULL; + return first_arg; } -/* Check if a 256bit or 512 bit AVX register is referenced inside of EXP. */ - -static bool -ix86_check_avx_upper_register (const_rtx exp) +/* Add output or anti dependency from insn to first_arg to restrict its code + motion. */ +static void +avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn) { - return SSE_REG_P (exp) && GET_MODE_BITSIZE (GET_MODE (exp)) > 128; -} + rtx set; + rtx tmp; -/* Return needed mode for entity in optimize_mode_switching pass. */ + set = single_set (insn); + if (!set) + return; + tmp = SET_DEST (set); + if (REG_P (tmp)) + { + /* Add output dependency to the first function argument. */ + add_dependence (first_arg, insn, REG_DEP_OUTPUT); + return; + } + /* Add anti dependency. */ + add_dependence (first_arg, insn, REG_DEP_ANTI); +} -static int -ix86_avx_u128_mode_needed (rtx_insn *insn) +/* Avoid cross block motion of function argument through adding dependency + from the first non-jump instruction in bb. */ +static void +add_dependee_for_func_arg (rtx_insn *arg, basic_block bb) { - if (CALL_P (insn)) - { - rtx link; + rtx_insn *insn = BB_END (bb); - /* Needed mode is set to AVX_U128_CLEAN if there are - no 256bit or 512bit modes used in function arguments. 
*/ - for (link = CALL_INSN_FUNCTION_USAGE (insn); - link; - link = XEXP (link, 1)) + while (insn) + { + if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn)) { - if (GET_CODE (XEXP (link, 0)) == USE) + rtx set = single_set (insn); + if (set) { - rtx arg = XEXP (XEXP (link, 0), 0); - - if (ix86_check_avx_upper_register (arg)) - return AVX_U128_DIRTY; + avoid_func_arg_motion (arg, insn); + return; } } - - return AVX_U128_CLEAN; + if (insn == BB_HEAD (bb)) + return; + insn = PREV_INSN (insn); } - - /* Require DIRTY mode if a 256bit or 512bit AVX register is referenced. - Hardware changes state only when a 256bit register is written to, - but we need to prevent the compiler from moving optimal insertion - point above eventual read from 256bit or 512 bit register. */ - subrtx_iterator::array_type array; - FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST) - if (ix86_check_avx_upper_register (*iter)) - return AVX_U128_DIRTY; - - return AVX_U128_ANY; } -/* Return mode that i387 must be switched into - prior to the execution of insn. */ - -static int -ix86_i387_mode_needed (int entity, rtx_insn *insn) +/* Hook for pre-reload schedule - avoid motion of function arguments + passed in likely spilled HW registers. */ +static void +ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail) { - enum attr_i387_cw mode; - - /* The mode UNINITIALIZED is used to store control word after a - function call or ASM pattern. The mode ANY specify that function - has no requirements on the control word and make no changes in the - bits we are interested in. */ - - if (CALL_P (insn) - || (NONJUMP_INSN_P (insn) - && (asm_noperands (PATTERN (insn)) >= 0 - || GET_CODE (PATTERN (insn)) == ASM_INPUT))) - return I387_CW_UNINITIALIZED; - - if (recog_memoized (insn) < 0) - return I387_CW_ANY; + rtx_insn *insn; + rtx_insn *first_arg = NULL; + if (reload_completed) + return; + while (head != tail && DEBUG_INSN_P (head)) + head = NEXT_INSN (head); + for (insn = tail; insn != head; insn = PREV_INSN (insn)) + if (INSN_P (insn) && CALL_P (insn)) + { + first_arg = add_parameter_dependencies (insn, head); + if (first_arg) + { + /* Add dependee for first argument to predecessors if only + region contains more than one block. */ + basic_block bb = BLOCK_FOR_INSN (insn); + int rgn = CONTAINING_RGN (bb->index); + int nr_blks = RGN_NR_BLOCKS (rgn); + /* Skip trivial regions and region head blocks that can have + predecessors outside of region. */ + if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0) + { + edge e; + edge_iterator ei; - mode = get_attr_i387_cw (insn); + /* Regions are SCCs with the exception of selective + scheduling with pipelining of outer blocks enabled. + So also check that immediate predecessors of a non-head + block are in the same region. */ + FOR_EACH_EDGE (e, ei, bb->preds) + { + /* Avoid creating of loop-carried dependencies through + using topological ordering in the region. */ + if (rgn == CONTAINING_RGN (e->src->index) + && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index)) + add_dependee_for_func_arg (first_arg, e->src); + } + } + insn = first_arg; + if (insn == head) + break; + } + } + else if (first_arg) + avoid_func_arg_motion (first_arg, insn); +} - switch (entity) - { - case I387_TRUNC: - if (mode == I387_CW_TRUNC) - return mode; - break; +/* Hook for pre-reload schedule - set priority of moves from likely spilled + HW registers to maximum, to schedule them at soon as possible. 
These are + moves from function argument registers at the top of the function entry + and moves from function return value registers after call. */ +static int +ix86_adjust_priority (rtx_insn *insn, int priority) +{ + rtx set; - case I387_FLOOR: - if (mode == I387_CW_FLOOR) - return mode; - break; + if (reload_completed) + return priority; - case I387_CEIL: - if (mode == I387_CW_CEIL) - return mode; - break; + if (!NONDEBUG_INSN_P (insn)) + return priority; - default: - gcc_unreachable (); + set = single_set (insn); + if (set) + { + rtx tmp = SET_SRC (set); + if (REG_P (tmp) + && HARD_REGISTER_P (tmp) + && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp)) + && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp)))) + return current_sched_info->sched_max_insns_priority; } - return I387_CW_ANY; + return priority; } -/* Return mode that entity must be switched into - prior to the execution of insn. */ - -static int -ix86_mode_needed (int entity, rtx_insn *insn) +/* Prepare for scheduling pass. */ +static void +ix86_sched_init_global (FILE *, int, int) { - switch (entity) + /* Install scheduling hooks for current CPU. Some of these hooks are used + in time-critical parts of the scheduler, so we only set them up when + they are actually used. */ + switch (ix86_tune) { - case X86_DIRFLAG: - return ix86_dirflag_mode_needed (insn); - case AVX_U128: - return ix86_avx_u128_mode_needed (insn); - case I387_TRUNC: - case I387_FLOOR: - case I387_CEIL: - return ix86_i387_mode_needed (entity, insn); + case PROCESSOR_CORE2: + case PROCESSOR_NEHALEM: + case PROCESSOR_SANDYBRIDGE: + case PROCESSOR_HASWELL: + case PROCESSOR_GENERIC: + /* Do not perform multipass scheduling for pre-reload schedule + to save compile time. */ + if (reload_completed) + { + ix86_core2i7_init_hooks (); + break; + } + /* Fall through. */ default: - gcc_unreachable (); + targetm.sched.dfa_post_advance_cycle = NULL; + targetm.sched.first_cycle_multipass_init = NULL; + targetm.sched.first_cycle_multipass_begin = NULL; + targetm.sched.first_cycle_multipass_issue = NULL; + targetm.sched.first_cycle_multipass_backtrack = NULL; + targetm.sched.first_cycle_multipass_end = NULL; + targetm.sched.first_cycle_multipass_fini = NULL; + break; } - return 0; } -/* Check if a 256bit or 512bit AVX register is referenced in stores. */ - -static void -ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data) - { - if (ix86_check_avx_upper_register (dest)) - { - bool *used = (bool *) data; - *used = true; - } - } - -/* Calculate mode of upper 128bit AVX registers after the insn. */ + +/* Implement TARGET_STATIC_RTX_ALIGNMENT. */ -static int -ix86_avx_u128_mode_after (int mode, rtx_insn *insn) +static HOST_WIDE_INT +ix86_static_rtx_alignment (machine_mode mode) { - rtx pat = PATTERN (insn); + if (mode == DFmode) + return 64; + if (ALIGN_MODE_128 (mode)) + return MAX (128, GET_MODE_ALIGNMENT (mode)); + return GET_MODE_ALIGNMENT (mode); +} - if (vzeroupper_pattern (pat, VOIDmode) - || vzeroall_pattern (pat, VOIDmode)) - return AVX_U128_CLEAN; +/* Implement TARGET_CONSTANT_ALIGNMENT. */ - /* We know that state is clean after CALL insn if there are no - 256bit or 512bit registers used in the function return register. 
*/ - if (CALL_P (insn)) +static HOST_WIDE_INT +ix86_constant_alignment (const_tree exp, HOST_WIDE_INT align) +{ + if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST + || TREE_CODE (exp) == INTEGER_CST) { - bool avx_upper_reg_found = false; - note_stores (pat, ix86_check_avx_upper_stores, &avx_upper_reg_found); - - return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN; + machine_mode mode = TYPE_MODE (TREE_TYPE (exp)); + HOST_WIDE_INT mode_align = ix86_static_rtx_alignment (mode); + return MAX (mode_align, align); } + else if (!optimize_size && TREE_CODE (exp) == STRING_CST + && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD) + return BITS_PER_WORD; - /* Otherwise, return current mode. Remember that if insn - references AVX 256bit or 512bit registers, the mode was already - changed to DIRTY from MODE_NEEDED. */ - return mode; + return align; } -/* Return the mode that an insn results in. */ +/* Implement TARGET_EMPTY_RECORD_P. */ -static int -ix86_mode_after (int entity, int mode, rtx_insn *insn) +static bool +ix86_is_empty_record (const_tree type) { - switch (entity) - { - case X86_DIRFLAG: - return mode; - case AVX_U128: - return ix86_avx_u128_mode_after (mode, insn); - case I387_TRUNC: - case I387_FLOOR: - case I387_CEIL: - return mode; - default: - gcc_unreachable (); - } + if (!TARGET_64BIT) + return false; + return default_is_empty_record (type); } -static int -ix86_dirflag_mode_entry (void) -{ - /* For TARGET_CLD or in the interrupt handler we can't assume - direction flag state at function entry. */ - if (TARGET_CLD - || cfun->machine->func_type != TYPE_NORMAL) - return X86_DIRFLAG_ANY; - - return X86_DIRFLAG_RESET; -} +/* Implement TARGET_WARN_PARAMETER_PASSING_ABI. */ -static int -ix86_avx_u128_mode_entry (void) +static void +ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type) { - tree arg; - - /* Entry mode is set to AVX_U128_DIRTY if there are - 256bit or 512bit modes used in function arguments. */ - for (arg = DECL_ARGUMENTS (current_function_decl); arg; - arg = TREE_CHAIN (arg)) - { - rtx incoming = DECL_INCOMING_RTL (arg); + CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); - if (incoming && ix86_check_avx_upper_register (incoming)) - return AVX_U128_DIRTY; - } + if (!cum->warn_empty) + return; - return AVX_U128_CLEAN; -} + if (!TYPE_EMPTY_P (type)) + return; -/* Return a mode that ENTITY is assumed to be - switched to at function entry. */ + /* Don't warn if the function isn't visible outside of the TU. */ + if (cum->decl && !TREE_PUBLIC (cum->decl)) + return; -static int -ix86_mode_entry (int entity) -{ - switch (entity) - { - case X86_DIRFLAG: - return ix86_dirflag_mode_entry (); - case AVX_U128: - return ix86_avx_u128_mode_entry (); - case I387_TRUNC: - case I387_FLOOR: - case I387_CEIL: - return I387_CW_ANY; - default: - gcc_unreachable (); - } -} + const_tree ctx = get_ultimate_context (cum->decl); + if (ctx != NULL_TREE + && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx)) + return; -static int -ix86_avx_u128_mode_exit (void) -{ - rtx reg = crtl->return_rtx; + /* If the actual size of the type is zero, then there is no change + in how objects of this size are passed. */ + if (int_size_in_bytes (type) == 0) + return; - /* Exit mode is set to AVX_U128_DIRTY if there are 256bit - or 512 bit modes used in the function return register. 
*/ - if (reg && ix86_check_avx_upper_register (reg)) - return AVX_U128_DIRTY; + warning (OPT_Wabi, "empty class %qT parameter passing ABI " + "changes in %<-fabi-version=12%> (GCC 8)", type); - /* Exit mode is set to AVX_U128_DIRTY if there are 256bit or 512bit - modes used in function arguments, otherwise return AVX_U128_CLEAN. - */ - return ix86_avx_u128_mode_entry (); + /* Only warn once. */ + cum->warn_empty = false; } -/* Return a mode that ENTITY is assumed to be - switched to at function exit. */ +/* This hook returns name of multilib ABI. */ -static int -ix86_mode_exit (int entity) +static const char * +ix86_get_multilib_abi_name (void) { - switch (entity) - { - case X86_DIRFLAG: - return X86_DIRFLAG_ANY; - case AVX_U128: - return ix86_avx_u128_mode_exit (); - case I387_TRUNC: - case I387_FLOOR: - case I387_CEIL: - return I387_CW_ANY; - default: - gcc_unreachable (); - } + if (!(TARGET_64BIT_P (ix86_isa_flags))) + return "i386"; + else if (TARGET_X32_P (ix86_isa_flags)) + return "x32"; + else + return "x86_64"; } +/* Compute the alignment for a variable for Intel MCU psABI. TYPE is + the data type, and ALIGN is the alignment that the object would + ordinarily have. */ + static int -ix86_mode_priority (int, int n) +iamcu_alignment (tree type, int align) { - return n; + machine_mode mode; + + if (align < 32 || TYPE_USER_ALIGN (type)) + return align; + + /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4 + bytes. */ + mode = TYPE_MODE (strip_array_types (type)); + switch (GET_MODE_CLASS (mode)) + { + case MODE_INT: + case MODE_COMPLEX_INT: + case MODE_COMPLEX_FLOAT: + case MODE_FLOAT: + case MODE_DECIMAL_FLOAT: + return 32; + default: + return align; + } } -/* Output code to initialize control word copies used by trunc?f?i and - rounding patterns. CURRENT_MODE is set to current control word, - while NEW_MODE is set to new control word. */ +/* Compute the alignment for a static variable. + TYPE is the data type, and ALIGN is the alignment that + the object would ordinarily have. The value of this function is used + instead of that alignment to align the object. */ -static void -emit_i387_cw_initialization (int mode) +int +ix86_data_alignment (tree type, unsigned int align, bool opt) { - rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED); - rtx new_mode; + /* GCC 4.8 and earlier used to incorrectly assume this alignment even + for symbols from other compilation units or symbols that don't need + to bind locally. In order to preserve some ABI compatibility with + those compilers, ensure we don't decrease alignment from what we + used to assume. */ - enum ix86_stack_slot slot; + unsigned int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT); - rtx reg = gen_reg_rtx (HImode); + /* A data structure, equal or greater than the size of a cache line + (64 bytes in the Pentium 4 and other recent Intel processors, including + processors based on Intel Core microarchitecture) should be aligned + so that its base address is a multiple of a cache line size. 
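The ix86_data_alignment logic around this point boils down to a few size thresholds. A condensed model is sketched below for orientation; the function name and the bit-based interface are assumptions for illustration, it assumes a 64-byte prefetch block, and it ignores the -malign-data= variants, the Intel MCU psABI case and user-specified alignment that the real code also handles.

/* Condensed model of the static-data alignment bumps (all values in bits).
   Illustrative only; assumes a 64-byte prefetch block (max_align == 512).  */
static unsigned int
model_data_alignment (unsigned int size, unsigned int align, int aggregate_p)
{
  if (aggregate_p && size >= 256 && align < 256)
    align = 256;   /* keep the alignment GCC 4.8 and earlier used to assume */
  if (aggregate_p && size >= 512 && align < 512)
    align = 512;   /* cache-line align objects of a cache line or more      */
  if (size >= 128 && align < 128)
    align = 128;   /* x86-64 ABI: arrays of 16 bytes or more -> 16 bytes    */
  return align;
}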
*/ - emit_insn (gen_x86_fnstcw_1 (stored_mode)); - emit_move_insn (reg, copy_rtx (stored_mode)); + unsigned int max_align + = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT); - switch (mode) - { - case I387_CW_TRUNC: - /* round toward zero (truncate) */ - emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00))); - slot = SLOT_CW_TRUNC; - break; + if (max_align < BITS_PER_WORD) + max_align = BITS_PER_WORD; - case I387_CW_FLOOR: - /* round down toward -oo */ - emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00))); - emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400))); - slot = SLOT_CW_FLOOR; - break; + switch (ix86_align_data_type) + { + case ix86_align_data_type_abi: opt = false; break; + case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break; + case ix86_align_data_type_cacheline: break; + } - case I387_CW_CEIL: - /* round up toward +oo */ - emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00))); - emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800))); - slot = SLOT_CW_CEIL; - break; + if (TARGET_IAMCU) + align = iamcu_alignment (type, align); - default: - gcc_unreachable (); + if (opt + && AGGREGATE_TYPE_P (type) + && TYPE_SIZE (type) + && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST) + { + if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align_compat) + && align < max_align_compat) + align = max_align_compat; + if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align) + && align < max_align) + align = max_align; } - gcc_assert (slot < MAX_386_STACK_LOCALS); + /* x86-64 ABI requires arrays greater than 16 bytes to be aligned + to 16byte boundary. */ + if (TARGET_64BIT) + { + if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE) + && TYPE_SIZE (type) + && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST + && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128) + && align < 128) + return 128; + } - new_mode = assign_386_stack_local (HImode, slot); - emit_move_insn (new_mode, reg); -} + if (!opt) + return align; -/* Generate one or more insns to set ENTITY to MODE. 
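emit_i387_cw_initialization above only ever touches the two rounding-control bits (mask 0x0c00) of the x87 control word. The bit patterns it installs are summarized below for reference; the enum is illustrative, the encodings follow from the x87 control-word format and from the ior/and operations in the hunk above.

/* x87 control word, bits 10-11 = rounding control (mask 0x0c00):
     00 -> round to nearest (the default, left untouched here)
     01 -> round down toward -inf   (I387_CW_FLOOR)
     10 -> round up toward +inf     (I387_CW_CEIL)
     11 -> round toward zero        (I387_CW_TRUNC)  */
enum model_i387_rounding {
  MODEL_RC_MASK  = 0x0c00,
  MODEL_RC_FLOOR = 0x0400,    /* cw = (cw & ~0x0c00) | 0x0400 */
  MODEL_RC_CEIL  = 0x0800,    /* cw = (cw & ~0x0c00) | 0x0800 */
  MODEL_RC_TRUNC = 0x0c00     /* cw |= 0x0c00                 */
};

The modified copies are kept in the SLOT_CW_* stack slots so that the trunc/floor/ceil patterns can switch rounding mode with a single fldcw and restore the original control word afterwards, as output_fix_trunc below does.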
*/ + if (TREE_CODE (type) == ARRAY_TYPE) + { + if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64) + return 64; + if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128) + return 128; + } + else if (TREE_CODE (type) == COMPLEX_TYPE) + { -static void -ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED, - HARD_REG_SET regs_live ATTRIBUTE_UNUSED) -{ - switch (entity) + if (TYPE_MODE (type) == DCmode && align < 64) + return 64; + if ((TYPE_MODE (type) == XCmode + || TYPE_MODE (type) == TCmode) && align < 128) + return 128; + } + else if ((TREE_CODE (type) == RECORD_TYPE + || TREE_CODE (type) == UNION_TYPE + || TREE_CODE (type) == QUAL_UNION_TYPE) + && TYPE_FIELDS (type)) { - case X86_DIRFLAG: - if (mode == X86_DIRFLAG_RESET) - emit_insn (gen_cld ()); - break; - case AVX_U128: - if (mode == AVX_U128_CLEAN) - emit_insn (gen_avx_vzeroupper ()); - break; - case I387_TRUNC: - case I387_FLOOR: - case I387_CEIL: - if (mode != I387_CW_ANY - && mode != I387_CW_UNINITIALIZED) - emit_i387_cw_initialization (mode); - break; - default: - gcc_unreachable (); + if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64) + return 64; + if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128) + return 128; + } + else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE + || TREE_CODE (type) == INTEGER_TYPE) + { + if (TYPE_MODE (type) == DFmode && align < 64) + return 64; + if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128) + return 128; } + + return align; } -/* Output code for INSN to convert a float to a signed int. OPERANDS - are the insn operands. The output may be [HSD]Imode and the input - operand may be [SDX]Fmode. */ +/* Compute the alignment for a local variable or a stack slot. EXP is + the data type or decl itself, MODE is the widest mode available and + ALIGN is the alignment that the object would ordinarily have. The + value of this macro is used instead of that alignment to align the + object. */ -const char * -output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp) +unsigned int +ix86_local_alignment (tree exp, machine_mode mode, + unsigned int align) { - bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG); - bool dimode_p = GET_MODE (operands[0]) == DImode; - int round_mode = get_attr_i387_cw (insn); - - static char buf[40]; - const char *p; + tree type, decl; - /* Jump through a hoop or two for DImode, since the hardware has no - non-popping instruction. We used to do this a different way, but - that was somewhat fragile and broke with post-reload splitters. */ - if ((dimode_p || fisttp) && !stack_top_dies) - output_asm_insn ("fld\t%y1", operands); - - gcc_assert (STACK_TOP_P (operands[1])); - gcc_assert (MEM_P (operands[0])); - gcc_assert (GET_MODE (operands[1]) != TFmode); - - if (fisttp) - return "fisttp%Z0\t%0"; - - strcpy (buf, "fist"); - - if (round_mode != I387_CW_ANY) - output_asm_insn ("fldcw\t%3", operands); - - p = "p%Z0\t%0"; - strcat (buf, p + !(stack_top_dies || dimode_p)); - - output_asm_insn (buf, operands); - - if (round_mode != I387_CW_ANY) - output_asm_insn ("fldcw\t%2", operands); - - return ""; -} - -/* Output code for x87 ffreep insn. The OPNO argument, which may only - have the values zero or one, indicates the ffreep insn's operand - from the OPERANDS array. */ - -static const char * -output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno) -{ - if (TARGET_USE_FFREEP) -#ifdef HAVE_AS_IX86_FFREEP - return opno ? 
"ffreep\t%y1" : "ffreep\t%y0"; -#else + if (exp && DECL_P (exp)) { - static char retval[32]; - int regno = REGNO (operands[opno]); - - gcc_assert (STACK_REGNO_P (regno)); - - regno -= FIRST_STACK_REG; - - snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno); - return retval; + type = TREE_TYPE (exp); + decl = exp; + } + else + { + type = exp; + decl = NULL; } -#endif - return opno ? "fstp\t%y1" : "fstp\t%y0"; -} + /* Don't do dynamic stack realignment for long long objects with + -mpreferred-stack-boundary=2. */ + if (!TARGET_64BIT + && align == 64 + && ix86_preferred_stack_boundary < 64 + && (mode == DImode || (type && TYPE_MODE (type) == DImode)) + && (!type || !TYPE_USER_ALIGN (type)) + && (!decl || !DECL_USER_ALIGN (decl))) + align = 32; + /* If TYPE is NULL, we are allocating a stack slot for caller-save + register in MODE. We will return the largest alignment of XF + and DF. */ + if (!type) + { + if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode)) + align = GET_MODE_ALIGNMENT (DFmode); + return align; + } -/* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi - should be used. UNORDERED_P is true when fucom should be used. */ + /* Don't increase alignment for Intel MCU psABI. */ + if (TARGET_IAMCU) + return align; -const char * -output_fp_compare (rtx_insn *insn, rtx *operands, - bool eflags_p, bool unordered_p) -{ - rtx *xops = eflags_p ? &operands[0] : &operands[1]; - bool stack_top_dies; + /* x86-64 ABI requires arrays greater than 16 bytes to be aligned + to 16byte boundary. Exact wording is: - static char buf[40]; - const char *p; + An array uses the same alignment as its elements, except that a local or + global array variable of length at least 16 bytes or + a C99 variable-length array variable always has alignment of at least 16 bytes. - gcc_assert (STACK_TOP_P (xops[0])); + This was added to allow use of aligned SSE instructions at arrays. This + rule is meant for static storage (where compiler cannot do the analysis + by itself). We follow it for automatic variables only when convenient. + We fully control everything in the function compiled and functions from + other unit cannot rely on the alignment. - stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG); + Exclude va_list type. It is the common case of local array where + we cannot benefit from the alignment. - if (eflags_p) + TODO: Probably one should optimize for size only when var is not escaping. */ + if (TARGET_64BIT && optimize_function_for_speed_p (cfun) + && TARGET_SSE) { - p = unordered_p ? "fucomi" : "fcomi"; - strcpy (buf, p); - - p = "p\t{%y1, %0|%0, %y1}"; - strcat (buf, p + !stack_top_dies); - - return buf; + if (AGGREGATE_TYPE_P (type) + && (va_list_type_node == NULL_TREE + || (TYPE_MAIN_VARIANT (type) + != TYPE_MAIN_VARIANT (va_list_type_node))) + && TYPE_SIZE (type) + && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST + && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128) + && align < 128) + return 128; } - - if (STACK_REG_P (xops[1]) - && stack_top_dies - && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1)) + if (TREE_CODE (type) == ARRAY_TYPE) { - gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1); - - /* If both the top of the 387 stack die, and the other operand - is also a stack register that dies, then this must be a - `fcompp' float compare. */ - p = unordered_p ? 
"fucompp" : "fcompp"; - strcpy (buf, p); + if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64) + return 64; + if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128) + return 128; } - else if (const0_operand (xops[1], VOIDmode)) + else if (TREE_CODE (type) == COMPLEX_TYPE) { - gcc_assert (!unordered_p); - strcpy (buf, "ftst"); + if (TYPE_MODE (type) == DCmode && align < 64) + return 64; + if ((TYPE_MODE (type) == XCmode + || TYPE_MODE (type) == TCmode) && align < 128) + return 128; } - else + else if ((TREE_CODE (type) == RECORD_TYPE + || TREE_CODE (type) == UNION_TYPE + || TREE_CODE (type) == QUAL_UNION_TYPE) + && TYPE_FIELDS (type)) { - if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT) - { - gcc_assert (!unordered_p); - p = "ficom"; - } - else - p = unordered_p ? "fucom" : "fcom"; - - strcpy (buf, p); - - p = "p%Z2\t%y2"; - strcat (buf, p + !stack_top_dies); + if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64) + return 64; + if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128) + return 128; } + else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE + || TREE_CODE (type) == INTEGER_TYPE) + { - output_asm_insn (buf, operands); - return "fnstsw\t%0"; + if (TYPE_MODE (type) == DFmode && align < 64) + return 64; + if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128) + return 128; + } + return align; } -void -ix86_output_addr_vec_elt (FILE *file, int value) -{ - const char *directive = ASM_LONG; - -#ifdef ASM_QUAD - if (TARGET_LP64) - directive = ASM_QUAD; -#else - gcc_assert (!TARGET_64BIT); -#endif - - fprintf (file, "%s%s%d\n", directive, LPREFIX, value); -} +/* Compute the minimum required alignment for dynamic stack realignment + purposes for a local variable, parameter or a stack slot. EXP is + the data type or decl itself, MODE is its mode and ALIGN is the + alignment that the object would ordinarily have. */ -void -ix86_output_addr_diff_elt (FILE *file, int value, int rel) +unsigned int +ix86_minimum_alignment (tree exp, machine_mode mode, + unsigned int align) { - const char *directive = ASM_LONG; + tree type, decl; -#ifdef ASM_QUAD - if (TARGET_64BIT && CASE_VECTOR_MODE == DImode) - directive = ASM_QUAD; -#else - gcc_assert (!TARGET_64BIT); -#endif - /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */ - if (TARGET_64BIT || TARGET_VXWORKS_RTP) - fprintf (file, "%s%s%d-%s%d\n", - directive, LPREFIX, value, LPREFIX, rel); -#if TARGET_MACHO - else if (TARGET_MACHO) + if (exp && DECL_P (exp)) { - fprintf (file, ASM_LONG "%s%d-", LPREFIX, value); - machopic_output_function_base_name (file); - putc ('\n', file); + type = TREE_TYPE (exp); + decl = exp; } -#endif - else if (HAVE_AS_GOTOFF_IN_DATA) - fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value); else - asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n", - GOT_SYMBOL_NAME, LPREFIX, value); -} - -/* Generate either "mov $0, reg" or "xor reg, reg", as appropriate - for the target. */ - -void -ix86_expand_clear (rtx dest) -{ - rtx tmp; - - /* We play register width games, which are only valid after reload. */ - gcc_assert (reload_completed); + { + type = exp; + decl = NULL; + } - /* Avoid HImode and its attendant prefix byte. 
*/ - if (GET_MODE_SIZE (GET_MODE (dest)) < 4) - dest = gen_rtx_REG (SImode, REGNO (dest)); - tmp = gen_rtx_SET (dest, const0_rtx); + if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64) + return align; - if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ()) + /* Don't do dynamic stack realignment for long long objects with + -mpreferred-stack-boundary=2. */ + if ((mode == DImode || (type && TYPE_MODE (type) == DImode)) + && (!type || !TYPE_USER_ALIGN (type)) + && (!decl || !DECL_USER_ALIGN (decl))) { - rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob)); + gcc_checking_assert (!TARGET_STV); + return 32; } - emit_insn (tmp); + return align; } + +/* Find a location for the static chain incoming to a nested function. + This is a register, unless all free registers are used by arguments. */ -void -ix86_expand_move (machine_mode mode, rtx operands[]) -{ - rtx op0, op1; - rtx tmp, addend = NULL_RTX; - enum tls_model model; - - op0 = operands[0]; - op1 = operands[1]; +static rtx +ix86_static_chain (const_tree fndecl_or_type, bool incoming_p) +{ + unsigned regno; - switch (GET_CODE (op1)) + if (TARGET_64BIT) { - case CONST: - tmp = XEXP (op1, 0); - - if (GET_CODE (tmp) != PLUS - || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF) - break; - - op1 = XEXP (tmp, 0); - addend = XEXP (tmp, 1); - /* FALLTHRU */ + /* We always use R10 in 64-bit mode. */ + regno = R10_REG; + } + else + { + const_tree fntype, fndecl; + unsigned int ccvt; - case SYMBOL_REF: - model = SYMBOL_REF_TLS_MODEL (op1); + /* By default in 32-bit mode we use ECX to pass the static chain. */ + regno = CX_REG; - if (model) - op1 = legitimize_tls_address (op1, model, true); - else if (ix86_force_load_from_GOT_p (op1)) + if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL) { - /* Load the external function address via GOT slot to avoid PLT. */ - op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1), - (TARGET_64BIT - ? UNSPEC_GOTPCREL - : UNSPEC_GOT)); - op1 = gen_rtx_CONST (Pmode, op1); - op1 = gen_const_mem (Pmode, op1); - set_mem_alias_set (op1, ix86_GOT_alias_set ()); + fntype = TREE_TYPE (fndecl_or_type); + fndecl = fndecl_or_type; } else { - tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX); - if (tmp) - { - op1 = tmp; - if (!addend) - break; - } - else - { - op1 = operands[1]; - break; - } + fntype = fndecl_or_type; + fndecl = NULL; } - if (addend) + ccvt = ix86_get_callcvt (fntype); + if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) { - op1 = force_operand (op1, NULL_RTX); - op1 = expand_simple_binop (Pmode, PLUS, op1, addend, - op0, 1, OPTAB_DIRECT); + /* Fastcall functions use ecx/edx for arguments, which leaves + us with EAX for the static chain. + Thiscall functions use ecx for arguments, which also + leaves us with EAX for the static chain. */ + regno = AX_REG; } - else - op1 = force_operand (op1, op0); - - if (op1 == op0) - return; - - op1 = convert_to_mode (mode, op1, 1); - - default: - break; - } - - if ((flag_pic || MACHOPIC_INDIRECT) - && symbolic_operand (op1, mode)) - { - if (TARGET_MACHO && !TARGET_64BIT) + else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) { -#if TARGET_MACHO - /* dynamic-no-pic */ - if (MACHOPIC_INDIRECT) - { - rtx temp = (op0 && REG_P (op0) && mode == Pmode) - ? op0 : gen_reg_rtx (Pmode); - op1 = machopic_indirect_data_reference (op1, temp); - if (MACHOPIC_PURE) - op1 = machopic_legitimize_pic_address (op1, mode, - temp == op1 ? 
0 : temp); - } - if (op0 != op1 && GET_CODE (op0) != MEM) - { - rtx insn = gen_rtx_SET (op0, op1); - emit_insn (insn); - return; - } - if (GET_CODE (op0) == MEM) - op1 = force_reg (Pmode, op1); - else - { - rtx temp = op0; - if (GET_CODE (temp) != REG) - temp = gen_reg_rtx (Pmode); - temp = legitimize_pic_address (op1, temp); - if (temp == op0) - return; - op1 = temp; - } - /* dynamic-no-pic */ -#endif + /* Thiscall functions use ecx for arguments, which leaves + us with EAX and EDX for the static chain. + We are using for abi-compatibility EAX. */ + regno = AX_REG; } - else + else if (ix86_function_regparm (fntype, fndecl) == 3) { - if (MEM_P (op0)) - op1 = force_reg (mode, op1); - else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode))) - { - rtx reg = can_create_pseudo_p () ? NULL_RTX : op0; - op1 = legitimize_pic_address (op1, reg); - if (op0 == op1) - return; - op1 = convert_to_mode (mode, op1, 1); - } - } - } - else - { - if (MEM_P (op0) - && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode) - || !push_operand (op0, mode)) - && MEM_P (op1)) - op1 = force_reg (mode, op1); - - if (push_operand (op0, mode) - && ! general_no_elim_operand (op1, mode)) - op1 = copy_to_mode_reg (mode, op1); - - /* Force large constants in 64bit compilation into register - to get them CSEed. */ - if (can_create_pseudo_p () - && (mode == DImode) && TARGET_64BIT - && immediate_operand (op1, mode) - && !x86_64_zext_immediate_operand (op1, VOIDmode) - && !register_operand (op0, mode) - && optimize) - op1 = copy_to_mode_reg (mode, op1); - - if (can_create_pseudo_p () - && CONST_DOUBLE_P (op1)) - { - /* If we are loading a floating point constant to a register, - force the value to memory now, since we'll get better code - out the back end. */ - - op1 = validize_mem (force_const_mem (mode, op1)); - if (!register_operand (op0, mode)) + /* For regparm 3, we have no free call-clobbered registers in + which to store the static chain. In order to implement this, + we have the trampoline push the static chain to the stack. + However, we can't push a value below the return address when + we call the nested function directly, so we have to use an + alternate entry point. For this we use ESI, and have the + alternate entry point push ESI, so that things appear the + same once we're executing the nested function. */ + if (incoming_p) { - rtx temp = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (temp, op1)); - emit_move_insn (op0, temp); - return; + if (fndecl == current_function_decl + && !ix86_static_chain_on_stack) + { + gcc_assert (!reload_completed); + ix86_static_chain_on_stack = true; + } + return gen_frame_mem (SImode, + plus_constant (Pmode, + arg_pointer_rtx, -8)); } + regno = SI_REG; } } - emit_insn (gen_rtx_SET (op0, op1)); -} - -void -ix86_expand_vector_move (machine_mode mode, rtx operands[]) -{ - rtx op0 = operands[0], op1 = operands[1]; - /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU - psABI since the biggest alignment is 4 byte for IA MCU psABI. */ - unsigned int align = (TARGET_IAMCU - ? GET_MODE_BITSIZE (mode) - : GET_MODE_ALIGNMENT (mode)); - - if (push_operand (op0, VOIDmode)) - op0 = emit_move_resolve_push (mode, op0); - - /* Force constants other than zero into memory. We do not know how - the instructions used to build constants modify the upper 64 bits - of the register, once we have that information we may be able - to handle some of them more efficiently. 
*/ - if (can_create_pseudo_p () - && (CONSTANT_P (op1) - || (SUBREG_P (op1) - && CONSTANT_P (SUBREG_REG (op1)))) - && ((register_operand (op0, mode) - && !standard_sse_constant_p (op1, mode)) - /* ix86_expand_vector_move_misalign() does not like constants. */ - || (SSE_REG_MODE_P (mode) - && MEM_P (op0) - && MEM_ALIGN (op0) < align))) - { - if (SUBREG_P (op1)) - { - machine_mode imode = GET_MODE (SUBREG_REG (op1)); - rtx r = force_const_mem (imode, SUBREG_REG (op1)); - if (r) - r = validize_mem (r); - else - r = force_reg (imode, SUBREG_REG (op1)); - op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1)); - } - else - op1 = validize_mem (force_const_mem (mode, op1)); - } - - /* We need to check memory alignment for SSE mode since attribute - can make operands unaligned. */ - if (can_create_pseudo_p () - && SSE_REG_MODE_P (mode) - && ((MEM_P (op0) && (MEM_ALIGN (op0) < align)) - || (MEM_P (op1) && (MEM_ALIGN (op1) < align)))) - { - rtx tmp[2]; - - /* ix86_expand_vector_move_misalign() does not like both - arguments in memory. */ - if (!register_operand (op0, mode) - && !register_operand (op1, mode)) - op1 = force_reg (mode, op1); - - tmp[0] = op0; tmp[1] = op1; - ix86_expand_vector_move_misalign (mode, tmp); - return; - } - - /* Make operand1 a register if it isn't already. */ - if (can_create_pseudo_p () - && !register_operand (op0, mode) - && !register_operand (op1, mode)) - { - emit_move_insn (op0, force_reg (GET_MODE (op0), op1)); - return; - } - - emit_insn (gen_rtx_SET (op0, op1)); + return gen_rtx_REG (Pmode, regno); } -/* Split 32-byte AVX unaligned load and store if needed. */ +/* Emit RTL insns to initialize the variable parts of a trampoline. + FNDECL is the decl of the target address; M_TRAMP is a MEM for + the trampoline, and CHAIN_VALUE is an RTX for the static chain + to be passed to the target function. */ static void -ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1) +ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) { - rtx m; - rtx (*extract) (rtx, rtx, rtx); - machine_mode mode; + rtx mem, fnaddr; + int opcode; + int offset = 0; + bool need_endbr = (flag_cf_protection & CF_BRANCH); - if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD) - || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE)) - { - emit_insn (gen_rtx_SET (op0, op1)); - return; - } + fnaddr = XEXP (DECL_RTL (fndecl), 0); - rtx orig_op0 = NULL_RTX; - mode = GET_MODE (op0); - switch (GET_MODE_CLASS (mode)) + if (TARGET_64BIT) { - case MODE_VECTOR_INT: - case MODE_INT: - if (mode != V32QImode) + int size; + + if (need_endbr) { - if (!MEM_P (op0)) - { - orig_op0 = op0; - op0 = gen_reg_rtx (V32QImode); - } - else - op0 = gen_lowpart (V32QImode, op0); - op1 = gen_lowpart (V32QImode, op1); - mode = V32QImode; + /* Insert ENDBR64. 
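The 32-byte split performed by ix86_avx256_split_vector_move_misalign above (enabled by the -mavx256-split-unaligned-load/store tunings) can be pictured with AVX intrinsics. The sketch below only illustrates the shape of the emitted code, two 16-byte halves glued with a 128-bit insert/extract; it is not the exact sequence GCC produces, and the function names are made up. Compile with -mavx.

#include <immintrin.h>

/* Split unaligned 32-byte load: low half via an unaligned 16-byte load,
   high half inserted as the upper 128-bit lane.  */
static __m256
split_load (const float *p)
{
  __m256 lo = _mm256_castps128_ps256 (_mm_loadu_ps (p));
  return _mm256_insertf128_ps (lo, _mm_loadu_ps (p + 4), 1);
}

/* Split unaligned 32-byte store: store the two 16-byte lanes separately.  */
static void
split_store (float *p, __m256 v)
{
  _mm_storeu_ps (p, _mm256_castps256_ps128 (v));
  _mm_storeu_ps (p + 4, _mm256_extractf128_ps (v, 1));
}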
*/ + mem = adjust_address (m_tramp, SImode, offset); + emit_move_insn (mem, gen_int_mode (0xfa1e0ff3, SImode)); + offset += 4; } - break; - case MODE_VECTOR_FLOAT: - break; - default: - gcc_unreachable (); - } - - switch (mode) - { - default: - gcc_unreachable (); - case E_V32QImode: - extract = gen_avx_vextractf128v32qi; - mode = V16QImode; - break; - case E_V8SFmode: - extract = gen_avx_vextractf128v8sf; - mode = V4SFmode; - break; - case E_V4DFmode: - extract = gen_avx_vextractf128v4df; - mode = V2DFmode; - break; - } - - if (MEM_P (op1)) - { - rtx r = gen_reg_rtx (mode); - m = adjust_address (op1, mode, 0); - emit_move_insn (r, m); - m = adjust_address (op1, mode, 16); - r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m); - emit_move_insn (op0, r); - } - else if (MEM_P (op0)) - { - m = adjust_address (op0, mode, 0); - emit_insn (extract (m, op1, const0_rtx)); - m = adjust_address (op0, mode, 16); - emit_insn (extract (m, copy_rtx (op1), const1_rtx)); - } - else - gcc_unreachable (); - - if (orig_op0) - emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0)); -} - -/* Implement the movmisalign patterns for SSE. Non-SSE modes go - straight to ix86_expand_vector_move. */ -/* Code generation for scalar reg-reg moves of single and double precision data: - if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true) - movaps reg, reg - else - movss reg, reg - if (x86_sse_partial_reg_dependency == true) - movapd reg, reg - else - movsd reg, reg - - Code generation for scalar loads of double precision data: - if (x86_sse_split_regs == true) - movlpd mem, reg (gas syntax) - else - movsd mem, reg - - Code generation for unaligned packed loads of single precision data - (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency): - if (x86_sse_unaligned_move_optimal) - movups mem, reg - - if (x86_sse_partial_reg_dependency == true) - { - xorps reg, reg - movlps mem, reg - movhps mem+8, reg - } - else - { - movlps mem, reg - movhps mem+8, reg - } - - Code generation for unaligned packed loads of double precision data - (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs): - if (x86_sse_unaligned_move_optimal) - movupd mem, reg - - if (x86_sse_split_regs == true) - { - movlpd mem, reg - movhpd mem+8, reg - } - else - { - movsd mem, reg - movhpd mem+8, reg - } - */ - -void -ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[]) -{ - rtx op0, op1, m; - - op0 = operands[0]; - op1 = operands[1]; - - /* Use unaligned load/store for AVX512 or when optimizing for size. */ - if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ()) - { - emit_insn (gen_rtx_SET (op0, op1)); - return; - } - - if (TARGET_AVX) - { - if (GET_MODE_SIZE (mode) == 32) - ix86_avx256_split_vector_move_misalign (op0, op1); - else - /* Always use 128-bit mov_internal pattern for AVX. */ - emit_insn (gen_rtx_SET (op0, op1)); - return; - } - - if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL - || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) - { - emit_insn (gen_rtx_SET (op0, op1)); - return; - } - - /* ??? If we have typed data, then it would appear that using - movdqu is the only way to get unaligned data loaded with - integer type. */ - if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT) - { - emit_insn (gen_rtx_SET (op0, op1)); - return; - } - if (MEM_P (op1)) - { - if (TARGET_SSE2 && mode == V2DFmode) - { - rtx zero; + /* Load the function address to r11. Try to load address using + the shorter movl instead of movabs. 
We may want to support + movq for kernel mode, but kernel does not use trampolines at + the moment. FNADDR is a 32bit address and may not be in + DImode when ptr_mode == SImode. Always use movl in this + case. */ + if (ptr_mode == SImode + || x86_64_zext_immediate_operand (fnaddr, VOIDmode)) + { + fnaddr = copy_addr_to_reg (fnaddr); - /* When SSE registers are split into halves, we can avoid - writing to the top half twice. */ - if (TARGET_SSE_SPLIT_REGS) - { - emit_clobber (op0); - zero = op0; - } - else - { - /* ??? Not sure about the best option for the Intel chips. - The following would seem to satisfy; the register is - entirely cleared, breaking the dependency chain. We - then store to the upper half, with a dependency depth - of one. A rumor has it that Intel recommends two movsd - followed by an unpacklpd, but this is unconfirmed. And - given that the dependency depth of the unpacklpd would - still be one, I'm not sure why this would be better. */ - zero = CONST0_RTX (V2DFmode); - } + mem = adjust_address (m_tramp, HImode, offset); + emit_move_insn (mem, gen_int_mode (0xbb41, HImode)); - m = adjust_address (op1, DFmode, 0); - emit_insn (gen_sse2_loadlpd (op0, zero, m)); - m = adjust_address (op1, DFmode, 8); - emit_insn (gen_sse2_loadhpd (op0, op0, m)); + mem = adjust_address (m_tramp, SImode, offset + 2); + emit_move_insn (mem, gen_lowpart (SImode, fnaddr)); + offset += 6; } else - { - rtx t; - - if (mode != V4SFmode) - t = gen_reg_rtx (V4SFmode); - else - t = op0; - - if (TARGET_SSE_PARTIAL_REG_DEPENDENCY) - emit_move_insn (t, CONST0_RTX (V4SFmode)); - else - emit_clobber (t); + { + mem = adjust_address (m_tramp, HImode, offset); + emit_move_insn (mem, gen_int_mode (0xbb49, HImode)); - m = adjust_address (op1, V2SFmode, 0); - emit_insn (gen_sse_loadlps (t, t, m)); - m = adjust_address (op1, V2SFmode, 8); - emit_insn (gen_sse_loadhps (t, t, m)); - if (mode != V4SFmode) - emit_move_insn (op0, gen_lowpart (mode, t)); + mem = adjust_address (m_tramp, DImode, offset + 2); + emit_move_insn (mem, fnaddr); + offset += 10; } - } - else if (MEM_P (op0)) - { - if (TARGET_SSE2 && mode == V2DFmode) + + /* Load static chain using movabs to r10. Use the shorter movl + instead of movabs when ptr_mode == SImode. */ + if (ptr_mode == SImode) { - m = adjust_address (op0, DFmode, 0); - emit_insn (gen_sse2_storelpd (m, op1)); - m = adjust_address (op0, DFmode, 8); - emit_insn (gen_sse2_storehpd (m, op1)); + opcode = 0xba41; + size = 6; } else { - if (mode != V4SFmode) - op1 = gen_lowpart (V4SFmode, op1); - - m = adjust_address (op0, V2SFmode, 0); - emit_insn (gen_sse_storelps (m, op1)); - m = adjust_address (op0, V2SFmode, 8); - emit_insn (gen_sse_storehps (m, copy_rtx (op1))); + opcode = 0xba49; + size = 10; } - } - else - gcc_unreachable (); -} -/* Helper function of ix86_fixup_binary_operands to canonicalize - operand order. Returns true if the operands should be swapped. */ - -static bool -ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode, - rtx operands[]) -{ - rtx dst = operands[0]; - rtx src1 = operands[1]; - rtx src2 = operands[2]; + mem = adjust_address (m_tramp, HImode, offset); + emit_move_insn (mem, gen_int_mode (opcode, HImode)); - /* If the operation is not commutative, we can't do anything. */ - if (GET_RTX_CLASS (code) != RTX_COMM_ARITH - && GET_RTX_CLASS (code) != RTX_COMM_COMPARE) - return false; + mem = adjust_address (m_tramp, ptr_mode, offset + 2); + emit_move_insn (mem, chain_value); + offset += size; - /* Highest priority is that src1 should match dst. 
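With the constants stored above decoded, the 64-bit trampoline written by ix86_trampoline_init is easiest to read as a byte map. The array below is an illustration of the movabs flavour (64-bit immediates, CET branch protection enabled), with the address and chain fields shown as zero; the bytes are the same values emitted above, just laid out in memory order.

/* 64-bit trampoline image, movabs variant (immediate fields shown as 0).  */
static const unsigned char trampoline64[] = {
  0xf3, 0x0f, 0x1e, 0xfa,              /* endbr64                       */
  0x49, 0xbb, 0,0,0,0,0,0,0,0,         /* movabs $fnaddr, %r11          */
  0x49, 0xba, 0,0,0,0,0,0,0,0,         /* movabs $chain,  %r10          */
  0x49, 0xff, 0xe3,                    /* jmp *%r11                     */
  0x90                                 /* nop, pads the 4-byte store    */
};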
*/ - if (rtx_equal_p (dst, src1)) - return false; - if (rtx_equal_p (dst, src2)) - return true; - - /* Next highest priority is that immediate constants come second. */ - if (immediate_operand (src2, mode)) - return false; - if (immediate_operand (src1, mode)) - return true; - - /* Lowest priority is that memory references should come second. */ - if (MEM_P (src2)) - return false; - if (MEM_P (src1)) - return true; - - return false; -} - - -/* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the - destination to use for the operation. If different from the true - destination in operands[0], a copy operation will be required. */ - -rtx -ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode, - rtx operands[]) -{ - rtx dst = operands[0]; - rtx src1 = operands[1]; - rtx src2 = operands[2]; - - /* Canonicalize operand order. */ - if (ix86_swap_binary_operands_p (code, mode, operands)) - { - /* It is invalid to swap operands of different modes. */ - gcc_assert (GET_MODE (src1) == GET_MODE (src2)); - - std::swap (src1, src2); + /* Jump to r11; the last (unused) byte is a nop, only there to + pad the write out to a single 32-bit store. */ + mem = adjust_address (m_tramp, SImode, offset); + emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode)); + offset += 4; } - - /* Both source operands cannot be in memory. */ - if (MEM_P (src1) && MEM_P (src2)) + else { - /* Optimization: Only read from memory once. */ - if (rtx_equal_p (src1, src2)) + rtx disp, chain; + + /* Depending on the static chain location, either load a register + with a constant, or push the constant to the stack. All of the + instructions are the same size. */ + chain = ix86_static_chain (fndecl, true); + if (REG_P (chain)) { - src2 = force_reg (mode, src2); - src1 = src2; + switch (REGNO (chain)) + { + case AX_REG: + opcode = 0xb8; break; + case CX_REG: + opcode = 0xb9; break; + default: + gcc_unreachable (); + } } - else if (rtx_equal_p (dst, src1)) - src2 = force_reg (mode, src2); else - src1 = force_reg (mode, src1); - } - - /* If the destination is memory, and we do not have matching source - operands, do things in registers. */ - if (MEM_P (dst) && !rtx_equal_p (dst, src1)) - dst = gen_reg_rtx (mode); - - /* Source 1 cannot be a constant. */ - if (CONSTANT_P (src1)) - src1 = force_reg (mode, src1); - - /* Source 1 cannot be a non-matching memory. */ - if (MEM_P (src1) && !rtx_equal_p (dst, src1)) - src1 = force_reg (mode, src1); - - /* Improve address combine. */ - if (code == PLUS - && GET_MODE_CLASS (mode) == MODE_INT - && MEM_P (src2)) - src2 = force_reg (mode, src2); - - operands[1] = src1; - operands[2] = src2; - return dst; -} - -/* Similarly, but assume that the destination has already been - set up properly. */ - -void -ix86_fixup_binary_operands_no_copy (enum rtx_code code, - machine_mode mode, rtx operands[]) -{ - rtx dst = ix86_fixup_binary_operands (code, mode, operands); - gcc_assert (dst == operands[0]); -} + opcode = 0x68; -/* Attempt to expand a binary operator. Make the expansion closer to the - actual machine, then just general_operand, which will allow 3 separate - memory references (one output, two input) in a single insn. */ + if (need_endbr) + { + /* Insert ENDBR32. 
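ix86_static_chain and ix86_trampoline_init only come into play for nested functions whose address escapes. A minimal GNU C example of that situation is shown below; it is purely illustrative (the helper names are made up), and the register comments restate what the code above chooses: %r10 in 64-bit mode, %ecx for the default 32-bit convention, %eax for fastcall/thiscall.

/* GNU C nested function whose address escapes: GCC materializes a
   trampoline on the stack and reaches the enclosing frame through the
   static chain register selected by ix86_static_chain.  */
static int
apply (int (*fn) (int), int arg)
{
  return fn (arg);
}

int
outer (int bias)
{
  int add_bias (int x) { return x + bias; }  /* needs the static chain      */
  return apply (add_bias, 42);               /* address escapes: trampoline */
}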
*/ + mem = adjust_address (m_tramp, SImode, offset); + emit_move_insn (mem, gen_int_mode (0xfb1e0ff3, SImode)); + offset += 4; + } -void -ix86_expand_binary_operator (enum rtx_code code, machine_mode mode, - rtx operands[]) -{ - rtx src1, src2, dst, op, clob; + mem = adjust_address (m_tramp, QImode, offset); + emit_move_insn (mem, gen_int_mode (opcode, QImode)); - dst = ix86_fixup_binary_operands (code, mode, operands); - src1 = operands[1]; - src2 = operands[2]; + mem = adjust_address (m_tramp, SImode, offset + 1); + emit_move_insn (mem, chain_value); + offset += 5; - /* Emit the instruction. */ + mem = adjust_address (m_tramp, QImode, offset); + emit_move_insn (mem, gen_int_mode (0xe9, QImode)); - op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2)); + mem = adjust_address (m_tramp, SImode, offset + 1); - if (reload_completed - && code == PLUS - && !rtx_equal_p (dst, src1)) - { - /* This is going to be an LEA; avoid splitting it later. */ - emit_insn (op); - } - else - { - clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); + /* Compute offset from the end of the jmp to the target function. + In the case in which the trampoline stores the static chain on + the stack, we need to skip the first insn which pushes the + (call-saved) register static chain; this push is 1 byte. */ + offset += 5; + disp = expand_binop (SImode, sub_optab, fnaddr, + plus_constant (Pmode, XEXP (m_tramp, 0), + offset - (MEM_P (chain) ? 1 : 0)), + NULL_RTX, 1, OPTAB_DIRECT); + emit_move_insn (mem, disp); } - /* Fix up the destination if needed. */ - if (dst != operands[0]) - emit_move_insn (operands[0], dst); -} - -/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with - the given OPERANDS. */ + gcc_assert (offset <= TRAMPOLINE_SIZE); -void -ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode, - rtx operands[]) -{ - rtx op1 = NULL_RTX, op2 = NULL_RTX; - if (SUBREG_P (operands[1])) - { - op1 = operands[1]; - op2 = operands[2]; - } - else if (SUBREG_P (operands[2])) - { - op1 = operands[2]; - op2 = operands[1]; - } - /* Optimize (__m128i) d | (__m128i) e and similar code - when d and e are float vectors into float vector logical - insn. In C/C++ without using intrinsics there is no other way - to express vector logical operation on float vectors than - to cast them temporarily to integer vectors. 
*/ - if (op1 - && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL - && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR) - && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT - && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode) - && SUBREG_BYTE (op1) == 0 - && (GET_CODE (op2) == CONST_VECTOR - || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2)) - && SUBREG_BYTE (op2) == 0)) - && can_create_pseudo_p ()) - { - rtx dst; - switch (GET_MODE (SUBREG_REG (op1))) - { - case E_V4SFmode: - case E_V8SFmode: - case E_V16SFmode: - case E_V2DFmode: - case E_V4DFmode: - case E_V8DFmode: - dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1))); - if (GET_CODE (op2) == CONST_VECTOR) - { - op2 = gen_lowpart (GET_MODE (dst), op2); - op2 = force_reg (GET_MODE (dst), op2); - } - else - { - op1 = operands[1]; - op2 = SUBREG_REG (operands[2]); - if (!vector_operand (op2, GET_MODE (dst))) - op2 = force_reg (GET_MODE (dst), op2); - } - op1 = SUBREG_REG (op1); - if (!vector_operand (op1, GET_MODE (dst))) - op1 = force_reg (GET_MODE (dst), op1); - emit_insn (gen_rtx_SET (dst, - gen_rtx_fmt_ee (code, GET_MODE (dst), - op1, op2))); - emit_move_insn (operands[0], gen_lowpart (mode, dst)); - return; - default: - break; - } - } - if (!vector_operand (operands[1], mode)) - operands[1] = force_reg (mode, operands[1]); - if (!vector_operand (operands[2], mode)) - operands[2] = force_reg (mode, operands[2]); - ix86_fixup_binary_operands_no_copy (code, mode, operands); - emit_insn (gen_rtx_SET (operands[0], - gen_rtx_fmt_ee (code, mode, operands[1], - operands[2]))); +#ifdef HAVE_ENABLE_EXECUTE_STACK +#ifdef CHECK_EXECUTE_STACK_ENABLED + if (CHECK_EXECUTE_STACK_ENABLED) +#endif + emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"), + LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode); +#endif } -/* Return TRUE or FALSE depending on whether the binary operator meets the - appropriate constraints. */ - -bool -ix86_binary_operator_ok (enum rtx_code code, machine_mode mode, - rtx operands[3]) +static bool +ix86_allocate_stack_slots_for_args (void) { - rtx dst = operands[0]; - rtx src1 = operands[1]; - rtx src2 = operands[2]; - - /* Both source operands cannot be in memory. */ - if (MEM_P (src1) && MEM_P (src2)) - return false; - - /* Canonicalize operand order for commutative operators. */ - if (ix86_swap_binary_operands_p (code, mode, operands)) - std::swap (src1, src2); - - /* If the destination is memory, we must have a matching source operand. */ - if (MEM_P (dst) && !rtx_equal_p (dst, src1)) - return false; - - /* Source 1 cannot be a constant. */ - if (CONSTANT_P (src1)) - return false; - - /* Source 1 cannot be a non-matching memory. */ - if (MEM_P (src1) && !rtx_equal_p (dst, src1)) - /* Support "andhi/andsi/anddi" as a zero-extending move. */ - return (code == AND - && (mode == HImode - || mode == SImode - || (TARGET_64BIT && mode == DImode)) - && satisfies_constraint_L (src2)); - - return true; + /* Naked functions should not allocate stack slots for arguments. */ + return !ix86_function_naked (current_function_decl); } -/* Attempt to expand a unary operator. Make the expansion closer to the - actual machine, then just general_operand, which will allow 2 separate - memory references (one output, one input) in a single insn. 
*/ - -void -ix86_expand_unary_operator (enum rtx_code code, machine_mode mode, - rtx operands[]) +static bool +ix86_warn_func_return (tree decl) { - bool matching_memory = false; - rtx src, dst, op, clob; - - dst = operands[0]; - src = operands[1]; - - /* If the destination is memory, and we do not have matching source - operands, do things in registers. */ - if (MEM_P (dst)) - { - if (rtx_equal_p (dst, src)) - matching_memory = true; - else - dst = gen_reg_rtx (mode); - } - - /* When source operand is memory, destination must match. */ - if (MEM_P (src) && !matching_memory) - src = force_reg (mode, src); - - /* Emit the instruction. */ - - op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src)); - - if (code == NOT) - emit_insn (op); - else - { - clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); - } - - /* Fix up the destination if needed. */ - if (dst != operands[0]) - emit_move_insn (operands[0], dst); + /* Naked functions are implemented entirely in assembly, including the + return sequence, so suppress warnings about this. */ + return !ix86_function_naked (decl); } - -/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and - divisor are within the range [0-255]. */ - -void -ix86_split_idivmod (machine_mode mode, rtx operands[], - bool signed_p) + +/* Return the shift count of a vector by scalar shift builtin second argument + ARG1. */ +static tree +ix86_vector_shift_count (tree arg1) { - rtx_code_label *end_label, *qimode_label; - rtx div, mod; - rtx_insn *insn; - rtx scratch, tmp0, tmp1, tmp2; - rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx); - rtx (*gen_zero_extend) (rtx, rtx); - rtx (*gen_test_ccno_1) (rtx, rtx); - - switch (mode) - { - case E_SImode: - if (GET_MODE (operands[0]) == SImode) - { - if (GET_MODE (operands[1]) == SImode) - gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1; - else - gen_divmod4_1 - = signed_p ? gen_divmodsi4_zext_2 : gen_udivmodsi4_zext_2; - gen_zero_extend = gen_zero_extendqisi2; - } - else - { - gen_divmod4_1 - = signed_p ? gen_divmodsi4_zext_1 : gen_udivmodsi4_zext_1; - gen_zero_extend = gen_zero_extendqidi2; - } - gen_test_ccno_1 = gen_testsi_ccno_1; - break; - case E_DImode: - gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1; - gen_test_ccno_1 = gen_testdi_ccno_1; - gen_zero_extend = gen_zero_extendqidi2; - break; - default: - gcc_unreachable (); - } - - end_label = gen_label_rtx (); - qimode_label = gen_label_rtx (); - - scratch = gen_reg_rtx (mode); - - /* Use 8bit unsigned divimod if dividend and divisor are within - the range [0-255]. */ - emit_move_insn (scratch, operands[2]); - scratch = expand_simple_binop (mode, IOR, scratch, operands[3], - scratch, 1, OPTAB_DIRECT); - emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100))); - tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG); - tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx); - tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0, - gen_rtx_LABEL_REF (VOIDmode, qimode_label), - pc_rtx); - insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0)); - predict_jump (REG_BR_PROB_BASE * 50 / 100); - JUMP_LABEL (insn) = qimode_label; - - /* Generate original signed/unsigned divimod. */ - div = gen_divmod4_1 (operands[0], operands[1], - operands[2], operands[3]); - emit_insn (div); - - /* Branch to the end. */ - emit_jump_insn (gen_jump (end_label)); - emit_barrier (); - - /* Generate 8bit unsigned divide. 
*/ - emit_label (qimode_label); - /* Don't use operands[0] for result of 8bit divide since not all - registers support QImode ZERO_EXTRACT. */ - tmp0 = lowpart_subreg (HImode, scratch, mode); - tmp1 = lowpart_subreg (HImode, operands[2], mode); - tmp2 = lowpart_subreg (QImode, operands[3], mode); - emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2)); - - if (signed_p) - { - div = gen_rtx_DIV (mode, operands[2], operands[3]); - mod = gen_rtx_MOD (mode, operands[2], operands[3]); - } - else - { - div = gen_rtx_UDIV (mode, operands[2], operands[3]); - mod = gen_rtx_UMOD (mode, operands[2], operands[3]); - } - if (mode == SImode) - { - if (GET_MODE (operands[0]) != SImode) - div = gen_rtx_ZERO_EXTEND (DImode, div); - if (GET_MODE (operands[1]) != SImode) - mod = gen_rtx_ZERO_EXTEND (DImode, mod); - } - - /* Extract remainder from AH. */ - tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), - tmp0, GEN_INT (8), GEN_INT (8)); - if (REG_P (operands[1])) - insn = emit_move_insn (operands[1], tmp1); - else + if (tree_fits_uhwi_p (arg1)) + return arg1; + else if (TREE_CODE (arg1) == VECTOR_CST && CHAR_BIT == 8) { - /* Need a new scratch register since the old one has result - of 8bit divide. */ - scratch = gen_reg_rtx (GET_MODE (operands[1])); - emit_move_insn (scratch, tmp1); - insn = emit_move_insn (operands[1], scratch); + /* The count argument is weird, passed in as various 128-bit + (or 64-bit) vectors, the low 64 bits from it are the count. */ + unsigned char buf[16]; + int len = native_encode_expr (arg1, buf, 16); + if (len == 0) + return NULL_TREE; + tree t = native_interpret_expr (uint64_type_node, buf, len); + if (t && tree_fits_uhwi_p (t)) + return t; } - set_unique_reg_note (insn, REG_EQUAL, mod); - - /* Zero extend quotient from AL. */ - tmp1 = gen_lowpart (QImode, tmp0); - insn = emit_insn (gen_zero_extend (operands[0], tmp1)); - set_unique_reg_note (insn, REG_EQUAL, div); - - emit_label (end_label); + return NULL_TREE; } -#define LEA_MAX_STALL (3) -#define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1) - -/* Increase given DISTANCE in half-cycles according to - dependencies between PREV and NEXT instructions. - Add 1 half-cycle if there is no dependency and - go to next cycle if there is some dependecy. 
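At the source level, the fast path that ix86_split_idivmod builds above amounts to the sketch below: when dividend and divisor both fit in 8 bits, one unsigned 8-bit divide (quotient in %al, remainder in %ah) replaces the much slower full-width divide, and the same unsigned divide serves the signed case because values in [0, 255] behave identically. The function name is illustrative only.

/* Source-level picture of the 8-bit divmod fast path.  */
static unsigned int
model_udivmod (unsigned int a, unsigned int b, unsigned int *rem)
{
  if (((a | b) & ~0xffu) == 0)                /* both operands in [0, 255]? */
    {
      unsigned char q = (unsigned char) a / (unsigned char) b;  /* divb   */
      *rem = (unsigned char) a % (unsigned char) b;             /* from %ah */
      return q;                                                 /* from %al */
    }
  *rem = a % b;                               /* general full-width divide */
  return a / b;
}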
*/ - -static unsigned int -increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance) +static tree +ix86_fold_builtin (tree fndecl, int n_args, + tree *args, bool ignore ATTRIBUTE_UNUSED) { - df_ref def, use; - - if (!prev || !next) - return distance + (distance & 1) + 2; + if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD) + { + enum ix86_builtins fn_code = (enum ix86_builtins) + DECL_FUNCTION_CODE (fndecl); + enum rtx_code rcode; + bool is_vshift; + unsigned HOST_WIDE_INT mask; - if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev)) - return distance + 1; + switch (fn_code) + { + case IX86_BUILTIN_CPU_IS: + case IX86_BUILTIN_CPU_SUPPORTS: + gcc_assert (n_args == 1); + return fold_builtin_cpu (fndecl, args); - FOR_EACH_INSN_USE (use, next) - FOR_EACH_INSN_DEF (def, prev) - if (!DF_REF_IS_ARTIFICIAL (def) - && DF_REF_REGNO (use) == DF_REF_REGNO (def)) - return distance + (distance & 1) + 2; + case IX86_BUILTIN_NANQ: + case IX86_BUILTIN_NANSQ: + { + tree type = TREE_TYPE (TREE_TYPE (fndecl)); + const char *str = c_getstr (*args); + int quiet = fn_code == IX86_BUILTIN_NANQ; + REAL_VALUE_TYPE real; - return distance + 1; -} + if (str && real_nan (&real, str, quiet, TYPE_MODE (type))) + return build_real (type, real); + return NULL_TREE; + } -/* Function checks if instruction INSN defines register number - REGNO1 or REGNO2. */ + case IX86_BUILTIN_INFQ: + case IX86_BUILTIN_HUGE_VALQ: + { + tree type = TREE_TYPE (TREE_TYPE (fndecl)); + REAL_VALUE_TYPE inf; + real_inf (&inf); + return build_real (type, inf); + } -static bool -insn_defines_reg (unsigned int regno1, unsigned int regno2, - rtx_insn *insn) -{ - df_ref def; - - FOR_EACH_INSN_DEF (def, insn) - if (DF_REF_REG_DEF_P (def) - && !DF_REF_IS_ARTIFICIAL (def) - && (regno1 == DF_REF_REGNO (def) - || regno2 == DF_REF_REGNO (def))) - return true; - - return false; -} - -/* Function checks if instruction INSN uses register number - REGNO as a part of address expression. */ - -static bool -insn_uses_reg_mem (unsigned int regno, rtx insn) -{ - df_ref use; - - FOR_EACH_INSN_USE (use, insn) - if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use)) - return true; - - return false; -} + case IX86_BUILTIN_TZCNT16: + case IX86_BUILTIN_CTZS: + case IX86_BUILTIN_TZCNT32: + case IX86_BUILTIN_TZCNT64: + gcc_assert (n_args == 1); + if (TREE_CODE (args[0]) == INTEGER_CST) + { + tree type = TREE_TYPE (TREE_TYPE (fndecl)); + tree arg = args[0]; + if (fn_code == IX86_BUILTIN_TZCNT16 + || fn_code == IX86_BUILTIN_CTZS) + arg = fold_convert (short_unsigned_type_node, arg); + if (integer_zerop (arg)) + return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg))); + else + return fold_const_call (CFN_CTZ, type, arg); + } + break; -/* Search backward for non-agu definition of register number REGNO1 - or register number REGNO2 in basic block starting from instruction - START up to head of basic block or instruction INSN. + case IX86_BUILTIN_LZCNT16: + case IX86_BUILTIN_CLZS: + case IX86_BUILTIN_LZCNT32: + case IX86_BUILTIN_LZCNT64: + gcc_assert (n_args == 1); + if (TREE_CODE (args[0]) == INTEGER_CST) + { + tree type = TREE_TYPE (TREE_TYPE (fndecl)); + tree arg = args[0]; + if (fn_code == IX86_BUILTIN_LZCNT16 + || fn_code == IX86_BUILTIN_CLZS) + arg = fold_convert (short_unsigned_type_node, arg); + if (integer_zerop (arg)) + return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg))); + else + return fold_const_call (CFN_CLZ, type, arg); + } + break; - Function puts true value into *FOUND var if definition was found - and false otherwise. 
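The TZCNT/LZCNT cases above fold constant arguments at compile time, including the zero input that plain __builtin_ctz/__builtin_clz leave undefined; for zero they return the operand width, matching the hardware definition of tzcnt/lzcnt. The snippet below illustrates this through the _tzcnt_u32/_lzcnt_u32 intrinsics, which sit on top of these builtins; the expected fold results are stated as comments. Compile with -mbmi -mlzcnt.

#include <immintrin.h>

int
folded_counts (void)
{
  return _tzcnt_u32 (0)          /* can fold to 32 */
         + _lzcnt_u32 (0)        /* can fold to 32 */
         + _tzcnt_u32 (0x100)    /* can fold to 8  */
         + _lzcnt_u32 (1);       /* can fold to 31 */
}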
+ case IX86_BUILTIN_BEXTR32: + case IX86_BUILTIN_BEXTR64: + case IX86_BUILTIN_BEXTRI32: + case IX86_BUILTIN_BEXTRI64: + gcc_assert (n_args == 2); + if (tree_fits_uhwi_p (args[1])) + { + unsigned HOST_WIDE_INT res = 0; + unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0])); + unsigned int start = tree_to_uhwi (args[1]); + unsigned int len = (start & 0xff00) >> 8; + start &= 0xff; + if (start >= prec || len == 0) + res = 0; + else if (!tree_fits_uhwi_p (args[0])) + break; + else + res = tree_to_uhwi (args[0]) >> start; + if (len > prec) + len = prec; + if (len < HOST_BITS_PER_WIDE_INT) + res &= (HOST_WIDE_INT_1U << len) - 1; + return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res); + } + break; - Distance in half-cycles between START and found instruction or head - of BB is added to DISTANCE and returned. */ + case IX86_BUILTIN_BZHI32: + case IX86_BUILTIN_BZHI64: + gcc_assert (n_args == 2); + if (tree_fits_uhwi_p (args[1])) + { + unsigned int idx = tree_to_uhwi (args[1]) & 0xff; + if (idx >= TYPE_PRECISION (TREE_TYPE (args[0]))) + return args[0]; + if (idx == 0) + return build_int_cst (TREE_TYPE (TREE_TYPE (fndecl)), 0); + if (!tree_fits_uhwi_p (args[0])) + break; + unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]); + res &= ~(HOST_WIDE_INT_M1U << idx); + return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res); + } + break; -static int -distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2, - rtx_insn *insn, int distance, - rtx_insn *start, bool *found) -{ - basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL; - rtx_insn *prev = start; - rtx_insn *next = NULL; + case IX86_BUILTIN_PDEP32: + case IX86_BUILTIN_PDEP64: + gcc_assert (n_args == 2); + if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1])) + { + unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]); + unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]); + unsigned HOST_WIDE_INT res = 0; + unsigned HOST_WIDE_INT m, k = 1; + for (m = 1; m; m <<= 1) + if ((mask & m) != 0) + { + if ((src & k) != 0) + res |= m; + k <<= 1; + } + return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res); + } + break; - *found = false; + case IX86_BUILTIN_PEXT32: + case IX86_BUILTIN_PEXT64: + gcc_assert (n_args == 2); + if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1])) + { + unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]); + unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]); + unsigned HOST_WIDE_INT res = 0; + unsigned HOST_WIDE_INT m, k = 1; + for (m = 1; m; m <<= 1) + if ((mask & m) != 0) + { + if ((src & m) != 0) + res |= k; + k <<= 1; + } + return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res); + } + break; - while (prev - && prev != insn - && distance < LEA_SEARCH_THRESHOLD) - { - if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev)) - { - distance = increase_distance (prev, next, distance); - if (insn_defines_reg (regno1, regno2, prev)) + case IX86_BUILTIN_MOVMSKPS: + case IX86_BUILTIN_PMOVMSKB: + case IX86_BUILTIN_MOVMSKPD: + case IX86_BUILTIN_PMOVMSKB128: + case IX86_BUILTIN_MOVMSKPD256: + case IX86_BUILTIN_MOVMSKPS256: + case IX86_BUILTIN_PMOVMSKB256: + gcc_assert (n_args == 1); + if (TREE_CODE (args[0]) == VECTOR_CST) { - if (recog_memoized (prev) < 0 - || get_attr_type (prev) != TYPE_LEA) + HOST_WIDE_INT res = 0; + for (unsigned i = 0; i < VECTOR_CST_NELTS (args[0]); ++i) { - *found = true; - return distance; + tree e = VECTOR_CST_ELT (args[0], i); + if (TREE_CODE (e) == INTEGER_CST && !TREE_OVERFLOW (e)) + { + if (wi::neg_p (wi::to_wide (e))) + res |= HOST_WIDE_INT_1 << i; + } + 
else if (TREE_CODE (e) == REAL_CST && !TREE_OVERFLOW (e)) + { + if (TREE_REAL_CST (e).sign) + res |= HOST_WIDE_INT_1 << i; + } + else + return NULL_TREE; } + return build_int_cst (TREE_TYPE (TREE_TYPE (fndecl)), res); } + break; - next = prev; - } - if (prev == BB_HEAD (bb)) - break; - - prev = PREV_INSN (prev); - } - - return distance; -} - -/* Search backward for non-agu definition of register number REGNO1 - or register number REGNO2 in INSN's basic block until - 1. Pass LEA_SEARCH_THRESHOLD instructions, or - 2. Reach neighbor BBs boundary, or - 3. Reach agu definition. - Returns the distance between the non-agu definition point and INSN. - If no definition point, returns -1. */ - -static int -distance_non_agu_define (unsigned int regno1, unsigned int regno2, - rtx_insn *insn) -{ - basic_block bb = BLOCK_FOR_INSN (insn); - int distance = 0; - bool found = false; - - if (insn != BB_HEAD (bb)) - distance = distance_non_agu_define_in_bb (regno1, regno2, insn, - distance, PREV_INSN (insn), - &found); - - if (!found && distance < LEA_SEARCH_THRESHOLD) - { - edge e; - edge_iterator ei; - bool simple_loop = false; - - FOR_EACH_EDGE (e, ei, bb->preds) - if (e->src == bb) - { - simple_loop = true; - break; - } - - if (simple_loop) - distance = distance_non_agu_define_in_bb (regno1, regno2, - insn, distance, - BB_END (bb), &found); - else - { - int shortest_dist = -1; - bool found_in_bb = false; - - FOR_EACH_EDGE (e, ei, bb->preds) - { - int bb_dist - = distance_non_agu_define_in_bb (regno1, regno2, - insn, distance, - BB_END (e->src), - &found_in_bb); - if (found_in_bb) - { - if (shortest_dist < 0) - shortest_dist = bb_dist; - else if (bb_dist > 0) - shortest_dist = MIN (bb_dist, shortest_dist); - - found = true; - } - } - - distance = shortest_dist; - } - } - - /* get_attr_type may modify recog data. We want to make sure - that recog data is valid for instruction INSN, on which - distance_non_agu_define is called. INSN is unchanged here. */ - extract_insn_cached (insn); - - if (!found) - return -1; - - return distance >> 1; -} - -/* Return the distance in half-cycles between INSN and the next - insn that uses register number REGNO in memory address added - to DISTANCE. Return -1 if REGNO0 is set. - - Put true value into *FOUND if register usage was found and - false otherwise. - Put true value into *REDEFINED if register redefinition was - found and false otherwise. */ - -static int -distance_agu_use_in_bb (unsigned int regno, - rtx_insn *insn, int distance, rtx_insn *start, - bool *found, bool *redefined) -{ - basic_block bb = NULL; - rtx_insn *next = start; - rtx_insn *prev = NULL; - - *found = false; - *redefined = false; - - if (start != NULL_RTX) - { - bb = BLOCK_FOR_INSN (start); - if (start != BB_HEAD (bb)) - /* If insn and start belong to the same bb, set prev to insn, - so the call to increase_distance will increase the distance - between insns by 1. 
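The PDEP/PEXT folds a little further up are simply the following loops run at compile time: pdep scatters the low bits of SRC into the positions selected by MASK, pext gathers the selected bits of SRC into the low bits of the result. The stand-alone versions below mirror those loops one for one; the function names are illustrative.

/* model_pdep (0x3, 0x11) == 0x11,  model_pext (0xff, 0x0f) == 0x0f.  */
static unsigned long long
model_pdep (unsigned long long src, unsigned long long mask)
{
  unsigned long long res = 0, k = 1, m;
  for (m = 1; m; m <<= 1)
    if (mask & m)
      {
        if (src & k)
          res |= m;
        k <<= 1;
      }
  return res;
}

static unsigned long long
model_pext (unsigned long long src, unsigned long long mask)
{
  unsigned long long res = 0, k = 1, m;
  for (m = 1; m; m <<= 1)
    if (mask & m)
      {
        if (src & m)
          res |= k;
        k <<= 1;
      }
  return res;
}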
*/ - prev = insn; - } + case IX86_BUILTIN_PSLLD: + case IX86_BUILTIN_PSLLD128: + case IX86_BUILTIN_PSLLD128_MASK: + case IX86_BUILTIN_PSLLD256: + case IX86_BUILTIN_PSLLD256_MASK: + case IX86_BUILTIN_PSLLD512: + case IX86_BUILTIN_PSLLDI: + case IX86_BUILTIN_PSLLDI128: + case IX86_BUILTIN_PSLLDI128_MASK: + case IX86_BUILTIN_PSLLDI256: + case IX86_BUILTIN_PSLLDI256_MASK: + case IX86_BUILTIN_PSLLDI512: + case IX86_BUILTIN_PSLLQ: + case IX86_BUILTIN_PSLLQ128: + case IX86_BUILTIN_PSLLQ128_MASK: + case IX86_BUILTIN_PSLLQ256: + case IX86_BUILTIN_PSLLQ256_MASK: + case IX86_BUILTIN_PSLLQ512: + case IX86_BUILTIN_PSLLQI: + case IX86_BUILTIN_PSLLQI128: + case IX86_BUILTIN_PSLLQI128_MASK: + case IX86_BUILTIN_PSLLQI256: + case IX86_BUILTIN_PSLLQI256_MASK: + case IX86_BUILTIN_PSLLQI512: + case IX86_BUILTIN_PSLLW: + case IX86_BUILTIN_PSLLW128: + case IX86_BUILTIN_PSLLW128_MASK: + case IX86_BUILTIN_PSLLW256: + case IX86_BUILTIN_PSLLW256_MASK: + case IX86_BUILTIN_PSLLW512_MASK: + case IX86_BUILTIN_PSLLWI: + case IX86_BUILTIN_PSLLWI128: + case IX86_BUILTIN_PSLLWI128_MASK: + case IX86_BUILTIN_PSLLWI256: + case IX86_BUILTIN_PSLLWI256_MASK: + case IX86_BUILTIN_PSLLWI512_MASK: + rcode = ASHIFT; + is_vshift = false; + goto do_shift; + case IX86_BUILTIN_PSRAD: + case IX86_BUILTIN_PSRAD128: + case IX86_BUILTIN_PSRAD128_MASK: + case IX86_BUILTIN_PSRAD256: + case IX86_BUILTIN_PSRAD256_MASK: + case IX86_BUILTIN_PSRAD512: + case IX86_BUILTIN_PSRADI: + case IX86_BUILTIN_PSRADI128: + case IX86_BUILTIN_PSRADI128_MASK: + case IX86_BUILTIN_PSRADI256: + case IX86_BUILTIN_PSRADI256_MASK: + case IX86_BUILTIN_PSRADI512: + case IX86_BUILTIN_PSRAQ128_MASK: + case IX86_BUILTIN_PSRAQ256_MASK: + case IX86_BUILTIN_PSRAQ512: + case IX86_BUILTIN_PSRAQI128_MASK: + case IX86_BUILTIN_PSRAQI256_MASK: + case IX86_BUILTIN_PSRAQI512: + case IX86_BUILTIN_PSRAW: + case IX86_BUILTIN_PSRAW128: + case IX86_BUILTIN_PSRAW128_MASK: + case IX86_BUILTIN_PSRAW256: + case IX86_BUILTIN_PSRAW256_MASK: + case IX86_BUILTIN_PSRAW512: + case IX86_BUILTIN_PSRAWI: + case IX86_BUILTIN_PSRAWI128: + case IX86_BUILTIN_PSRAWI128_MASK: + case IX86_BUILTIN_PSRAWI256: + case IX86_BUILTIN_PSRAWI256_MASK: + case IX86_BUILTIN_PSRAWI512: + rcode = ASHIFTRT; + is_vshift = false; + goto do_shift; + case IX86_BUILTIN_PSRLD: + case IX86_BUILTIN_PSRLD128: + case IX86_BUILTIN_PSRLD128_MASK: + case IX86_BUILTIN_PSRLD256: + case IX86_BUILTIN_PSRLD256_MASK: + case IX86_BUILTIN_PSRLD512: + case IX86_BUILTIN_PSRLDI: + case IX86_BUILTIN_PSRLDI128: + case IX86_BUILTIN_PSRLDI128_MASK: + case IX86_BUILTIN_PSRLDI256: + case IX86_BUILTIN_PSRLDI256_MASK: + case IX86_BUILTIN_PSRLDI512: + case IX86_BUILTIN_PSRLQ: + case IX86_BUILTIN_PSRLQ128: + case IX86_BUILTIN_PSRLQ128_MASK: + case IX86_BUILTIN_PSRLQ256: + case IX86_BUILTIN_PSRLQ256_MASK: + case IX86_BUILTIN_PSRLQ512: + case IX86_BUILTIN_PSRLQI: + case IX86_BUILTIN_PSRLQI128: + case IX86_BUILTIN_PSRLQI128_MASK: + case IX86_BUILTIN_PSRLQI256: + case IX86_BUILTIN_PSRLQI256_MASK: + case IX86_BUILTIN_PSRLQI512: + case IX86_BUILTIN_PSRLW: + case IX86_BUILTIN_PSRLW128: + case IX86_BUILTIN_PSRLW128_MASK: + case IX86_BUILTIN_PSRLW256: + case IX86_BUILTIN_PSRLW256_MASK: + case IX86_BUILTIN_PSRLW512: + case IX86_BUILTIN_PSRLWI: + case IX86_BUILTIN_PSRLWI128: + case IX86_BUILTIN_PSRLWI128_MASK: + case IX86_BUILTIN_PSRLWI256: + case IX86_BUILTIN_PSRLWI256_MASK: + case IX86_BUILTIN_PSRLWI512: + rcode = LSHIFTRT; + is_vshift = false; + goto do_shift; + case IX86_BUILTIN_PSLLVV16HI: + case IX86_BUILTIN_PSLLVV16SI: + case IX86_BUILTIN_PSLLVV2DI: + case 
IX86_BUILTIN_PSLLVV2DI_MASK: + case IX86_BUILTIN_PSLLVV32HI: + case IX86_BUILTIN_PSLLVV4DI: + case IX86_BUILTIN_PSLLVV4DI_MASK: + case IX86_BUILTIN_PSLLVV4SI: + case IX86_BUILTIN_PSLLVV4SI_MASK: + case IX86_BUILTIN_PSLLVV8DI: + case IX86_BUILTIN_PSLLVV8HI: + case IX86_BUILTIN_PSLLVV8SI: + case IX86_BUILTIN_PSLLVV8SI_MASK: + rcode = ASHIFT; + is_vshift = true; + goto do_shift; + case IX86_BUILTIN_PSRAVQ128: + case IX86_BUILTIN_PSRAVQ256: + case IX86_BUILTIN_PSRAVV16HI: + case IX86_BUILTIN_PSRAVV16SI: + case IX86_BUILTIN_PSRAVV32HI: + case IX86_BUILTIN_PSRAVV4SI: + case IX86_BUILTIN_PSRAVV4SI_MASK: + case IX86_BUILTIN_PSRAVV8DI: + case IX86_BUILTIN_PSRAVV8HI: + case IX86_BUILTIN_PSRAVV8SI: + case IX86_BUILTIN_PSRAVV8SI_MASK: + rcode = ASHIFTRT; + is_vshift = true; + goto do_shift; + case IX86_BUILTIN_PSRLVV16HI: + case IX86_BUILTIN_PSRLVV16SI: + case IX86_BUILTIN_PSRLVV2DI: + case IX86_BUILTIN_PSRLVV2DI_MASK: + case IX86_BUILTIN_PSRLVV32HI: + case IX86_BUILTIN_PSRLVV4DI: + case IX86_BUILTIN_PSRLVV4DI_MASK: + case IX86_BUILTIN_PSRLVV4SI: + case IX86_BUILTIN_PSRLVV4SI_MASK: + case IX86_BUILTIN_PSRLVV8DI: + case IX86_BUILTIN_PSRLVV8HI: + case IX86_BUILTIN_PSRLVV8SI: + case IX86_BUILTIN_PSRLVV8SI_MASK: + rcode = LSHIFTRT; + is_vshift = true; + goto do_shift; - while (next - && next != insn - && distance < LEA_SEARCH_THRESHOLD) - { - if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next)) - { - distance = increase_distance(prev, next, distance); - if (insn_uses_reg_mem (regno, next)) + do_shift: + gcc_assert (n_args >= 2); + if (TREE_CODE (args[0]) != VECTOR_CST) + break; + mask = HOST_WIDE_INT_M1U; + if (n_args > 2) { - /* Return DISTANCE if OP0 is used in memory - address in NEXT. */ - *found = true; - return distance; + /* This is masked shift. */ + if (!tree_fits_uhwi_p (args[n_args - 1]) + || TREE_SIDE_EFFECTS (args[n_args - 2])) + break; + mask = tree_to_uhwi (args[n_args - 1]); + unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0])); + mask |= HOST_WIDE_INT_M1U << elems; + if (mask != HOST_WIDE_INT_M1U + && TREE_CODE (args[n_args - 2]) != VECTOR_CST) + break; + if (mask == (HOST_WIDE_INT_M1U << elems)) + return args[n_args - 2]; } - - if (insn_defines_reg (regno, INVALID_REGNUM, next)) + if (is_vshift && TREE_CODE (args[1]) != VECTOR_CST) + break; + if (tree tem = (is_vshift ? integer_one_node + : ix86_vector_shift_count (args[1]))) { - /* Return -1 if OP0 is set in NEXT. */ - *redefined = true; - return -1; - } - - prev = next; - } - - if (next == BB_END (bb)) - break; - - next = NEXT_INSN (next); - } - - return distance; -} - -/* Return the distance between INSN and the next insn that uses - register number REGNO0 in memory address. Return -1 if no such - a use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. 
*/ - -static int -distance_agu_use (unsigned int regno0, rtx_insn *insn) -{ - basic_block bb = BLOCK_FOR_INSN (insn); - int distance = 0; - bool found = false; - bool redefined = false; - - if (insn != BB_END (bb)) - distance = distance_agu_use_in_bb (regno0, insn, distance, - NEXT_INSN (insn), - &found, &redefined); - - if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD) - { - edge e; - edge_iterator ei; - bool simple_loop = false; - - FOR_EACH_EDGE (e, ei, bb->succs) - if (e->dest == bb) - { - simple_loop = true; - break; - } - - if (simple_loop) - distance = distance_agu_use_in_bb (regno0, insn, - distance, BB_HEAD (bb), - &found, &redefined); - else - { - int shortest_dist = -1; - bool found_in_bb = false; - bool redefined_in_bb = false; - - FOR_EACH_EDGE (e, ei, bb->succs) - { - int bb_dist - = distance_agu_use_in_bb (regno0, insn, - distance, BB_HEAD (e->dest), - &found_in_bb, &redefined_in_bb); - if (found_in_bb) + unsigned HOST_WIDE_INT count = tree_to_uhwi (tem); + unsigned HOST_WIDE_INT prec + = TYPE_PRECISION (TREE_TYPE (TREE_TYPE (args[0]))); + if (count == 0 && mask == HOST_WIDE_INT_M1U) + return args[0]; + if (count >= prec) { - if (shortest_dist < 0) - shortest_dist = bb_dist; - else if (bb_dist > 0) - shortest_dist = MIN (bb_dist, shortest_dist); - - found = true; + if (rcode == ASHIFTRT) + count = prec - 1; + else if (mask == HOST_WIDE_INT_M1U) + return build_zero_cst (TREE_TYPE (args[0])); + } + tree countt = NULL_TREE; + if (!is_vshift) + { + if (count >= prec) + countt = integer_zero_node; + else + countt = build_int_cst (integer_type_node, count); + } + tree_vector_builder builder; + builder.new_unary_operation (TREE_TYPE (args[0]), args[0], + false); + unsigned int cnt = builder.encoded_nelts (); + for (unsigned int i = 0; i < cnt; ++i) + { + tree elt = VECTOR_CST_ELT (args[0], i); + if (TREE_CODE (elt) != INTEGER_CST || TREE_OVERFLOW (elt)) + return NULL_TREE; + tree type = TREE_TYPE (elt); + if (rcode == LSHIFTRT) + elt = fold_convert (unsigned_type_for (type), elt); + if (is_vshift) + { + countt = VECTOR_CST_ELT (args[1], i); + if (TREE_CODE (countt) != INTEGER_CST + || TREE_OVERFLOW (countt)) + return NULL_TREE; + if (wi::neg_p (wi::to_wide (countt)) + || wi::to_widest (countt) >= prec) + { + if (rcode == ASHIFTRT) + countt = build_int_cst (TREE_TYPE (countt), + prec - 1); + else + { + elt = build_zero_cst (TREE_TYPE (elt)); + countt = build_zero_cst (TREE_TYPE (countt)); + } + } + } + else if (count >= prec) + elt = build_zero_cst (TREE_TYPE (elt)); + elt = const_binop (rcode == ASHIFT + ? LSHIFT_EXPR : RSHIFT_EXPR, + TREE_TYPE (elt), elt, countt); + if (!elt || TREE_CODE (elt) != INTEGER_CST) + return NULL_TREE; + if (rcode == LSHIFTRT) + elt = fold_convert (type, elt); + if ((mask & (HOST_WIDE_INT_1U << i)) == 0) + { + elt = VECTOR_CST_ELT (args[n_args - 2], i); + if (TREE_CODE (elt) != INTEGER_CST + || TREE_OVERFLOW (elt)) + return NULL_TREE; + } + builder.quick_push (elt); } + return builder.build (); } + break; - distance = shortest_dist; - } - } - - if (!found || redefined) - return -1; - - return distance >> 1; -} - -/* Define this macro to tune LEA priority vs ADD, it take effect when - there is a dilemma of choicing LEA or ADD - Negative value: ADD is more preferred than LEA - Zero: Netrual - Positive value: LEA is more preferred than ADD*/ -#define IX86_LEA_PRIORITY 0 - -/* Return true if usage of lea INSN has performance advantage - over a sequence of instructions. Instructions sequence has - SPLIT_COST cycles higher latency than lea latency. 
*/ - -static bool -ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1, - unsigned int regno2, int split_cost, bool has_scale) -{ - int dist_define, dist_use; - - /* For Silvermont if using a 2-source or 3-source LEA for - non-destructive destination purposes, or due to wanting - ability to use SCALE, the use of LEA is justified. */ - if (TARGET_SILVERMONT || TARGET_GOLDMONT || TARGET_GOLDMONT_PLUS - || TARGET_TREMONT || TARGET_INTEL) - { - if (has_scale) - return true; - if (split_cost < 1) - return false; - if (regno0 == regno1 || regno0 == regno2) - return false; - return true; - } - - dist_define = distance_non_agu_define (regno1, regno2, insn); - dist_use = distance_agu_use (regno0, insn); - - if (dist_define < 0 || dist_define >= LEA_MAX_STALL) - { - /* If there is no non AGU operand definition, no AGU - operand usage and split cost is 0 then both lea - and non lea variants have same priority. Currently - we prefer lea for 64 bit code and non lea on 32 bit - code. */ - if (dist_use < 0 && split_cost == 0) - return TARGET_64BIT || IX86_LEA_PRIORITY; - else - return true; - } - - /* With longer definitions distance lea is more preferable. - Here we change it to take into account splitting cost and - lea priority. */ - dist_define += split_cost + IX86_LEA_PRIORITY; - - /* If there is no use in memory addess then we just check - that split cost exceeds AGU stall. */ - if (dist_use < 0) - return dist_define > LEA_MAX_STALL; - - /* If this insn has both backward non-agu dependence and forward - agu dependence, the one with short distance takes effect. */ - return dist_define >= dist_use; -} - -/* Return true if it is legal to clobber flags by INSN and - false otherwise. */ - -static bool -ix86_ok_to_clobber_flags (rtx_insn *insn) -{ - basic_block bb = BLOCK_FOR_INSN (insn); - df_ref use; - bitmap live; - - while (insn) - { - if (NONDEBUG_INSN_P (insn)) - { - FOR_EACH_INSN_USE (use, insn) - if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG) - return false; - - if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn)) - return true; - } - - if (insn == BB_END (bb)) - break; - - insn = NEXT_INSN (insn); - } - - live = df_get_live_out(bb); - return !REGNO_REG_SET_P (live, FLAGS_REG); -} - -/* Return true if we need to split op0 = op1 + op2 into a sequence of - move and add to avoid AGU stalls. */ - -bool -ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[]) -{ - unsigned int regno0, regno1, regno2; - - /* Check if we need to optimize. */ - if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) - return false; - - /* Check it is correct to split here. */ - if (!ix86_ok_to_clobber_flags(insn)) - return false; - - regno0 = true_regnum (operands[0]); - regno1 = true_regnum (operands[1]); - regno2 = true_regnum (operands[2]); - - /* We need to split only adds with non destructive - destination operand. */ - if (regno0 == regno1 || regno0 == regno2) - return false; - else - return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false); -} - -/* Return true if we should emit lea instruction instead of mov - instruction. */ - -bool -ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[]) -{ - unsigned int regno0, regno1; - - /* Check if we need to optimize. */ - if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) - return false; - - /* Use lea for reg to reg moves only. 
*/ - if (!REG_P (operands[0]) || !REG_P (operands[1])) - return false; - - regno0 = true_regnum (operands[0]); - regno1 = true_regnum (operands[1]); - - return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false); -} - -/* Return true if we need to split lea into a sequence of - instructions to avoid AGU stalls. */ - -bool -ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[]) -{ - unsigned int regno0, regno1, regno2; - int split_cost; - struct ix86_address parts; - int ok; - - /* Check we need to optimize. */ - if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun)) - return false; - - /* The "at least two components" test below might not catch simple - move or zero extension insns if parts.base is non-NULL and parts.disp - is const0_rtx as the only components in the address, e.g. if the - register is %rbp or %r13. As this test is much cheaper and moves or - zero extensions are the common case, do this check first. */ - if (REG_P (operands[1]) - || (SImode_address_operand (operands[1], VOIDmode) - && REG_P (XEXP (operands[1], 0)))) - return false; - - /* Check if it is OK to split here. */ - if (!ix86_ok_to_clobber_flags (insn)) - return false; - - ok = ix86_decompose_address (operands[1], &parts); - gcc_assert (ok); - - /* There should be at least two components in the address. */ - if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX) - + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2) - return false; - - /* We should not split into add if non legitimate pic - operand is used as displacement. */ - if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp)) - return false; - - regno0 = true_regnum (operands[0]) ; - regno1 = INVALID_REGNUM; - regno2 = INVALID_REGNUM; - - if (parts.base) - regno1 = true_regnum (parts.base); - if (parts.index) - regno2 = true_regnum (parts.index); - - split_cost = 0; - - /* Compute how many cycles we will add to execution time - if split lea into a sequence of instructions. */ - if (parts.base || parts.index) - { - /* Have to use mov instruction if non desctructive - destination form is used. */ - if (regno1 != regno0 && regno2 != regno0) - split_cost += 1; - - /* Have to add index to base if both exist. */ - if (parts.base && parts.index) - split_cost += 1; - - /* Have to use shift and adds if scale is 2 or greater. */ - if (parts.scale > 1) - { - if (regno0 != regno1) - split_cost += 1; - else if (regno2 == regno0) - split_cost += 4; - else - split_cost += parts.scale; - } - - /* Have to use add instruction with immediate if - disp is non zero. */ - if (parts.disp && parts.disp != const0_rtx) - split_cost += 1; - - /* Subtract the price of lea. */ - split_cost -= 1; - } - - return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost, - parts.scale > 1); -} - -/* Emit x86 binary operand CODE in mode MODE, where the first operand - matches destination. RTX includes clobber of FLAGS_REG. */ - -static void -ix86_emit_binop (enum rtx_code code, machine_mode mode, - rtx dst, rtx src) -{ - rtx op, clob; - - op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src)); - clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - - emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); -} - -/* Return true if regno1 def is nearest to the insn. 
*/ - -static bool -find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2) -{ - rtx_insn *prev = insn; - rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn)); - - if (insn == start) - return false; - while (prev && prev != start) - { - if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev)) - { - prev = PREV_INSN (prev); - continue; - } - if (insn_defines_reg (regno1, INVALID_REGNUM, prev)) - return true; - else if (insn_defines_reg (regno2, INVALID_REGNUM, prev)) - return false; - prev = PREV_INSN (prev); - } - - /* None of the regs is defined in the bb. */ - return false; -} - -/* Split lea instructions into a sequence of instructions - which are executed on ALU to avoid AGU stalls. - It is assumed that it is allowed to clobber flags register - at lea position. */ - -void -ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode) -{ - unsigned int regno0, regno1, regno2; - struct ix86_address parts; - rtx target, tmp; - int ok, adds; - - ok = ix86_decompose_address (operands[1], &parts); - gcc_assert (ok); - - target = gen_lowpart (mode, operands[0]); - - regno0 = true_regnum (target); - regno1 = INVALID_REGNUM; - regno2 = INVALID_REGNUM; - - if (parts.base) - { - parts.base = gen_lowpart (mode, parts.base); - regno1 = true_regnum (parts.base); - } - - if (parts.index) - { - parts.index = gen_lowpart (mode, parts.index); - regno2 = true_regnum (parts.index); - } - - if (parts.disp) - parts.disp = gen_lowpart (mode, parts.disp); - - if (parts.scale > 1) - { - /* Case r1 = r1 + ... */ - if (regno1 == regno0) - { - /* If we have a case r1 = r1 + C * r2 then we - should use multiplication which is very - expensive. Assume cost model is wrong if we - have such case here. */ - gcc_assert (regno2 != regno0); - - for (adds = parts.scale; adds > 0; adds--) - ix86_emit_binop (PLUS, mode, target, parts.index); - } - else - { - /* r1 = r2 + r3 * C case. Need to move r3 into r1. */ - if (regno0 != regno2) - emit_insn (gen_rtx_SET (target, parts.index)); - - /* Use shift for scaling. */ - ix86_emit_binop (ASHIFT, mode, target, - GEN_INT (exact_log2 (parts.scale))); - - if (parts.base) - ix86_emit_binop (PLUS, mode, target, parts.base); - - if (parts.disp && parts.disp != const0_rtx) - ix86_emit_binop (PLUS, mode, target, parts.disp); - } - } - else if (!parts.base && !parts.index) - { - gcc_assert(parts.disp); - emit_insn (gen_rtx_SET (target, parts.disp)); - } - else - { - if (!parts.base) - { - if (regno0 != regno2) - emit_insn (gen_rtx_SET (target, parts.index)); - } - else if (!parts.index) - { - if (regno0 != regno1) - emit_insn (gen_rtx_SET (target, parts.base)); - } - else - { - if (regno0 == regno1) - tmp = parts.index; - else if (regno0 == regno2) - tmp = parts.base; - else - { - rtx tmp1; - - /* Find better operand for SET instruction, depending - on which definition is farther from the insn. */ - if (find_nearest_reg_def (insn, regno1, regno2)) - tmp = parts.index, tmp1 = parts.base; - else - tmp = parts.base, tmp1 = parts.index; - - emit_insn (gen_rtx_SET (target, tmp)); - - if (parts.disp && parts.disp != const0_rtx) - ix86_emit_binop (PLUS, mode, target, parts.disp); - - ix86_emit_binop (PLUS, mode, target, tmp1); - return; - } - - ix86_emit_binop (PLUS, mode, target, tmp); - } - - if (parts.disp && parts.disp != const0_rtx) - ix86_emit_binop (PLUS, mode, target, parts.disp); - } -} - -/* Return true if it is ok to optimize an ADD operation to LEA - operation to avoid flag register consumation. For most processors, - ADD is faster than LEA. 
For the processors like BONNELL, if the - destination register of LEA holds an actual address which will be - used soon, LEA is better and otherwise ADD is better. */ - -bool -ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[]) -{ - unsigned int regno0 = true_regnum (operands[0]); - unsigned int regno1 = true_regnum (operands[1]); - unsigned int regno2 = true_regnum (operands[2]); - - /* If a = b + c, (a!=b && a!=c), must use lea form. */ - if (regno0 != regno1 && regno0 != regno2) - return true; - - if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) - return false; - - return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false); -} - -/* Return true if destination reg of SET_BODY is shift count of - USE_BODY. */ - -static bool -ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body) -{ - rtx set_dest; - rtx shift_rtx; - int i; - - /* Retrieve destination of SET_BODY. */ - switch (GET_CODE (set_body)) - { - case SET: - set_dest = SET_DEST (set_body); - if (!set_dest || !REG_P (set_dest)) - return false; - break; - case PARALLEL: - for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--) - if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i), - use_body)) - return true; - /* FALLTHROUGH */ - default: - return false; - } - - /* Retrieve shift count of USE_BODY. */ - switch (GET_CODE (use_body)) - { - case SET: - shift_rtx = XEXP (use_body, 1); - break; - case PARALLEL: - for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--) - if (ix86_dep_by_shift_count_body (set_body, - XVECEXP (use_body, 0, i))) - return true; - /* FALLTHROUGH */ - default: - return false; - } - - if (shift_rtx - && (GET_CODE (shift_rtx) == ASHIFT - || GET_CODE (shift_rtx) == LSHIFTRT - || GET_CODE (shift_rtx) == ASHIFTRT - || GET_CODE (shift_rtx) == ROTATE - || GET_CODE (shift_rtx) == ROTATERT)) - { - rtx shift_count = XEXP (shift_rtx, 1); - - /* Return true if shift count is dest of SET_BODY. */ - if (REG_P (shift_count)) - { - /* Add check since it can be invoked before register - allocation in pre-reload schedule. */ - if (reload_completed - && true_regnum (set_dest) == true_regnum (shift_count)) - return true; - else if (REGNO(set_dest) == REGNO(shift_count)) - return true; - } - } - - return false; -} - -/* Return true if destination reg of SET_INSN is shift count of - USE_INSN. */ - -bool -ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn) -{ - return ix86_dep_by_shift_count_body (PATTERN (set_insn), - PATTERN (use_insn)); -} - -/* Return TRUE or FALSE depending on whether the unary operator meets the - appropriate constraints. */ - -bool -ix86_unary_operator_ok (enum rtx_code, - machine_mode, - rtx operands[2]) -{ - /* If one of operands is memory, source and destination must match. */ - if ((MEM_P (operands[0]) - || MEM_P (operands[1])) - && ! rtx_equal_p (operands[0], operands[1])) - return false; - return true; -} - -/* Return TRUE if the operands to a vec_interleave_{high,low}v2df - are ok, keeping in mind the possible movddup alternative. */ - -bool -ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high) -{ - if (MEM_P (operands[0])) - return rtx_equal_p (operands[0], operands[1 + high]); - if (MEM_P (operands[1]) && MEM_P (operands[2])) - return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]); - return true; -} - -/* Post-reload splitter for converting an SF or DFmode value in an - SSE register into an unsigned SImode. 
*/ - -void -ix86_split_convert_uns_si_sse (rtx operands[]) -{ - machine_mode vecmode; - rtx value, large, zero_or_two31, input, two31, x; - - large = operands[1]; - zero_or_two31 = operands[2]; - input = operands[3]; - two31 = operands[4]; - vecmode = GET_MODE (large); - value = gen_rtx_REG (vecmode, REGNO (operands[0])); - - /* Load up the value into the low element. We must ensure that the other - elements are valid floats -- zero is the easiest such value. */ - if (MEM_P (input)) - { - if (vecmode == V4SFmode) - emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input)); - else - emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input)); - } - else - { - input = gen_rtx_REG (vecmode, REGNO (input)); - emit_move_insn (value, CONST0_RTX (vecmode)); - if (vecmode == V4SFmode) - emit_insn (gen_sse_movss (value, value, input)); - else - emit_insn (gen_sse2_movsd (value, value, input)); - } - - emit_move_insn (large, two31); - emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31); - - x = gen_rtx_fmt_ee (LE, vecmode, large, value); - emit_insn (gen_rtx_SET (large, x)); - - x = gen_rtx_AND (vecmode, zero_or_two31, large); - emit_insn (gen_rtx_SET (zero_or_two31, x)); - - x = gen_rtx_MINUS (vecmode, value, zero_or_two31); - emit_insn (gen_rtx_SET (value, x)); - - large = gen_rtx_REG (V4SImode, REGNO (large)); - emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31))); - - x = gen_rtx_REG (V4SImode, REGNO (value)); - if (vecmode == V4SFmode) - emit_insn (gen_fix_truncv4sfv4si2 (x, value)); - else - emit_insn (gen_sse2_cvttpd2dq (x, value)); - value = x; - - emit_insn (gen_xorv4si3 (value, value, large)); -} - -/* Convert an unsigned DImode value into a DFmode, using only SSE. - Expects the 64-bit DImode to be supplied in a pair of integral - registers. Requires SSE2; will use SSE3 if available. For x86_32, - -mfpmath=sse, !optimize_size only. */ - -void -ix86_expand_convert_uns_didf_sse (rtx target, rtx input) -{ - REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt; - rtx int_xmm, fp_xmm; - rtx biases, exponents; - rtx x; - - int_xmm = gen_reg_rtx (V4SImode); - if (TARGET_INTER_UNIT_MOVES_TO_VEC) - emit_insn (gen_movdi_to_sse (int_xmm, input)); - else if (TARGET_SSE_SPLIT_REGS) - { - emit_clobber (int_xmm); - emit_move_insn (gen_lowpart (DImode, int_xmm), input); - } - else - { - x = gen_reg_rtx (V2DImode); - ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0); - emit_move_insn (int_xmm, gen_lowpart (V4SImode, x)); - } - - x = gen_rtx_CONST_VECTOR (V4SImode, - gen_rtvec (4, GEN_INT (0x43300000UL), - GEN_INT (0x45300000UL), - const0_rtx, const0_rtx)); - exponents = validize_mem (force_const_mem (V4SImode, x)); - - /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */ - emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents)); - - /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm) - yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)). - Similarly (0x45300000UL ## fp_value_hi_xmm) yields - (0x1.0p84 + double(fp_value_hi_xmm)). - Note these exponents differ by 32. */ - - fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm)); - - /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values - in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. 
*/ - real_ldexp (&bias_lo_rvt, &dconst1, 52); - real_ldexp (&bias_hi_rvt, &dconst1, 84); - biases = const_double_from_real_value (bias_lo_rvt, DFmode); - x = const_double_from_real_value (bias_hi_rvt, DFmode); - biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x)); - biases = validize_mem (force_const_mem (V2DFmode, biases)); - emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases)); - - /* Add the upper and lower DFmode values together. */ - if (TARGET_SSE3) - emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm)); - else - { - x = copy_to_mode_reg (V2DFmode, fp_xmm); - emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm)); - emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x)); - } - - ix86_expand_vector_extract (false, target, fp_xmm, 0); -} - -/* Not used, but eases macroization of patterns. */ -void -ix86_expand_convert_uns_sixf_sse (rtx, rtx) -{ - gcc_unreachable (); -} - -/* Convert an unsigned SImode value into a DFmode. Only currently used - for SSE, but applicable anywhere. */ - -void -ix86_expand_convert_uns_sidf_sse (rtx target, rtx input) -{ - REAL_VALUE_TYPE TWO31r; - rtx x, fp; - - x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1), - NULL, 1, OPTAB_DIRECT); - - fp = gen_reg_rtx (DFmode); - emit_insn (gen_floatsidf2 (fp, x)); - - real_ldexp (&TWO31r, &dconst1, 31); - x = const_double_from_real_value (TWO31r, DFmode); - - x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT); - if (x != target) - emit_move_insn (target, x); -} - -/* Convert a signed DImode value into a DFmode. Only used for SSE in - 32-bit mode; otherwise we have a direct convert instruction. */ - -void -ix86_expand_convert_sign_didf_sse (rtx target, rtx input) -{ - REAL_VALUE_TYPE TWO32r; - rtx fp_lo, fp_hi, x; - - fp_lo = gen_reg_rtx (DFmode); - fp_hi = gen_reg_rtx (DFmode); - - emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input))); - - real_ldexp (&TWO32r, &dconst1, 32); - x = const_double_from_real_value (TWO32r, DFmode); - fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT); - - ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input)); - - x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target, - 0, OPTAB_DIRECT); - if (x != target) - emit_move_insn (target, x); -} - -/* Convert an unsigned SImode value into a SFmode, using only SSE. - For x86_32, -mfpmath=sse, !optimize_size only. */ -void -ix86_expand_convert_uns_sisf_sse (rtx target, rtx input) -{ - REAL_VALUE_TYPE ONE16r; - rtx fp_hi, fp_lo, int_hi, int_lo, x; - - real_ldexp (&ONE16r, &dconst1, 16); - x = const_double_from_real_value (ONE16r, SFmode); - int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff), - NULL, 0, OPTAB_DIRECT); - int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16), - NULL, 0, OPTAB_DIRECT); - fp_hi = gen_reg_rtx (SFmode); - fp_lo = gen_reg_rtx (SFmode); - emit_insn (gen_floatsisf2 (fp_hi, int_hi)); - emit_insn (gen_floatsisf2 (fp_lo, int_lo)); - fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi, - 0, OPTAB_DIRECT); - fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target, - 0, OPTAB_DIRECT); - if (!rtx_equal_p (target, fp_hi)) - emit_move_insn (target, fp_hi); -} - -/* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert - a vector of unsigned ints VAL to vector of floats TARGET. 
*/ - -void -ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val) -{ - rtx tmp[8]; - REAL_VALUE_TYPE TWO16r; - machine_mode intmode = GET_MODE (val); - machine_mode fltmode = GET_MODE (target); - rtx (*cvt) (rtx, rtx); - - if (intmode == V4SImode) - cvt = gen_floatv4siv4sf2; - else - cvt = gen_floatv8siv8sf2; - tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff)); - tmp[0] = force_reg (intmode, tmp[0]); - tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1, - OPTAB_DIRECT); - tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16), - NULL_RTX, 1, OPTAB_DIRECT); - tmp[3] = gen_reg_rtx (fltmode); - emit_insn (cvt (tmp[3], tmp[1])); - tmp[4] = gen_reg_rtx (fltmode); - emit_insn (cvt (tmp[4], tmp[2])); - real_ldexp (&TWO16r, &dconst1, 16); - tmp[5] = const_double_from_real_value (TWO16r, SFmode); - tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5])); - tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1, - OPTAB_DIRECT); - tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1, - OPTAB_DIRECT); - if (tmp[7] != target) - emit_move_insn (target, tmp[7]); -} - -/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc* - pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*. - This is done by doing just signed conversion if < 0x1p31, and otherwise by - subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */ - -rtx -ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp) -{ - REAL_VALUE_TYPE TWO31r; - rtx two31r, tmp[4]; - machine_mode mode = GET_MODE (val); - machine_mode scalarmode = GET_MODE_INNER (mode); - machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode; - rtx (*cmp) (rtx, rtx, rtx, rtx); - int i; - - for (i = 0; i < 3; i++) - tmp[i] = gen_reg_rtx (mode); - real_ldexp (&TWO31r, &dconst1, 31); - two31r = const_double_from_real_value (TWO31r, scalarmode); - two31r = ix86_build_const_vector (mode, 1, two31r); - two31r = force_reg (mode, two31r); - switch (mode) - { - case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break; - case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break; - case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break; - case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break; - default: gcc_unreachable (); - } - tmp[3] = gen_rtx_LE (mode, two31r, val); - emit_insn (cmp (tmp[0], two31r, val, tmp[3])); - tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1], - 0, OPTAB_DIRECT); - if (intmode == V4SImode || TARGET_AVX2) - *xorp = expand_simple_binop (intmode, ASHIFT, - gen_lowpart (intmode, tmp[0]), - GEN_INT (31), NULL_RTX, 0, - OPTAB_DIRECT); - else - { - rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31); - two31 = ix86_build_const_vector (intmode, 1, two31); - *xorp = expand_simple_binop (intmode, AND, - gen_lowpart (intmode, tmp[0]), - two31, NULL_RTX, 0, - OPTAB_DIRECT); - } - return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2], - 0, OPTAB_DIRECT); -} - -/* A subroutine of ix86_build_signbit_mask. If VECT is true, - then replicate the value for all elements of the vector - register. 
*/ - -rtx -ix86_build_const_vector (machine_mode mode, bool vect, rtx value) -{ - int i, n_elt; - rtvec v; - machine_mode scalar_mode; - - switch (mode) - { - case E_V64QImode: - case E_V32QImode: - case E_V16QImode: - case E_V32HImode: - case E_V16HImode: - case E_V8HImode: - case E_V16SImode: - case E_V8SImode: - case E_V4SImode: - case E_V8DImode: - case E_V4DImode: - case E_V2DImode: - gcc_assert (vect); - /* FALLTHRU */ - case E_V16SFmode: - case E_V8SFmode: - case E_V4SFmode: - case E_V8DFmode: - case E_V4DFmode: - case E_V2DFmode: - n_elt = GET_MODE_NUNITS (mode); - v = rtvec_alloc (n_elt); - scalar_mode = GET_MODE_INNER (mode); - - RTVEC_ELT (v, 0) = value; - - for (i = 1; i < n_elt; ++i) - RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode); - - return gen_rtx_CONST_VECTOR (mode, v); - - default: - gcc_unreachable (); - } -} - -/* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders - and ix86_expand_int_vcond. Create a mask for the sign bit in MODE - for an SSE register. If VECT is true, then replicate the mask for - all elements of the vector register. If INVERT is true, then create - a mask excluding the sign bit. */ - -rtx -ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert) -{ - machine_mode vec_mode, imode; - wide_int w; - rtx mask, v; - - switch (mode) - { - case E_V16SImode: - case E_V16SFmode: - case E_V8SImode: - case E_V4SImode: - case E_V8SFmode: - case E_V4SFmode: - vec_mode = mode; - imode = SImode; - break; - - case E_V8DImode: - case E_V4DImode: - case E_V2DImode: - case E_V8DFmode: - case E_V4DFmode: - case E_V2DFmode: - vec_mode = mode; - imode = DImode; - break; - - case E_TImode: - case E_TFmode: - vec_mode = VOIDmode; - imode = TImode; - break; - - default: - gcc_unreachable (); - } - - machine_mode inner_mode = GET_MODE_INNER (mode); - w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1, - GET_MODE_BITSIZE (inner_mode)); - if (invert) - w = wi::bit_not (w); - - /* Force this value into the low part of a fp vector constant. */ - mask = immed_wide_int_const (w, imode); - mask = gen_lowpart (inner_mode, mask); - - if (vec_mode == VOIDmode) - return force_reg (inner_mode, mask); - - v = ix86_build_const_vector (vec_mode, vect, mask); - return force_reg (vec_mode, v); -} - -/* Generate code for floating point ABS or NEG. */ - -void -ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode, - rtx operands[]) -{ - rtx mask, set, dst, src; - bool use_sse = false; - bool vector_mode = VECTOR_MODE_P (mode); - machine_mode vmode = mode; - - if (vector_mode) - use_sse = true; - else if (mode == TFmode) - use_sse = true; - else if (TARGET_SSE_MATH) - { - use_sse = SSE_FLOAT_MODE_P (mode); - if (mode == SFmode) - vmode = V4SFmode; - else if (mode == DFmode) - vmode = V2DFmode; - } - - /* NEG and ABS performed with SSE use bitwise mask operations. - Create the appropriate mask now. */ - if (use_sse) - mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS); - else - mask = NULL_RTX; - - dst = operands[0]; - src = operands[1]; - - set = gen_rtx_fmt_e (code, mode, src); - set = gen_rtx_SET (dst, set); - - if (mask) - { - rtx use, clob; - rtvec par; - - use = gen_rtx_USE (VOIDmode, mask); - if (vector_mode) - par = gen_rtvec (2, set, use); - else - { - clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - par = gen_rtvec (3, set, use, clob); - } - emit_insn (gen_rtx_PARALLEL (VOIDmode, par)); - } - else - emit_insn (set); -} - -/* Expand a copysign operation. 
Special case operand 0 being a constant. */ - -void -ix86_expand_copysign (rtx operands[]) -{ - machine_mode mode, vmode; - rtx dest, op0, op1, mask, nmask; - - dest = operands[0]; - op0 = operands[1]; - op1 = operands[2]; - - mode = GET_MODE (dest); - - if (mode == SFmode) - vmode = V4SFmode; - else if (mode == DFmode) - vmode = V2DFmode; - else - vmode = mode; - - if (CONST_DOUBLE_P (op0)) - { - rtx (*copysign_insn)(rtx, rtx, rtx, rtx); - - if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0))) - op0 = simplify_unary_operation (ABS, mode, op0, mode); - - if (mode == SFmode || mode == DFmode) - { - if (op0 == CONST0_RTX (mode)) - op0 = CONST0_RTX (vmode); - else - { - rtx v = ix86_build_const_vector (vmode, false, op0); - - op0 = force_reg (vmode, v); - } - } - else if (op0 != CONST0_RTX (mode)) - op0 = force_reg (mode, op0); - - mask = ix86_build_signbit_mask (vmode, 0, 0); - - if (mode == SFmode) - copysign_insn = gen_copysignsf3_const; - else if (mode == DFmode) - copysign_insn = gen_copysigndf3_const; - else - copysign_insn = gen_copysigntf3_const; - - emit_insn (copysign_insn (dest, op0, op1, mask)); - } - else - { - rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx); - - nmask = ix86_build_signbit_mask (vmode, 0, 1); - mask = ix86_build_signbit_mask (vmode, 0, 0); - - if (mode == SFmode) - copysign_insn = gen_copysignsf3_var; - else if (mode == DFmode) - copysign_insn = gen_copysigndf3_var; - else - copysign_insn = gen_copysigntf3_var; - - emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask)); - } -} - -/* Deconstruct a copysign operation into bit masks. Operand 0 is known to - be a constant, and so has already been expanded into a vector constant. */ - -void -ix86_split_copysign_const (rtx operands[]) -{ - machine_mode mode, vmode; - rtx dest, op0, mask, x; - - dest = operands[0]; - op0 = operands[1]; - mask = operands[3]; - - mode = GET_MODE (dest); - vmode = GET_MODE (mask); - - dest = lowpart_subreg (vmode, dest, mode); - x = gen_rtx_AND (vmode, dest, mask); - emit_insn (gen_rtx_SET (dest, x)); - - if (op0 != CONST0_RTX (vmode)) - { - x = gen_rtx_IOR (vmode, dest, op0); - emit_insn (gen_rtx_SET (dest, x)); - } -} - -/* Deconstruct a copysign operation into bit masks. Operand 0 is variable, - so we have to do two masks. */ - -void -ix86_split_copysign_var (rtx operands[]) -{ - machine_mode mode, vmode; - rtx dest, scratch, op0, op1, mask, nmask, x; - - dest = operands[0]; - scratch = operands[1]; - op0 = operands[2]; - op1 = operands[3]; - nmask = operands[4]; - mask = operands[5]; - - mode = GET_MODE (dest); - vmode = GET_MODE (mask); - - if (rtx_equal_p (op0, op1)) - { - /* Shouldn't happen often (it's useless, obviously), but when it does - we'd generate incorrect code if we continue below. 
*/ - emit_move_insn (dest, op0); - return; - } - - if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */ - { - gcc_assert (REGNO (op1) == REGNO (scratch)); - - x = gen_rtx_AND (vmode, scratch, mask); - emit_insn (gen_rtx_SET (scratch, x)); - - dest = mask; - op0 = lowpart_subreg (vmode, op0, mode); - x = gen_rtx_NOT (vmode, dest); - x = gen_rtx_AND (vmode, x, op0); - emit_insn (gen_rtx_SET (dest, x)); - } - else - { - if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */ - { - x = gen_rtx_AND (vmode, scratch, mask); - } - else /* alternative 2,4 */ - { - gcc_assert (REGNO (mask) == REGNO (scratch)); - op1 = lowpart_subreg (vmode, op1, mode); - x = gen_rtx_AND (vmode, scratch, op1); - } - emit_insn (gen_rtx_SET (scratch, x)); - - if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */ - { - dest = lowpart_subreg (vmode, op0, mode); - x = gen_rtx_AND (vmode, dest, nmask); - } - else /* alternative 3,4 */ - { - gcc_assert (REGNO (nmask) == REGNO (dest)); - dest = nmask; - op0 = lowpart_subreg (vmode, op0, mode); - x = gen_rtx_AND (vmode, dest, op0); - } - emit_insn (gen_rtx_SET (dest, x)); - } - - x = gen_rtx_IOR (vmode, dest, scratch); - emit_insn (gen_rtx_SET (dest, x)); -} - -/* Expand an xorsign operation. */ - -void -ix86_expand_xorsign (rtx operands[]) -{ - rtx (*xorsign_insn)(rtx, rtx, rtx, rtx); - machine_mode mode, vmode; - rtx dest, op0, op1, mask; - - dest = operands[0]; - op0 = operands[1]; - op1 = operands[2]; - - mode = GET_MODE (dest); - - if (mode == SFmode) - { - xorsign_insn = gen_xorsignsf3_1; - vmode = V4SFmode; - } - else if (mode == DFmode) - { - xorsign_insn = gen_xorsigndf3_1; - vmode = V2DFmode; - } - else - gcc_unreachable (); - - mask = ix86_build_signbit_mask (vmode, 0, 0); - - emit_insn (xorsign_insn (dest, op0, op1, mask)); -} - -/* Deconstruct an xorsign operation into bit masks. */ - -void -ix86_split_xorsign (rtx operands[]) -{ - machine_mode mode, vmode; - rtx dest, op0, mask, x; - - dest = operands[0]; - op0 = operands[1]; - mask = operands[3]; - - mode = GET_MODE (dest); - vmode = GET_MODE (mask); - - dest = lowpart_subreg (vmode, dest, mode); - x = gen_rtx_AND (vmode, dest, mask); - emit_insn (gen_rtx_SET (dest, x)); - - op0 = lowpart_subreg (vmode, op0, mode); - x = gen_rtx_XOR (vmode, dest, op0); - emit_insn (gen_rtx_SET (dest, x)); -} - -/* Return TRUE or FALSE depending on whether the first SET in INSN - has source and destination with matching CC modes, and that the - CC mode is at least as constrained as REQ_MODE. 
*/ - -bool -ix86_match_ccmode (rtx insn, machine_mode req_mode) -{ - rtx set; - machine_mode set_mode; - - set = PATTERN (insn); - if (GET_CODE (set) == PARALLEL) - set = XVECEXP (set, 0, 0); - gcc_assert (GET_CODE (set) == SET); - gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE); - - set_mode = GET_MODE (SET_DEST (set)); - switch (set_mode) - { - case E_CCNOmode: - if (req_mode != CCNOmode - && (req_mode != CCmode - || XEXP (SET_SRC (set), 1) != const0_rtx)) - return false; - break; - case E_CCmode: - if (req_mode == CCGCmode) - return false; - /* FALLTHRU */ - case E_CCGCmode: - if (req_mode == CCGOCmode || req_mode == CCNOmode) - return false; - /* FALLTHRU */ - case E_CCGOCmode: - if (req_mode == CCZmode) - return false; - /* FALLTHRU */ - case E_CCZmode: - break; - - case E_CCGZmode: - - case E_CCAmode: - case E_CCCmode: - case E_CCOmode: - case E_CCPmode: - case E_CCSmode: - if (set_mode != req_mode) - return false; - break; - - default: - gcc_unreachable (); - } - - return GET_MODE (SET_SRC (set)) == set_mode; -} - -/* Generate insn patterns to do an integer compare of OPERANDS. */ - -static rtx -ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1) -{ - machine_mode cmpmode; - rtx tmp, flags; - - cmpmode = SELECT_CC_MODE (code, op0, op1); - flags = gen_rtx_REG (cmpmode, FLAGS_REG); - - /* This is very simple, but making the interface the same as in the - FP case makes the rest of the code easier. */ - tmp = gen_rtx_COMPARE (cmpmode, op0, op1); - emit_insn (gen_rtx_SET (flags, tmp)); - - /* Return the test that should be put into the flags user, i.e. - the bcc, scc, or cmov instruction. */ - return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx); -} - -/* Figure out whether to use unordered fp comparisons. */ - -static bool -ix86_unordered_fp_compare (enum rtx_code code) -{ - if (!TARGET_IEEE_FP) - return false; - - switch (code) - { - case GT: - case GE: - case LT: - case LE: - return false; - - case EQ: - case NE: - - case LTGT: - case UNORDERED: - case ORDERED: - case UNLT: - case UNLE: - case UNGT: - case UNGE: - case UNEQ: - return true; - - default: - gcc_unreachable (); - } -} - -machine_mode -ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1) -{ - machine_mode mode = GET_MODE (op0); - - if (SCALAR_FLOAT_MODE_P (mode)) - { - gcc_assert (!DECIMAL_FLOAT_MODE_P (mode)); - return CCFPmode; - } - - switch (code) - { - /* Only zero flag is needed. */ - case EQ: /* ZF=0 */ - case NE: /* ZF!=0 */ - return CCZmode; - /* Codes needing carry flag. */ - case GEU: /* CF=0 */ - case LTU: /* CF=1 */ - /* Detect overflow checks. They need just the carry flag. */ - if (GET_CODE (op0) == PLUS - && (rtx_equal_p (op1, XEXP (op0, 0)) - || rtx_equal_p (op1, XEXP (op0, 1)))) - return CCCmode; - else - return CCmode; - case GTU: /* CF=0 & ZF=0 */ - case LEU: /* CF=1 | ZF=1 */ - return CCmode; - /* Codes possibly doable only with sign flag when - comparing against zero. */ - case GE: /* SF=OF or SF=0 */ - case LT: /* SF<>OF or SF=1 */ - if (op1 == const0_rtx) - return CCGOCmode; - else - /* For other cases Carry flag is not required. */ - return CCGCmode; - /* Codes doable only with sign flag when comparing - against zero, but we miss jump instruction for it - so we need to use relational tests against overflow - that thus needs to be zero. */ - case GT: /* ZF=0 & SF=OF */ - case LE: /* ZF=1 | SF<>OF */ - if (op1 == const0_rtx) - return CCNOmode; - else - return CCGCmode; - /* strcmp pattern do (use flags) and combine may ask us for proper - mode. 
*/ - case USE: - return CCmode; - default: - gcc_unreachable (); - } -} - -/* Return the fixed registers used for condition codes. */ - -static bool -ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2) -{ - *p1 = FLAGS_REG; - *p2 = INVALID_REGNUM; - return true; -} - -/* If two condition code modes are compatible, return a condition code - mode which is compatible with both. Otherwise, return - VOIDmode. */ - -static machine_mode -ix86_cc_modes_compatible (machine_mode m1, machine_mode m2) -{ - if (m1 == m2) - return m1; - - if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC) - return VOIDmode; - - if ((m1 == CCGCmode && m2 == CCGOCmode) - || (m1 == CCGOCmode && m2 == CCGCmode)) - return CCGCmode; - - if ((m1 == CCNOmode && m2 == CCGOCmode) - || (m1 == CCGOCmode && m2 == CCNOmode)) - return CCNOmode; - - if (m1 == CCZmode - && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode)) - return m2; - else if (m2 == CCZmode - && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode)) - return m1; - - switch (m1) - { - default: - gcc_unreachable (); - - case E_CCmode: - case E_CCGCmode: - case E_CCGOCmode: - case E_CCNOmode: - case E_CCAmode: - case E_CCCmode: - case E_CCOmode: - case E_CCPmode: - case E_CCSmode: - case E_CCZmode: - switch (m2) - { - default: - return VOIDmode; - - case E_CCmode: - case E_CCGCmode: - case E_CCGOCmode: - case E_CCNOmode: - case E_CCAmode: - case E_CCCmode: - case E_CCOmode: - case E_CCPmode: - case E_CCSmode: - case E_CCZmode: - return CCmode; - } - - case E_CCFPmode: - /* These are only compatible with themselves, which we already - checked above. */ - return VOIDmode; - } -} - - -/* Return a comparison we can do and that it is equivalent to - swap_condition (code) apart possibly from orderedness. - But, never change orderedness if TARGET_IEEE_FP, returning - UNKNOWN in that case if necessary. */ - -static enum rtx_code -ix86_fp_swap_condition (enum rtx_code code) -{ - switch (code) - { - case GT: /* GTU - CF=0 & ZF=0 */ - return TARGET_IEEE_FP ? UNKNOWN : UNLT; - case GE: /* GEU - CF=0 */ - return TARGET_IEEE_FP ? UNKNOWN : UNLE; - case UNLT: /* LTU - CF=1 */ - return TARGET_IEEE_FP ? UNKNOWN : GT; - case UNLE: /* LEU - CF=1 | ZF=1 */ - return TARGET_IEEE_FP ? UNKNOWN : GE; - default: - return swap_condition (code); - } -} - -/* Return cost of comparison CODE using the best strategy for performance. - All following functions do use number of instructions as a cost metrics. - In future this should be tweaked to compute bytes for optimize_size and - take into account performance of various instructions on various CPUs. */ - -static int -ix86_fp_comparison_cost (enum rtx_code code) -{ - int arith_cost; - - /* The cost of code using bit-twiddling on %ah. */ - switch (code) - { - case UNLE: - case UNLT: - case LTGT: - case GT: - case GE: - case UNORDERED: - case ORDERED: - case UNEQ: - arith_cost = 4; - break; - case LT: - case NE: - case EQ: - case UNGE: - arith_cost = TARGET_IEEE_FP ? 5 : 4; - break; - case LE: - case UNGT: - arith_cost = TARGET_IEEE_FP ? 6 : 4; - break; - default: - gcc_unreachable (); - } - - switch (ix86_fp_comparison_strategy (code)) - { - case IX86_FPCMP_COMI: - return arith_cost > 4 ? 3 : 2; - case IX86_FPCMP_SAHF: - return arith_cost > 4 ? 4 : 3; - default: - return arith_cost; - } -} - -/* Return strategy to use for floating-point. We assume that fcomi is always - preferrable where available, since that is also true when looking at size - (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). 
*/ - -enum ix86_fpcmp_strategy -ix86_fp_comparison_strategy (enum rtx_code) -{ - /* Do fcomi/sahf based test when profitable. */ - - if (TARGET_CMOVE) - return IX86_FPCMP_COMI; - - if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ())) - return IX86_FPCMP_SAHF; - - return IX86_FPCMP_ARITH; -} - -/* Swap, force into registers, or otherwise massage the two operands - to a fp comparison. The operands are updated in place; the new - comparison code is returned. */ - -static enum rtx_code -ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1) -{ - bool unordered_compare = ix86_unordered_fp_compare (code); - rtx op0 = *pop0, op1 = *pop1; - machine_mode op_mode = GET_MODE (op0); - bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode); - - /* All of the unordered compare instructions only work on registers. - The same is true of the fcomi compare instructions. The XFmode - compare instructions require registers except when comparing - against zero or when converting operand 1 from fixed point to - floating point. */ - - if (!is_sse - && (unordered_compare - || (op_mode == XFmode - && ! (standard_80387_constant_p (op0) == 1 - || standard_80387_constant_p (op1) == 1) - && GET_CODE (op1) != FLOAT) - || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI)) - { - op0 = force_reg (op_mode, op0); - op1 = force_reg (op_mode, op1); - } - else - { - /* %%% We only allow op1 in memory; op0 must be st(0). So swap - things around if they appear profitable, otherwise force op0 - into a register. */ - - if (standard_80387_constant_p (op0) == 0 - || (MEM_P (op0) - && ! (standard_80387_constant_p (op1) == 0 - || MEM_P (op1)))) - { - enum rtx_code new_code = ix86_fp_swap_condition (code); - if (new_code != UNKNOWN) - { - std::swap (op0, op1); - code = new_code; - } - } - - if (!REG_P (op0)) - op0 = force_reg (op_mode, op0); - - if (CONSTANT_P (op1)) - { - int tmp = standard_80387_constant_p (op1); - if (tmp == 0) - op1 = validize_mem (force_const_mem (op_mode, op1)); - else if (tmp == 1) - { - if (TARGET_CMOVE) - op1 = force_reg (op_mode, op1); - } - else - op1 = force_reg (op_mode, op1); - } - } - - /* Try to rearrange the comparison to make it cheaper. */ - if (ix86_fp_comparison_cost (code) - > ix86_fp_comparison_cost (swap_condition (code)) - && (REG_P (op1) || can_create_pseudo_p ())) - { - std::swap (op0, op1); - code = swap_condition (code); - if (!REG_P (op0)) - op0 = force_reg (op_mode, op0); - } - - *pop0 = op0; - *pop1 = op1; - return code; -} - -/* Convert comparison codes we use to represent FP comparison to integer - code that will result in proper branch. Return UNKNOWN if no such code - is available. */ - -enum rtx_code -ix86_fp_compare_code_to_integer (enum rtx_code code) -{ - switch (code) - { - case GT: - return GTU; - case GE: - return GEU; - case ORDERED: - case UNORDERED: - return code; - case UNEQ: - return EQ; - case UNLT: - return LTU; - case UNLE: - return LEU; - case LTGT: - return NE; - default: - return UNKNOWN; - } -} - -/* Generate insn patterns to do a floating point compare of OPERANDS. */ - -static rtx -ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1) -{ - bool unordered_compare = ix86_unordered_fp_compare (code); - machine_mode cmp_mode; - rtx tmp, scratch; - - code = ix86_prepare_fp_compare_args (code, &op0, &op1); - - tmp = gen_rtx_COMPARE (CCFPmode, op0, op1); - if (unordered_compare) - tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP); - - /* Do fcomi/sahf based test when profitable. 
*/ - switch (ix86_fp_comparison_strategy (code)) - { - case IX86_FPCMP_COMI: - cmp_mode = CCFPmode; - emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp)); - break; - - case IX86_FPCMP_SAHF: - cmp_mode = CCFPmode; - tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW); - scratch = gen_reg_rtx (HImode); - emit_insn (gen_rtx_SET (scratch, tmp)); - emit_insn (gen_x86_sahf_1 (scratch)); - break; - - case IX86_FPCMP_ARITH: - cmp_mode = CCNOmode; - tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW); - scratch = gen_reg_rtx (HImode); - emit_insn (gen_rtx_SET (scratch, tmp)); - - /* In the unordered case, we have to check C2 for NaN's, which - doesn't happen to work out to anything nice combination-wise. - So do some bit twiddling on the value we've got in AH to come - up with an appropriate set of condition codes. */ - - switch (code) - { - case GT: - case UNGT: - if (code == GT || !TARGET_IEEE_FP) - { - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45))); - code = EQ; - } - else - { - emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); - emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx)); - emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44))); - cmp_mode = CCmode; - code = GEU; - } - break; - case LT: - case UNLT: - if (code == LT && TARGET_IEEE_FP) - { - emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); - emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx)); - cmp_mode = CCmode; - code = EQ; - } - else - { - emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx)); - code = NE; - } - break; - case GE: - case UNGE: - if (code == GE || !TARGET_IEEE_FP) - { - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05))); - code = EQ; - } - else - { - emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); - emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx)); - code = NE; - } - break; - case LE: - case UNLE: - if (code == LE && TARGET_IEEE_FP) - { - emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); - emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx)); - emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40))); - cmp_mode = CCmode; - code = LTU; - } - else - { - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45))); - code = NE; - } - break; - case EQ: - case UNEQ: - if (code == EQ && TARGET_IEEE_FP) - { - emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); - emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40))); - cmp_mode = CCmode; - code = EQ; - } - else - { - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40))); - code = NE; - } - break; - case NE: - case LTGT: - if (code == NE && TARGET_IEEE_FP) - { - emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45))); - emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, - GEN_INT (0x40))); - code = NE; - } - else - { - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40))); - code = EQ; - } - break; - - case UNORDERED: - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04))); - code = NE; - break; - case ORDERED: - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04))); - code = EQ; - break; - - default: - gcc_unreachable (); - } - break; - - default: - gcc_unreachable(); - } - - /* Return the test that should be put into the flags user, i.e. - the bcc, scc, or cmov instruction. 
*/ - return gen_rtx_fmt_ee (code, VOIDmode, - gen_rtx_REG (cmp_mode, FLAGS_REG), - const0_rtx); -} - -static rtx -ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1) -{ - rtx ret; - - if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC) - ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1); - - else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0))) - { - gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0))); - ret = ix86_expand_fp_compare (code, op0, op1); - } - else - ret = ix86_expand_int_compare (code, op0, op1); - - return ret; -} - -void -ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label) -{ - machine_mode mode = GET_MODE (op0); - rtx tmp; - - /* Handle special case - vector comparsion with boolean result, transform - it using ptest instruction. */ - if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) - { - rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG); - machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode; - - gcc_assert (code == EQ || code == NE); - /* Generate XOR since we can't check that one operand is zero vector. */ - tmp = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1))); - tmp = gen_lowpart (p_mode, tmp); - emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG), - gen_rtx_UNSPEC (CCmode, - gen_rtvec (2, tmp, tmp), - UNSPEC_PTEST))); - tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx); - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, - gen_rtx_LABEL_REF (VOIDmode, label), - pc_rtx); - emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - return; - } - - switch (mode) - { - case E_SFmode: - case E_DFmode: - case E_XFmode: - case E_QImode: - case E_HImode: - case E_SImode: - simple: - tmp = ix86_expand_compare (code, op0, op1); - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, - gen_rtx_LABEL_REF (VOIDmode, label), - pc_rtx); - emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - return; - - case E_DImode: - if (TARGET_64BIT) - goto simple; - /* For 32-bit target DI comparison may be performed on - SSE registers. To allow this we should avoid split - to SI mode which is achieved by doing xor in DI mode - and then comparing with zero (which is recognized by - STV pass). We don't compare using xor when optimizing - for size. */ - if (!optimize_insn_for_size_p () - && TARGET_STV - && (code == EQ || code == NE)) - { - op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1)); - op1 = const0_rtx; - } - /* FALLTHRU */ - case E_TImode: - /* Expand DImode branch into multiple compare+branch. */ - { - rtx lo[2], hi[2]; - rtx_code_label *label2; - enum rtx_code code1, code2, code3; - machine_mode submode; - - if (CONSTANT_P (op0) && !CONSTANT_P (op1)) - { - std::swap (op0, op1); - code = swap_condition (code); - } - - split_double_mode (mode, &op0, 1, lo+0, hi+0); - split_double_mode (mode, &op1, 1, lo+1, hi+1); - - submode = mode == DImode ? SImode : DImode; - - /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to - avoid two branches. This costs one extra insn, so disable when - optimizing for size. 
*/ - - if ((code == EQ || code == NE) - && (!optimize_insn_for_size_p () - || hi[1] == const0_rtx || lo[1] == const0_rtx)) - { - rtx xor0, xor1; - - xor1 = hi[0]; - if (hi[1] != const0_rtx) - xor1 = expand_binop (submode, xor_optab, xor1, hi[1], - NULL_RTX, 0, OPTAB_WIDEN); - - xor0 = lo[0]; - if (lo[1] != const0_rtx) - xor0 = expand_binop (submode, xor_optab, xor0, lo[1], - NULL_RTX, 0, OPTAB_WIDEN); - - tmp = expand_binop (submode, ior_optab, xor1, xor0, - NULL_RTX, 0, OPTAB_WIDEN); - - ix86_expand_branch (code, tmp, const0_rtx, label); - return; - } - - /* Otherwise, if we are doing less-than or greater-or-equal-than, - op1 is a constant and the low word is zero, then we can just - examine the high word. Similarly for low word -1 and - less-or-equal-than or greater-than. */ - - if (CONST_INT_P (hi[1])) - switch (code) - { - case LT: case LTU: case GE: case GEU: - if (lo[1] == const0_rtx) - { - ix86_expand_branch (code, hi[0], hi[1], label); - return; - } - break; - case LE: case LEU: case GT: case GTU: - if (lo[1] == constm1_rtx) - { - ix86_expand_branch (code, hi[0], hi[1], label); - return; - } - break; - default: - break; - } - - /* Emulate comparisons that do not depend on Zero flag with - double-word subtraction. Note that only Overflow, Sign - and Carry flags are valid, so swap arguments and condition - of comparisons that would otherwise test Zero flag. */ - - switch (code) - { - case LE: case LEU: case GT: case GTU: - std::swap (lo[0], lo[1]); - std::swap (hi[0], hi[1]); - code = swap_condition (code); - /* FALLTHRU */ - - case LT: case LTU: case GE: case GEU: - { - rtx (*cmp_insn) (rtx, rtx); - rtx (*sbb_insn) (rtx, rtx, rtx); - bool uns = (code == LTU || code == GEU); - - if (TARGET_64BIT) - { - cmp_insn = gen_cmpdi_1; - sbb_insn - = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz; - } - else - { - cmp_insn = gen_cmpsi_1; - sbb_insn - = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz; - } - - if (!nonimmediate_operand (lo[0], submode)) - lo[0] = force_reg (submode, lo[0]); - if (!x86_64_general_operand (lo[1], submode)) - lo[1] = force_reg (submode, lo[1]); - - if (!register_operand (hi[0], submode)) - hi[0] = force_reg (submode, hi[0]); - if ((uns && !nonimmediate_operand (hi[1], submode)) - || (!uns && !x86_64_general_operand (hi[1], submode))) - hi[1] = force_reg (submode, hi[1]); - - emit_insn (cmp_insn (lo[0], lo[1])); - emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1])); - - tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG); - - ix86_expand_branch (code, tmp, const0_rtx, label); - return; - } - - default: - break; - } - - /* Otherwise, we need two or three jumps. 
*/ - - label2 = gen_label_rtx (); - - code1 = code; - code2 = swap_condition (code); - code3 = unsigned_condition (code); - - switch (code) - { - case LT: case GT: case LTU: case GTU: - break; - - case LE: code1 = LT; code2 = GT; break; - case GE: code1 = GT; code2 = LT; break; - case LEU: code1 = LTU; code2 = GTU; break; - case GEU: code1 = GTU; code2 = LTU; break; - - case EQ: code1 = UNKNOWN; code2 = NE; break; - case NE: code2 = UNKNOWN; break; - - default: - gcc_unreachable (); - } - - /* - * a < b => - * if (hi(a) < hi(b)) goto true; - * if (hi(a) > hi(b)) goto false; - * if (lo(a) < lo(b)) goto true; - * false: - */ - - if (code1 != UNKNOWN) - ix86_expand_branch (code1, hi[0], hi[1], label); - if (code2 != UNKNOWN) - ix86_expand_branch (code2, hi[0], hi[1], label2); - - ix86_expand_branch (code3, lo[0], lo[1], label); - - if (code2 != UNKNOWN) - emit_label (label2); - return; - } - - default: - gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC); - goto simple; - } -} - -void -ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1) -{ - rtx ret; - - gcc_assert (GET_MODE (dest) == QImode); - - ret = ix86_expand_compare (code, op0, op1); - PUT_MODE (ret, QImode); - emit_insn (gen_rtx_SET (dest, ret)); -} - -/* Expand comparison setting or clearing carry flag. Return true when - successful and set pop for the operation. */ -static bool -ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop) -{ - machine_mode mode - = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1); - - /* Do not handle double-mode compares that go through special path. */ - if (mode == (TARGET_64BIT ? TImode : DImode)) - return false; - - if (SCALAR_FLOAT_MODE_P (mode)) - { - rtx compare_op; - rtx_insn *compare_seq; - - gcc_assert (!DECIMAL_FLOAT_MODE_P (mode)); - - /* Shortcut: following common codes never translate - into carry flag compares. */ - if (code == EQ || code == NE || code == UNEQ || code == LTGT - || code == ORDERED || code == UNORDERED) - return false; - - /* These comparisons require zero flag; swap operands so they won't. */ - if ((code == GT || code == UNLE || code == LE || code == UNGT) - && !TARGET_IEEE_FP) - { - std::swap (op0, op1); - code = swap_condition (code); - } - - /* Try to expand the comparison and verify that we end up with - carry flag based comparison. This fails to be true only when - we decide to expand comparison using arithmetic that is not - too common scenario. */ - start_sequence (); - compare_op = ix86_expand_fp_compare (code, op0, op1); - compare_seq = get_insns (); - end_sequence (); - - if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode) - code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op)); - else - code = GET_CODE (compare_op); - - if (code != LTU && code != GEU) - return false; - - emit_insn (compare_seq); - *pop = compare_op; - return true; - } - - if (!INTEGRAL_MODE_P (mode)) - return false; - - switch (code) - { - case LTU: - case GEU: - break; - - /* Convert a==0 into (unsigned)a<1. */ - case EQ: - case NE: - if (op1 != const0_rtx) - return false; - op1 = const1_rtx; - code = (code == EQ ? LTU : GEU); - break; - - /* Convert a>b into b=b-1. */ - case GTU: - case LEU: - if (CONST_INT_P (op1)) - { - op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0)); - /* Bail out on overflow. We still can swap operands but that - would force loading of the constant into register. */ - if (op1 == const0_rtx - || !x86_64_immediate_operand (op1, GET_MODE (op1))) - return false; - code = (code == GTU ? 
GEU : LTU); - } - else - { - std::swap (op0, op1); - code = (code == GTU ? LTU : GEU); - } - break; - - /* Convert a>=0 into (unsigned)a<0x80000000. */ - case LT: - case GE: - if (mode == DImode || op1 != const0_rtx) - return false; - op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode); - code = (code == LT ? GEU : LTU); - break; - case LE: - case GT: - if (mode == DImode || op1 != constm1_rtx) - return false; - op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode); - code = (code == LE ? GEU : LTU); - break; - - default: - return false; - } - /* Swapping operands may cause constant to appear as first operand. */ - if (!nonimmediate_operand (op0, VOIDmode)) - { - if (!can_create_pseudo_p ()) - return false; - op0 = force_reg (mode, op0); - } - *pop = ix86_expand_compare (code, op0, op1); - gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU); - return true; -} - -bool -ix86_expand_int_movcc (rtx operands[]) -{ - enum rtx_code code = GET_CODE (operands[1]), compare_code; - rtx_insn *compare_seq; - rtx compare_op; - machine_mode mode = GET_MODE (operands[0]); - bool sign_bit_compare_p = false; - rtx op0 = XEXP (operands[1], 0); - rtx op1 = XEXP (operands[1], 1); - - if (GET_MODE (op0) == TImode - || (GET_MODE (op0) == DImode - && !TARGET_64BIT)) - return false; - - start_sequence (); - compare_op = ix86_expand_compare (code, op0, op1); - compare_seq = get_insns (); - end_sequence (); - - compare_code = GET_CODE (compare_op); - - if ((op1 == const0_rtx && (code == GE || code == LT)) - || (op1 == constm1_rtx && (code == GT || code == LE))) - sign_bit_compare_p = true; - - /* Don't attempt mode expansion here -- if we had to expand 5 or 6 - HImode insns, we'd be swallowed in word prefix ops. */ - - if ((mode != HImode || TARGET_FAST_PREFIX) - && (mode != (TARGET_64BIT ? TImode : DImode)) - && CONST_INT_P (operands[2]) - && CONST_INT_P (operands[3])) - { - rtx out = operands[0]; - HOST_WIDE_INT ct = INTVAL (operands[2]); - HOST_WIDE_INT cf = INTVAL (operands[3]); - HOST_WIDE_INT diff; - - diff = ct - cf; - /* Sign bit compares are better done using shifts than we do by using - sbb. */ - if (sign_bit_compare_p - || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op)) - { - /* Detect overlap between destination and compare sources. */ - rtx tmp = out; - - if (!sign_bit_compare_p) - { - rtx flags; - bool fpcmp = false; - - compare_code = GET_CODE (compare_op); - - flags = XEXP (compare_op, 0); - - if (GET_MODE (flags) == CCFPmode) - { - fpcmp = true; - compare_code - = ix86_fp_compare_code_to_integer (compare_code); - } - - /* To simplify rest of code, restrict to the GEU case. 
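The rewrites in ix86_expand_carry_flag_compare above replace each test with one that is decided purely by the carry flag of an unsigned compare. They amount to a handful of unsigned identities which the following standalone C check confirms for 32-bit operands; the function name is invented here and this is only an illustration of the transformation, not expander code.

#include <assert.h>
#include <limits.h>

static void
check_one (int a, int b)
{
  unsigned ua = (unsigned) a, ub = (unsigned) b;

  /* a == 0   <->   (unsigned) a < 1.  */
  assert ((a == 0) == (ua < 1u));

  /* a >u b   <->   a >=u b + 1, valid while b + 1 does not wrap to 0
     (the expander bails out on that overflow).  */
  if (b != -1)
    assert ((ua > ub) == (ua >= ub + 1u));

  /* a >= 0   <->   (unsigned) a < 0x80000000.  */
  assert ((a >= 0) == (ua < 0x80000000u));

  /* a <= -1  <->   (unsigned) a >= 0x80000000  (the LE/GT case).  */
  assert ((a <= -1) == (ua >= 0x80000000u));
}

int
main (void)
{
  static const int v[] = { INT_MIN, INT_MIN + 1, -2, -1, 0, 1, 2,
			   INT_MAX - 1, INT_MAX };
  for (unsigned i = 0; i < sizeof v / sizeof v[0]; i++)
    for (unsigned j = 0; j < sizeof v / sizeof v[0]; j++)
      check_one (v[i], v[j]);
  return 0;
}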
*/ - if (compare_code == LTU) - { - std::swap (ct, cf); - compare_code = reverse_condition (compare_code); - code = reverse_condition (code); - } - else - { - if (fpcmp) - PUT_CODE (compare_op, - reverse_condition_maybe_unordered - (GET_CODE (compare_op))); - else - PUT_CODE (compare_op, - reverse_condition (GET_CODE (compare_op))); - } - diff = ct - cf; - - if (reg_overlap_mentioned_p (out, op0) - || reg_overlap_mentioned_p (out, op1)) - tmp = gen_reg_rtx (mode); - - if (mode == DImode) - emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op)); - else - emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp), - flags, compare_op)); - } - else - { - if (code == GT || code == GE) - code = reverse_condition (code); - else - { - std::swap (ct, cf); - diff = ct - cf; - } - tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1); - } - - if (diff == 1) - { - /* - * cmpl op0,op1 - * sbbl dest,dest - * [addl dest, ct] - * - * Size 5 - 8. - */ - if (ct) - tmp = expand_simple_binop (mode, PLUS, - tmp, GEN_INT (ct), - copy_rtx (tmp), 1, OPTAB_DIRECT); - } - else if (cf == -1) - { - /* - * cmpl op0,op1 - * sbbl dest,dest - * orl $ct, dest - * - * Size 8. - */ - tmp = expand_simple_binop (mode, IOR, - tmp, GEN_INT (ct), - copy_rtx (tmp), 1, OPTAB_DIRECT); - } - else if (diff == -1 && ct) - { - /* - * cmpl op0,op1 - * sbbl dest,dest - * notl dest - * [addl dest, cf] - * - * Size 8 - 11. - */ - tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1); - if (cf) - tmp = expand_simple_binop (mode, PLUS, - copy_rtx (tmp), GEN_INT (cf), - copy_rtx (tmp), 1, OPTAB_DIRECT); - } - else - { - /* - * cmpl op0,op1 - * sbbl dest,dest - * [notl dest] - * andl cf - ct, dest - * [addl dest, ct] - * - * Size 8 - 11. - */ - - if (cf == 0) - { - cf = ct; - ct = 0; - tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1); - } - - tmp = expand_simple_binop (mode, AND, - copy_rtx (tmp), - gen_int_mode (cf - ct, mode), - copy_rtx (tmp), 1, OPTAB_DIRECT); - if (ct) - tmp = expand_simple_binop (mode, PLUS, - copy_rtx (tmp), GEN_INT (ct), - copy_rtx (tmp), 1, OPTAB_DIRECT); - } - - if (!rtx_equal_p (tmp, out)) - emit_move_insn (copy_rtx (out), copy_rtx (tmp)); - - return true; - } - - if (diff < 0) - { - machine_mode cmp_mode = GET_MODE (op0); - enum rtx_code new_code; - - if (SCALAR_FLOAT_MODE_P (cmp_mode)) - { - gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode)); - - /* We may be reversing unordered compare to normal compare, that - is not valid in general (we may convert non-trapping condition - to trapping one), however on i386 we currently emit all - comparisons unordered. */ - new_code = reverse_condition_maybe_unordered (code); - } - else - new_code = ix86_reverse_condition (code, cmp_mode); - if (new_code != UNKNOWN) - { - std::swap (ct, cf); - diff = -diff; - code = new_code; - } - } - - compare_code = UNKNOWN; - if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT - && CONST_INT_P (op1)) - { - if (op1 == const0_rtx - && (code == LT || code == GE)) - compare_code = code; - else if (op1 == constm1_rtx) - { - if (code == LE) - compare_code = LT; - else if (code == GT) - compare_code = GE; - } - } - - /* Optimize dest = (op0 < 0) ? -1 : cf. */ - if (compare_code != UNKNOWN - && GET_MODE (op0) == GET_MODE (out) - && (cf == -1 || ct == -1)) - { - /* If lea code below could be used, only optimize - if it results in a 2 insn sequence. */ - - if (! 
(diff == 1 || diff == 2 || diff == 4 || diff == 8 - || diff == 3 || diff == 5 || diff == 9) - || (compare_code == LT && ct == -1) - || (compare_code == GE && cf == -1)) - { - /* - * notl op1 (if necessary) - * sarl $31, op1 - * orl cf, op1 - */ - if (ct != -1) - { - cf = ct; - ct = -1; - code = reverse_condition (code); - } - - out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1); - - out = expand_simple_binop (mode, IOR, - out, GEN_INT (cf), - out, 1, OPTAB_DIRECT); - if (out != operands[0]) - emit_move_insn (operands[0], out); - - return true; - } - } - - - if ((diff == 1 || diff == 2 || diff == 4 || diff == 8 - || diff == 3 || diff == 5 || diff == 9) - && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL) - && (mode != DImode - || x86_64_immediate_operand (GEN_INT (cf), VOIDmode))) - { - /* - * xorl dest,dest - * cmpl op1,op2 - * setcc dest - * lea cf(dest*(ct-cf)),dest - * - * Size 14. - * - * This also catches the degenerate setcc-only case. - */ - - rtx tmp; - int nops; - - out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1); - - nops = 0; - /* On x86_64 the lea instruction operates on Pmode, so we need - to get arithmetics done in proper mode to match. */ - if (diff == 1) - tmp = copy_rtx (out); - else - { - rtx out1; - out1 = copy_rtx (out); - tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1)); - nops++; - if (diff & 1) - { - tmp = gen_rtx_PLUS (mode, tmp, out1); - nops++; - } - } - if (cf != 0) - { - tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf)); - nops++; - } - if (!rtx_equal_p (tmp, out)) - { - if (nops == 1) - out = force_operand (tmp, copy_rtx (out)); - else - emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp))); - } - if (!rtx_equal_p (out, operands[0])) - emit_move_insn (operands[0], copy_rtx (out)); - - return true; - } - - /* - * General case: Jumpful: - * xorl dest,dest cmpl op1, op2 - * cmpl op1, op2 movl ct, dest - * setcc dest jcc 1f - * decl dest movl cf, dest - * andl (cf-ct),dest 1: - * addl ct,dest - * - * Size 20. Size 14. - * - * This is reasonably steep, but branch mispredict costs are - * high on modern cpus, so consider failing only if optimizing - * for space. - */ - - if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL)) - && BRANCH_COST (optimize_insn_for_speed_p (), - false) >= 2) - { - if (cf == 0) - { - machine_mode cmp_mode = GET_MODE (op0); - enum rtx_code new_code; - - if (SCALAR_FLOAT_MODE_P (cmp_mode)) - { - gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode)); - - /* We may be reversing unordered compare to normal compare, - that is not valid in general (we may convert non-trapping - condition to trapping one), however on i386 we currently - emit all comparisons unordered. */ - new_code = reverse_condition_maybe_unordered (code); - } - else - { - new_code = ix86_reverse_condition (code, cmp_mode); - if (compare_code != UNKNOWN && new_code != UNKNOWN) - compare_code = reverse_condition (compare_code); - } - - if (new_code != UNKNOWN) - { - cf = ct; - ct = 0; - code = new_code; - } - } - - if (compare_code != UNKNOWN) - { - /* notl op1 (if needed) - sarl $31, op1 - andl (cf-ct), op1 - addl ct, op1 - - For x < 0 (resp. x <= -1) there will be no notl, - so if possible swap the constants to get rid of the - complement. - True/false will be -1/0 while code below (store flag - followed by decrement) is 0/-1, so the constants need - to be exchanged once more. 
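The branch-free selections built above come down to two arithmetic identities once the compare has been reduced to a carry: "sbb dest,dest" materializes 0 or -1 and the two constants are folded in with and/add, while the shorter xor/setcc/lea form scales the 0/1 flag directly. The sketch below checks the identities for arbitrary constants; it deliberately ignores the GEU-polarity bookkeeping the expander does, and the helper names are made up for illustration.

#include <assert.h>

static int
select_via_sbb (int carry, int ct, int cf)
{
  int mask = -carry;		    /* sbb dest,dest: -1 if carry, else 0.  */
  return (mask & (ct - cf)) + cf;   /* andl $(ct-cf); addl $cf.  */
}

static int
select_via_lea (int carry, int ct, int cf)
{
  /* xorl dest,dest; setcc dest; lea cf(dest*(ct-cf)),dest.  The expander
     only uses this form when ct - cf is 1, 2, 3, 4, 5, 8 or 9.  */
  return cf + carry * (ct - cf);
}

int
main (void)
{
  int pairs[][2] = { { 1, 0 }, { 10, 2 }, { -7, 42 }, { 0, -1 } };
  for (int carry = 0; carry <= 1; carry++)
    for (unsigned k = 0; k < sizeof pairs / sizeof pairs[0]; k++)
      {
	int ct = pairs[k][0], cf = pairs[k][1];
	int want = carry ? ct : cf;
	assert (select_via_sbb (carry, ct, cf) == want);
	assert (select_via_lea (carry, ct, cf) == want);
      }
  return 0;
}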
*/ - - if (compare_code == GE || !cf) - { - code = reverse_condition (code); - compare_code = LT; - } - else - std::swap (ct, cf); - - out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1); - } - else - { - out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1); - - out = expand_simple_binop (mode, PLUS, copy_rtx (out), - constm1_rtx, - copy_rtx (out), 1, OPTAB_DIRECT); - } - - out = expand_simple_binop (mode, AND, copy_rtx (out), - gen_int_mode (cf - ct, mode), - copy_rtx (out), 1, OPTAB_DIRECT); - if (ct) - out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct), - copy_rtx (out), 1, OPTAB_DIRECT); - if (!rtx_equal_p (out, operands[0])) - emit_move_insn (operands[0], copy_rtx (out)); - - return true; - } - } - - if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL)) - { - /* Try a few things more with specific constants and a variable. */ - - optab op; - rtx var, orig_out, out, tmp; - - if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2) - return false; - - /* If one of the two operands is an interesting constant, load a - constant with the above and mask it in with a logical operation. */ - - if (CONST_INT_P (operands[2])) - { - var = operands[3]; - if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx) - operands[3] = constm1_rtx, op = and_optab; - else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx) - operands[3] = const0_rtx, op = ior_optab; - else - return false; - } - else if (CONST_INT_P (operands[3])) - { - var = operands[2]; - if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx) - operands[2] = constm1_rtx, op = and_optab; - else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx) - operands[2] = const0_rtx, op = ior_optab; - else - return false; - } - else - return false; - - orig_out = operands[0]; - tmp = gen_reg_rtx (mode); - operands[0] = tmp; - - /* Recurse to get the constant loaded. */ - if (!ix86_expand_int_movcc (operands)) - return false; - - /* Mask in the interesting variable. */ - out = expand_binop (mode, op, var, tmp, orig_out, 0, - OPTAB_WIDEN); - if (!rtx_equal_p (out, orig_out)) - emit_move_insn (copy_rtx (orig_out), copy_rtx (out)); - - return true; - } - - /* - * For comparison with above, - * - * movl cf,dest - * movl ct,tmp - * cmpl op1,op2 - * cmovcc tmp,dest - * - * Size 15. - */ - - if (! nonimmediate_operand (operands[2], mode)) - operands[2] = force_reg (mode, operands[2]); - if (! nonimmediate_operand (operands[3], mode)) - operands[3] = force_reg (mode, operands[3]); - - if (! register_operand (operands[2], VOIDmode) - && (mode == QImode - || ! register_operand (operands[3], VOIDmode))) - operands[2] = force_reg (mode, operands[2]); - - if (mode == QImode - && ! register_operand (operands[3], VOIDmode)) - operands[3] = force_reg (mode, operands[3]); - - emit_insn (compare_seq); - emit_insn (gen_rtx_SET (operands[0], - gen_rtx_IF_THEN_ELSE (mode, - compare_op, operands[2], - operands[3]))); - return true; -} - -/* Swap, force into registers, or otherwise massage the two operands - to an sse comparison with a mask result. Thus we differ a bit from - ix86_prepare_fp_compare_args which expects to produce a flags result. - - The DEST operand exists to help determine whether to commute commutative - operators. The POP0/POP1 operands are updated in place. The new - comparison code is returned, or UNKNOWN if not implementable. 
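The "specific constants and a variable" path of ix86_expand_int_movcc above first loads a 0/-1 select recursively and then masks the variable operand in with a single and or or. The underlying identities, checked in plain C as an illustration only:

#include <assert.h>

int
main (void)
{
  /*   cond ?  0 : v   ==  (cond ?  0 : -1) & v
       cond ? -1 : v   ==  (cond ? -1 :  0) | v   */
  int vals[] = { -123, -1, 0, 1, 456 };
  for (int cond = 0; cond <= 1; cond++)
    for (unsigned i = 0; i < sizeof vals / sizeof vals[0]; i++)
      {
	int v = vals[i];
	assert (((cond ? 0 : -1) & v) == (cond ? 0 : v));
	assert (((cond ? -1 : 0) | v) == (cond ? -1 : v));
      }
  return 0;
}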
*/ - -static enum rtx_code -ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code, - rtx *pop0, rtx *pop1) -{ - switch (code) - { - case LTGT: - case UNEQ: - /* AVX supports all the needed comparisons. */ - if (TARGET_AVX) - break; - /* We have no LTGT as an operator. We could implement it with - NE & ORDERED, but this requires an extra temporary. It's - not clear that it's worth it. */ - return UNKNOWN; - - case LT: - case LE: - case UNGT: - case UNGE: - /* These are supported directly. */ - break; - - case EQ: - case NE: - case UNORDERED: - case ORDERED: - /* AVX has 3 operand comparisons, no need to swap anything. */ - if (TARGET_AVX) - break; - /* For commutative operators, try to canonicalize the destination - operand to be first in the comparison - this helps reload to - avoid extra moves. */ - if (!dest || !rtx_equal_p (dest, *pop1)) - break; - /* FALLTHRU */ - - case GE: - case GT: - case UNLE: - case UNLT: - /* These are not supported directly before AVX, and furthermore - ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the - comparison operands to transform into something that is - supported. */ - std::swap (*pop0, *pop1); - code = swap_condition (code); - break; - - default: - gcc_unreachable (); - } - - return code; -} - -/* Detect conditional moves that exactly match min/max operational - semantics. Note that this is IEEE safe, as long as we don't - interchange the operands. - - Returns FALSE if this conditional move doesn't match a MIN/MAX, - and TRUE if the operation is successful and instructions are emitted. */ - -static bool -ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0, - rtx cmp_op1, rtx if_true, rtx if_false) -{ - machine_mode mode; - bool is_min; - rtx tmp; - - if (code == LT) - ; - else if (code == UNGE) - std::swap (if_true, if_false); - else - return false; - - if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false)) - is_min = true; - else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false)) - is_min = false; - else - return false; - - mode = GET_MODE (dest); - - /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here, - but MODE may be a vector mode and thus not appropriate. */ - if (!flag_finite_math_only || flag_signed_zeros) - { - int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX; - rtvec v; - - if_true = force_reg (mode, if_true); - v = gen_rtvec (2, if_true, if_false); - tmp = gen_rtx_UNSPEC (mode, v, u); - } - else - { - code = is_min ? SMIN : SMAX; - if (MEM_P (if_true) && MEM_P (if_false)) - if_true = force_reg (mode, if_true); - tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false); - } - - emit_insn (gen_rtx_SET (dest, tmp)); - return true; -} - -/* Expand an SSE comparison. Return the register with the result. */ - -static rtx -ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1, - rtx op_true, rtx op_false) -{ - machine_mode mode = GET_MODE (dest); - machine_mode cmp_ops_mode = GET_MODE (cmp_op0); - - /* In general case result of comparison can differ from operands' type. */ - machine_mode cmp_mode; - - /* In AVX512F the result of comparison is an integer mask. */ - bool maskcmp = false; - rtx x; - - if (GET_MODE_SIZE (cmp_ops_mode) == 64) - { - unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode); - cmp_mode = int_mode_for_size (nbits, 0).require (); - maskcmp = true; - } - else - cmp_mode = cmp_ops_mode; - - cmp_op0 = force_reg (cmp_ops_mode, cmp_op0); - - int (*op1_predicate)(rtx, machine_mode) - = VECTOR_MODE_P (cmp_ops_mode) ? 
vector_operand : nonimmediate_operand; - - if (!op1_predicate (cmp_op1, cmp_ops_mode)) - cmp_op1 = force_reg (cmp_ops_mode, cmp_op1); - - if (optimize - || (maskcmp && cmp_mode != mode) - || (op_true && reg_overlap_mentioned_p (dest, op_true)) - || (op_false && reg_overlap_mentioned_p (dest, op_false))) - dest = gen_reg_rtx (maskcmp ? cmp_mode : mode); - - /* Compare patterns for int modes are unspec in AVX512F only. */ - if (maskcmp && (code == GT || code == EQ)) - { - rtx (*gen)(rtx, rtx, rtx); - - switch (cmp_ops_mode) - { - case E_V64QImode: - gcc_assert (TARGET_AVX512BW); - gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1; - break; - case E_V32HImode: - gcc_assert (TARGET_AVX512BW); - gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1; - break; - case E_V16SImode: - gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1; - break; - case E_V8DImode: - gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1; - break; - default: - gen = NULL; - } - - if (gen) - { - emit_insn (gen (dest, cmp_op0, cmp_op1)); - return dest; - } - } - x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1); - - if (cmp_mode != mode && !maskcmp) - { - x = force_reg (cmp_ops_mode, x); - convert_move (dest, x, false); - } - else - emit_insn (gen_rtx_SET (dest, x)); - - return dest; -} - -/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical - operations. This is used for both scalar and vector conditional moves. */ - -void -ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) -{ - machine_mode mode = GET_MODE (dest); - machine_mode cmpmode = GET_MODE (cmp); - - /* In AVX512F the result of comparison is an integer mask. */ - bool maskcmp = (mode != cmpmode && TARGET_AVX512F); - - rtx t2, t3, x; - - /* If we have an integer mask and FP value then we need - to cast mask to FP mode. 
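For the pre-AVX512 modes, ix86_expand_sse_cmp above yields a vector whose lanes are all-ones where the comparison holds and all-zeros elsewhere; only for 64-byte operands does it switch to an integer mask with one bit per element. A rough scalar model of the element-mask form (function name invented for illustration):

#include <assert.h>
#include <stdint.h>

/* Per-element compare mask in the cmpps/pcmpgt style: each lane of the
   result is 0xffffffff where the comparison holds, 0 elsewhere.  */
static void
cmplt_mask_v4si (uint32_t dst[4], const int32_t a[4], const int32_t b[4])
{
  for (int i = 0; i < 4; i++)
    dst[i] = (a[i] < b[i]) ? 0xffffffffu : 0u;
}

int
main (void)
{
  const int32_t a[4] = { 1, 5, -3, 7 };
  const int32_t b[4] = { 2, 5, -4, 9 };
  const uint32_t expect[4] = { 0xffffffffu, 0, 0, 0xffffffffu };
  uint32_t m[4];

  cmplt_mask_v4si (m, a, b);
  for (int i = 0; i < 4; i++)
    assert (m[i] == expect[i]);
  return 0;
}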
*/ - if (mode != cmpmode && VECTOR_MODE_P (cmpmode)) - { - cmp = force_reg (cmpmode, cmp); - cmp = gen_rtx_SUBREG (mode, cmp, 0); - } - - if (maskcmp) - { - rtx (*gen) (rtx, rtx) = NULL; - if ((op_true == CONST0_RTX (mode) - && vector_all_ones_operand (op_false, mode)) - || (op_false == CONST0_RTX (mode) - && vector_all_ones_operand (op_true, mode))) - switch (mode) - { - case E_V64QImode: - if (TARGET_AVX512BW) - gen = gen_avx512bw_cvtmask2bv64qi; - break; - case E_V32QImode: - if (TARGET_AVX512VL && TARGET_AVX512BW) - gen = gen_avx512vl_cvtmask2bv32qi; - break; - case E_V16QImode: - if (TARGET_AVX512VL && TARGET_AVX512BW) - gen = gen_avx512vl_cvtmask2bv16qi; - break; - case E_V32HImode: - if (TARGET_AVX512BW) - gen = gen_avx512bw_cvtmask2wv32hi; - break; - case E_V16HImode: - if (TARGET_AVX512VL && TARGET_AVX512BW) - gen = gen_avx512vl_cvtmask2wv16hi; - break; - case E_V8HImode: - if (TARGET_AVX512VL && TARGET_AVX512BW) - gen = gen_avx512vl_cvtmask2wv8hi; - break; - case E_V16SImode: - if (TARGET_AVX512DQ) - gen = gen_avx512f_cvtmask2dv16si; - break; - case E_V8SImode: - if (TARGET_AVX512VL && TARGET_AVX512DQ) - gen = gen_avx512vl_cvtmask2dv8si; - break; - case E_V4SImode: - if (TARGET_AVX512VL && TARGET_AVX512DQ) - gen = gen_avx512vl_cvtmask2dv4si; - break; - case E_V8DImode: - if (TARGET_AVX512DQ) - gen = gen_avx512f_cvtmask2qv8di; - break; - case E_V4DImode: - if (TARGET_AVX512VL && TARGET_AVX512DQ) - gen = gen_avx512vl_cvtmask2qv4di; - break; - case E_V2DImode: - if (TARGET_AVX512VL && TARGET_AVX512DQ) - gen = gen_avx512vl_cvtmask2qv2di; - break; - default: - break; - } - if (gen && SCALAR_INT_MODE_P (cmpmode)) - { - cmp = force_reg (cmpmode, cmp); - if (op_true == CONST0_RTX (mode)) - { - rtx (*gen_not) (rtx, rtx); - switch (cmpmode) - { - case E_QImode: gen_not = gen_knotqi; break; - case E_HImode: gen_not = gen_knothi; break; - case E_SImode: gen_not = gen_knotsi; break; - case E_DImode: gen_not = gen_knotdi; break; - default: gcc_unreachable (); - } - rtx n = gen_reg_rtx (cmpmode); - emit_insn (gen_not (n, cmp)); - cmp = n; - } - emit_insn (gen (dest, cmp)); - return; - } - } - else if (vector_all_ones_operand (op_true, mode) - && op_false == CONST0_RTX (mode)) - { - emit_insn (gen_rtx_SET (dest, cmp)); - return; - } - else if (op_false == CONST0_RTX (mode)) - { - op_true = force_reg (mode, op_true); - x = gen_rtx_AND (mode, cmp, op_true); - emit_insn (gen_rtx_SET (dest, x)); - return; - } - else if (op_true == CONST0_RTX (mode)) - { - op_false = force_reg (mode, op_false); - x = gen_rtx_NOT (mode, cmp); - x = gen_rtx_AND (mode, x, op_false); - emit_insn (gen_rtx_SET (dest, x)); - return; - } - else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)) - { - op_false = force_reg (mode, op_false); - x = gen_rtx_IOR (mode, cmp, op_false); - emit_insn (gen_rtx_SET (dest, x)); - return; - } - else if (TARGET_XOP) - { - op_true = force_reg (mode, op_true); - - if (!nonimmediate_operand (op_false, mode)) - op_false = force_reg (mode, op_false); - - emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp, - op_true, - op_false))); - return; - } - - rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; - rtx d = dest; - - if (!vector_operand (op_true, mode)) - op_true = force_reg (mode, op_true); - - op_false = force_reg (mode, op_false); - - switch (mode) - { - case E_V4SFmode: - if (TARGET_SSE4_1) - gen = gen_sse4_1_blendvps; - break; - case E_V2DFmode: - if (TARGET_SSE4_1) - gen = gen_sse4_1_blendvpd; - break; - case E_SFmode: - if (TARGET_SSE4_1) - { - gen = gen_sse4_1_blendvss; - 
op_true = force_reg (mode, op_true); - } - break; - case E_DFmode: - if (TARGET_SSE4_1) - { - gen = gen_sse4_1_blendvsd; - op_true = force_reg (mode, op_true); - } - break; - case E_V16QImode: - case E_V8HImode: - case E_V4SImode: - case E_V2DImode: - if (TARGET_SSE4_1) - { - gen = gen_sse4_1_pblendvb; - if (mode != V16QImode) - d = gen_reg_rtx (V16QImode); - op_false = gen_lowpart (V16QImode, op_false); - op_true = gen_lowpart (V16QImode, op_true); - cmp = gen_lowpart (V16QImode, cmp); - } - break; - case E_V8SFmode: - if (TARGET_AVX) - gen = gen_avx_blendvps256; - break; - case E_V4DFmode: - if (TARGET_AVX) - gen = gen_avx_blendvpd256; - break; - case E_V32QImode: - case E_V16HImode: - case E_V8SImode: - case E_V4DImode: - if (TARGET_AVX2) - { - gen = gen_avx2_pblendvb; - if (mode != V32QImode) - d = gen_reg_rtx (V32QImode); - op_false = gen_lowpart (V32QImode, op_false); - op_true = gen_lowpart (V32QImode, op_true); - cmp = gen_lowpart (V32QImode, cmp); - } - break; - - case E_V64QImode: - gen = gen_avx512bw_blendmv64qi; - break; - case E_V32HImode: - gen = gen_avx512bw_blendmv32hi; - break; - case E_V16SImode: - gen = gen_avx512f_blendmv16si; - break; - case E_V8DImode: - gen = gen_avx512f_blendmv8di; - break; - case E_V8DFmode: - gen = gen_avx512f_blendmv8df; - break; - case E_V16SFmode: - gen = gen_avx512f_blendmv16sf; - break; - - default: - break; - } - - if (gen != NULL) - { - emit_insn (gen (d, op_false, op_true, cmp)); - if (d != dest) - emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d)); - } - else - { - op_true = force_reg (mode, op_true); - - t2 = gen_reg_rtx (mode); - if (optimize) - t3 = gen_reg_rtx (mode); - else - t3 = dest; - - x = gen_rtx_AND (mode, op_true, cmp); - emit_insn (gen_rtx_SET (t2, x)); - - x = gen_rtx_NOT (mode, cmp); - x = gen_rtx_AND (mode, x, op_false); - emit_insn (gen_rtx_SET (t3, x)); - - x = gen_rtx_IOR (mode, t3, t2); - emit_insn (gen_rtx_SET (dest, x)); - } -} - -/* Expand a floating-point conditional move. Return true if successful. */ - -bool -ix86_expand_fp_movcc (rtx operands[]) -{ - machine_mode mode = GET_MODE (operands[0]); - enum rtx_code code = GET_CODE (operands[1]); - rtx tmp, compare_op; - rtx op0 = XEXP (operands[1], 0); - rtx op1 = XEXP (operands[1], 1); - - if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode)) - { - machine_mode cmode; - - /* Since we've no cmove for sse registers, don't force bad register - allocation just to gain access to it. Deny movcc when the - comparison mode doesn't match the move mode. */ - cmode = GET_MODE (op0); - if (cmode == VOIDmode) - cmode = GET_MODE (op1); - if (cmode != mode) - return false; - - code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1); - if (code == UNKNOWN) - return false; - - if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1, - operands[2], operands[3])) - return true; - - tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1, - operands[2], operands[3]); - ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]); - return true; - } - - if (GET_MODE (op0) == TImode - || (GET_MODE (op0) == DImode - && !TARGET_64BIT)) - return false; - - /* The floating point conditional move instructions don't directly - support conditions resulting from a signed integer comparison. 
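When none of the blendv/vpblendvb/vblendm patterns apply, ix86_expand_sse_movcc above falls back to the and/andnot/or sequence. Its per-lane identity, shown here for one 32-bit lane as an illustrative sketch (the blendv instructions themselves only inspect the most significant bit of each element, which an all-ones/all-zeros mask satisfies trivially):

#include <assert.h>
#include <stdint.h>

/* dest = (mask & if_true) | (~mask & if_false), with mask all-ones or
   all-zeros per element.  */
static uint32_t
blend_lane (uint32_t mask, uint32_t if_true, uint32_t if_false)
{
  return (mask & if_true) | (~mask & if_false);
}

int
main (void)
{
  assert (blend_lane (0xffffffffu, 0x11111111u, 0x22222222u) == 0x11111111u);
  assert (blend_lane (0x00000000u, 0x11111111u, 0x22222222u) == 0x22222222u);
  return 0;
}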
*/ - - compare_op = ix86_expand_compare (code, op0, op1); - if (!fcmov_comparison_operator (compare_op, VOIDmode)) - { - tmp = gen_reg_rtx (QImode); - ix86_expand_setcc (tmp, code, op0, op1); - - compare_op = ix86_expand_compare (NE, tmp, const0_rtx); - } - - emit_insn (gen_rtx_SET (operands[0], - gen_rtx_IF_THEN_ELSE (mode, compare_op, - operands[2], operands[3]))); - - return true; -} - -/* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */ - -static int -ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code) -{ - switch (code) - { - case EQ: - return 0; - case LT: - case LTU: - return 1; - case LE: - case LEU: - return 2; - case NE: - return 4; - case GE: - case GEU: - return 5; - case GT: - case GTU: - return 6; - default: - gcc_unreachable (); - } -} - -/* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */ - -static int -ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code) -{ - switch (code) - { - case EQ: - return 0x00; - case NE: - return 0x04; - case GT: - return 0x0e; - case LE: - return 0x02; - case GE: - return 0x0d; - case LT: - return 0x01; - case UNLE: - return 0x0a; - case UNLT: - return 0x09; - case UNGE: - return 0x05; - case UNGT: - return 0x06; - case UNEQ: - return 0x18; - case LTGT: - return 0x0c; - case ORDERED: - return 0x07; - case UNORDERED: - return 0x03; - default: - gcc_unreachable (); - } -} - -/* Return immediate value to be used in UNSPEC_PCMP - for comparison CODE in MODE. */ - -static int -ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode) -{ - if (FLOAT_MODE_P (mode)) - return ix86_fp_cmp_code_to_pcmp_immediate (code); - return ix86_int_cmp_code_to_pcmp_immediate (code); -} - -/* Expand AVX-512 vector comparison. */ - -bool -ix86_expand_mask_vec_cmp (rtx operands[]) -{ - machine_mode mask_mode = GET_MODE (operands[0]); - machine_mode cmp_mode = GET_MODE (operands[2]); - enum rtx_code code = GET_CODE (operands[1]); - rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode)); - int unspec_code; - rtx unspec; - - switch (code) - { - case LEU: - case GTU: - case GEU: - case LTU: - unspec_code = UNSPEC_UNSIGNED_PCMP; - break; - - default: - unspec_code = UNSPEC_PCMP; - } - - unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2], - operands[3], imm), - unspec_code); - emit_insn (gen_rtx_SET (operands[0], unspec)); - - return true; -} - -/* Expand fp vector comparison. 
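ix86_expand_mask_vec_cmp above produces an AVX-512 style result: a mask register holding one bit per element, with the comparison selected by the immediate computed from the rtx code. A scalar sketch of that shape, for a 16-element signed less-than (the function name is invented here, and no attempt is made to model the UNSPEC_PCMP immediate encoding):

#include <assert.h>
#include <stdint.h>

static uint16_t
cmplt_mask16 (const int32_t a[16], const int32_t b[16])
{
  uint16_t k = 0;
  for (int i = 0; i < 16; i++)
    if (a[i] < b[i])
      k |= (uint16_t) (1u << i);
  return k;
}

int
main (void)
{
  int32_t a[16], b[16];
  for (int i = 0; i < 16; i++)
    {
      a[i] = i;
      b[i] = 8;			/* elements 0..7 are below the threshold.  */
    }
  assert (cmplt_mask16 (a, b) == 0x00ffu);
  return 0;
}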
*/ - -bool -ix86_expand_fp_vec_cmp (rtx operands[]) -{ - enum rtx_code code = GET_CODE (operands[1]); - rtx cmp; - - code = ix86_prepare_sse_fp_compare_args (operands[0], code, - &operands[2], &operands[3]); - if (code == UNKNOWN) - { - rtx temp; - switch (GET_CODE (operands[1])) - { - case LTGT: - temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2], - operands[3], NULL, NULL); - cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2], - operands[3], NULL, NULL); - code = AND; - break; - case UNEQ: - temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2], - operands[3], NULL, NULL); - cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2], - operands[3], NULL, NULL); - code = IOR; - break; - default: - gcc_unreachable (); - } - cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1, - OPTAB_DIRECT); - } - else - cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3], - operands[1], operands[2]); - - if (operands[0] != cmp) - emit_move_insn (operands[0], cmp); - - return true; -} - -static rtx -ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1, - rtx op_true, rtx op_false, bool *negate) -{ - machine_mode data_mode = GET_MODE (dest); - machine_mode mode = GET_MODE (cop0); - rtx x; - - *negate = false; - - /* XOP supports all of the comparisons on all 128-bit vector int types. */ - if (TARGET_XOP - && (mode == V16QImode || mode == V8HImode - || mode == V4SImode || mode == V2DImode)) - ; - else - { - /* Canonicalize the comparison to EQ, GT, GTU. */ - switch (code) - { - case EQ: - case GT: - case GTU: - break; - - case NE: - case LE: - case LEU: - code = reverse_condition (code); - *negate = true; - break; - - case GE: - case GEU: - code = reverse_condition (code); - *negate = true; - /* FALLTHRU */ - - case LT: - case LTU: - std::swap (cop0, cop1); - code = swap_condition (code); - break; - - default: - gcc_unreachable (); - } - - /* Only SSE4.1/SSE4.2 supports V2DImode. */ - if (mode == V2DImode) - { - switch (code) - { - case EQ: - /* SSE4.1 supports EQ. */ - if (!TARGET_SSE4_1) - return NULL; - break; - - case GT: - case GTU: - /* SSE4.2 supports GT/GTU. */ - if (!TARGET_SSE4_2) - return NULL; - break; - - default: - gcc_unreachable (); - } - } - - rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode); - rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode); - if (*negate) - std::swap (optrue, opfalse); - - /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when - not using integer masks into min (x, y) == x ? -1 : 0 (i.e. - min (x, y) == x). While we add one instruction (the minimum), - we remove the need for two instructions in the negation, as the - result is done this way. - When using masks, do it for SI/DImode element types, as it is shorter - than the two subtractions. */ - if ((code != EQ - && GET_MODE_SIZE (mode) != 64 - && vector_all_ones_operand (opfalse, data_mode) - && optrue == CONST0_RTX (data_mode)) - || (code == GTU - && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4 - /* Don't do it if not using integer masks and we'd end up with - the right values in the registers though. */ - && (GET_MODE_SIZE (mode) == 64 - || !vector_all_ones_operand (optrue, data_mode) - || opfalse != CONST0_RTX (data_mode)))) - { - rtx (*gen) (rtx, rtx, rtx) = NULL; - - switch (mode) - { - case E_V16SImode: - gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3; - break; - case E_V8DImode: - gen = (code == GTU) ? 
gen_uminv8di3 : gen_sminv8di3; - cop0 = force_reg (mode, cop0); - cop1 = force_reg (mode, cop1); - break; - case E_V32QImode: - if (TARGET_AVX2) - gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3; - break; - case E_V16HImode: - if (TARGET_AVX2) - gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3; - break; - case E_V8SImode: - if (TARGET_AVX2) - gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3; - break; - case E_V4DImode: - if (TARGET_AVX512VL) - { - gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3; - cop0 = force_reg (mode, cop0); - cop1 = force_reg (mode, cop1); - } - break; - case E_V16QImode: - if (code == GTU && TARGET_SSE2) - gen = gen_uminv16qi3; - else if (code == GT && TARGET_SSE4_1) - gen = gen_sminv16qi3; - break; - case E_V8HImode: - if (code == GTU && TARGET_SSE4_1) - gen = gen_uminv8hi3; - else if (code == GT && TARGET_SSE2) - gen = gen_sminv8hi3; - break; - case E_V4SImode: - if (TARGET_SSE4_1) - gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3; - break; - case E_V2DImode: - if (TARGET_AVX512VL) - { - gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3; - cop0 = force_reg (mode, cop0); - cop1 = force_reg (mode, cop1); - } - break; - default: - break; - } - - if (gen) - { - rtx tem = gen_reg_rtx (mode); - if (!vector_operand (cop0, mode)) - cop0 = force_reg (mode, cop0); - if (!vector_operand (cop1, mode)) - cop1 = force_reg (mode, cop1); - *negate = !*negate; - emit_insn (gen (tem, cop0, cop1)); - cop1 = tem; - code = EQ; - } - } - - /* Unsigned parallel compare is not supported by the hardware. - Play some tricks to turn this into a signed comparison - against 0. */ - if (code == GTU) - { - cop0 = force_reg (mode, cop0); - - switch (mode) - { - case E_V16SImode: - case E_V8DImode: - case E_V8SImode: - case E_V4DImode: - case E_V4SImode: - case E_V2DImode: - { - rtx t1, t2, mask; - rtx (*gen_sub3) (rtx, rtx, rtx); - - switch (mode) - { - case E_V16SImode: gen_sub3 = gen_subv16si3; break; - case E_V8DImode: gen_sub3 = gen_subv8di3; break; - case E_V8SImode: gen_sub3 = gen_subv8si3; break; - case E_V4DImode: gen_sub3 = gen_subv4di3; break; - case E_V4SImode: gen_sub3 = gen_subv4si3; break; - case E_V2DImode: gen_sub3 = gen_subv2di3; break; - default: - gcc_unreachable (); - } - /* Subtract (-(INT MAX) - 1) from both operands to make - them signed. */ - mask = ix86_build_signbit_mask (mode, true, false); - t1 = gen_reg_rtx (mode); - emit_insn (gen_sub3 (t1, cop0, mask)); - - t2 = gen_reg_rtx (mode); - emit_insn (gen_sub3 (t2, cop1, mask)); - - cop0 = t1; - cop1 = t2; - code = GT; - } - break; - - case E_V64QImode: - case E_V32HImode: - case E_V32QImode: - case E_V16HImode: - case E_V16QImode: - case E_V8HImode: - /* Perform a parallel unsigned saturating subtraction. */ - x = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0, - cop1))); - - cop0 = x; - cop1 = CONST0_RTX (mode); - code = EQ; - *negate = !*negate; - break; - - default: - gcc_unreachable (); - } - } - } - - if (*negate) - std::swap (op_true, op_false); - - /* Allow the comparison to be done in one mode, but the movcc to - happen in another mode. */ - if (data_mode == mode) - { - x = ix86_expand_sse_cmp (dest, code, cop0, cop1, - op_true, op_false); - } - else - { - gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode)); - x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1, - op_true, op_false); - if (GET_MODE (x) == mode) - x = gen_lowpart (data_mode, x); - } - - return x; -} - -/* Expand integer vector comparison. 
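The unsigned-compare rewrites used by ix86_expand_int_sse_cmp above reduce to three scalar identities: x <=u y via an unsigned minimum plus equality, x >u y via biasing both sides by the sign bit (subtracting INT_MIN, i.e. flipping the top bit) and then comparing signed, and, for narrow elements, x >u y via a saturating subtraction being nonzero. A plain-C check of all three, with two's complement conversion assumed as on x86 and an invented sat_sub_u8 helper:

#include <assert.h>
#include <stdint.h>

static uint8_t
sat_sub_u8 (uint8_t x, uint8_t y)
{
  return x > y ? (uint8_t) (x - y) : 0;
}

int
main (void)
{
  const uint32_t v[] = { 0u, 1u, 0x7fffffffu, 0x80000000u, 0xffffffffu };
  unsigned i, j;

  for (i = 0; i < sizeof v / sizeof v[0]; i++)
    for (j = 0; j < sizeof v / sizeof v[0]; j++)
      {
	uint32_t x = v[i], y = v[j];

	/* x <=u y  <->  umin (x, y) == x  (pminu + pcmpeq).  */
	assert ((x <= y) == (((x < y) ? x : y) == x));

	/* x >u y  <->  signed compare after flipping the sign bits
	   (psub/pxor with the sign-bit mask, then pcmpgt).  */
	assert ((x > y)
		== ((int32_t) (x ^ 0x80000000u)
		    > (int32_t) (y ^ 0x80000000u)));
      }

  /* x >u y  <->  saturating x - y is nonzero (psubus + pcmpeq, negated),
     checked exhaustively for byte elements.  */
  for (i = 0; i < 256; i++)
    for (j = 0; j < 256; j++)
      assert ((i > j) == (sat_sub_u8 ((uint8_t) i, (uint8_t) j) != 0));
  return 0;
}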
*/ - -bool -ix86_expand_int_vec_cmp (rtx operands[]) -{ - rtx_code code = GET_CODE (operands[1]); - bool negate = false; - rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2], - operands[3], NULL, NULL, &negate); - - if (!cmp) - return false; - - if (negate) - cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp, - CONST0_RTX (GET_MODE (cmp)), - NULL, NULL, &negate); - - gcc_assert (!negate); - - if (operands[0] != cmp) - emit_move_insn (operands[0], cmp); - - return true; -} - -/* Expand a floating-point vector conditional move; a vcond operation - rather than a movcc operation. */ - -bool -ix86_expand_fp_vcond (rtx operands[]) -{ - enum rtx_code code = GET_CODE (operands[3]); - rtx cmp; - - code = ix86_prepare_sse_fp_compare_args (operands[0], code, - &operands[4], &operands[5]); - if (code == UNKNOWN) - { - rtx temp; - switch (GET_CODE (operands[3])) - { - case LTGT: - temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4], - operands[5], operands[0], operands[0]); - cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4], - operands[5], operands[1], operands[2]); - code = AND; - break; - case UNEQ: - temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4], - operands[5], operands[0], operands[0]); - cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4], - operands[5], operands[1], operands[2]); - code = IOR; - break; - default: - gcc_unreachable (); - } - cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1, - OPTAB_DIRECT); - ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]); - return true; - } - - if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4], - operands[5], operands[1], operands[2])) - return true; - - cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5], - operands[1], operands[2]); - ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]); - return true; -} - -/* Expand a signed/unsigned integral vector conditional move. */ - -bool -ix86_expand_int_vcond (rtx operands[]) -{ - machine_mode data_mode = GET_MODE (operands[0]); - machine_mode mode = GET_MODE (operands[4]); - enum rtx_code code = GET_CODE (operands[3]); - bool negate = false; - rtx x, cop0, cop1; - - cop0 = operands[4]; - cop1 = operands[5]; - - /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31 - and x < 0 ? 1 : 0 into (unsigned) x >> 31. 
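The shift rewrites quoted in the comment above hold as scalar identities on two's complement machines with an arithmetic right shift for signed values (both true for the x86 back end; plain ISO C leaves negative right shifts implementation-defined). A minimal check:

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  const int32_t v[] = { INT32_MIN, -2, -1, 0, 1, 2, INT32_MAX };
  for (unsigned i = 0; i < sizeof v / sizeof v[0]; i++)
    {
      int32_t x = v[i];
      /* x < 0 ? -1 : 0  ==  x >> 31  (arithmetic shift).  */
      assert ((x < 0 ? -1 : 0) == (x >> 31));
      /* x < 0 ?  1 : 0  ==  (unsigned) x >> 31  (logical shift).  */
      assert ((x < 0 ? 1 : 0) == (int32_t) ((uint32_t) x >> 31));
    }
  return 0;
}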
*/ - if ((code == LT || code == GE) - && data_mode == mode - && cop1 == CONST0_RTX (mode) - && operands[1 + (code == LT)] == CONST0_RTX (data_mode) - && GET_MODE_UNIT_SIZE (data_mode) > 1 - && GET_MODE_UNIT_SIZE (data_mode) <= 8 - && (GET_MODE_SIZE (data_mode) == 16 - || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32))) - { - rtx negop = operands[2 - (code == LT)]; - int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1; - if (negop == CONST1_RTX (data_mode)) - { - rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift), - operands[0], 1, OPTAB_DIRECT); - if (res != operands[0]) - emit_move_insn (operands[0], res); - return true; - } - else if (GET_MODE_INNER (data_mode) != DImode - && vector_all_ones_operand (negop, data_mode)) - { - rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift), - operands[0], 0, OPTAB_DIRECT); - if (res != operands[0]) - emit_move_insn (operands[0], res); - return true; - } - } - - if (!nonimmediate_operand (cop1, mode)) - cop1 = force_reg (mode, cop1); - if (!general_operand (operands[1], data_mode)) - operands[1] = force_reg (data_mode, operands[1]); - if (!general_operand (operands[2], data_mode)) - operands[2] = force_reg (data_mode, operands[2]); - - x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1, - operands[1], operands[2], &negate); - - if (!x) - return false; - - ix86_expand_sse_movcc (operands[0], x, operands[1+negate], - operands[2-negate]); - return true; -} - -/* AVX512F does support 64-byte integer vector operations, - thus the longest vector we are faced with is V64QImode. */ -#define MAX_VECT_LEN 64 - -struct expand_vec_perm_d -{ - rtx target, op0, op1; - unsigned char perm[MAX_VECT_LEN]; - machine_mode vmode; - unsigned char nelt; - bool one_operand_p; - bool testing_p; -}; - -static bool -ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1, - struct expand_vec_perm_d *d) -{ - /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const - expander, so args are either in d, or in op0, op1 etc. */ - machine_mode mode = GET_MODE (d ? 
d->op0 : op0); - machine_mode maskmode = mode; - rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; - - switch (mode) - { - case E_V8HImode: - if (TARGET_AVX512VL && TARGET_AVX512BW) - gen = gen_avx512vl_vpermt2varv8hi3; - break; - case E_V16HImode: - if (TARGET_AVX512VL && TARGET_AVX512BW) - gen = gen_avx512vl_vpermt2varv16hi3; - break; - case E_V64QImode: - if (TARGET_AVX512VBMI) - gen = gen_avx512bw_vpermt2varv64qi3; - break; - case E_V32HImode: - if (TARGET_AVX512BW) - gen = gen_avx512bw_vpermt2varv32hi3; - break; - case E_V4SImode: - if (TARGET_AVX512VL) - gen = gen_avx512vl_vpermt2varv4si3; - break; - case E_V8SImode: - if (TARGET_AVX512VL) - gen = gen_avx512vl_vpermt2varv8si3; - break; - case E_V16SImode: - if (TARGET_AVX512F) - gen = gen_avx512f_vpermt2varv16si3; - break; - case E_V4SFmode: - if (TARGET_AVX512VL) - { - gen = gen_avx512vl_vpermt2varv4sf3; - maskmode = V4SImode; - } - break; - case E_V8SFmode: - if (TARGET_AVX512VL) - { - gen = gen_avx512vl_vpermt2varv8sf3; - maskmode = V8SImode; - } - break; - case E_V16SFmode: - if (TARGET_AVX512F) - { - gen = gen_avx512f_vpermt2varv16sf3; - maskmode = V16SImode; - } - break; - case E_V2DImode: - if (TARGET_AVX512VL) - gen = gen_avx512vl_vpermt2varv2di3; - break; - case E_V4DImode: - if (TARGET_AVX512VL) - gen = gen_avx512vl_vpermt2varv4di3; - break; - case E_V8DImode: - if (TARGET_AVX512F) - gen = gen_avx512f_vpermt2varv8di3; - break; - case E_V2DFmode: - if (TARGET_AVX512VL) - { - gen = gen_avx512vl_vpermt2varv2df3; - maskmode = V2DImode; - } - break; - case E_V4DFmode: - if (TARGET_AVX512VL) - { - gen = gen_avx512vl_vpermt2varv4df3; - maskmode = V4DImode; - } - break; - case E_V8DFmode: - if (TARGET_AVX512F) - { - gen = gen_avx512f_vpermt2varv8df3; - maskmode = V8DImode; - } - break; - default: - break; - } - - if (gen == NULL) - return false; - - /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const - expander, so args are either in d, or in op0, op1 etc. */ - if (d) - { - rtx vec[64]; - target = d->target; - op0 = d->op0; - op1 = d->op1; - for (int i = 0; i < d->nelt; ++i) - vec[i] = GEN_INT (d->perm[i]); - mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); - } - - emit_insn (gen (target, force_reg (maskmode, mask), op0, op1)); - return true; -} - -/* Expand a variable vector permutation. */ - -void -ix86_expand_vec_perm (rtx operands[]) -{ - rtx target = operands[0]; - rtx op0 = operands[1]; - rtx op1 = operands[2]; - rtx mask = operands[3]; - rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32]; - machine_mode mode = GET_MODE (op0); - machine_mode maskmode = GET_MODE (mask); - int w, e, i; - bool one_operand_shuffle = rtx_equal_p (op0, op1); - - /* Number of elements in the vector. */ - w = GET_MODE_NUNITS (mode); - e = GET_MODE_UNIT_SIZE (mode); - gcc_assert (w <= 64); - - if (TARGET_AVX512F && one_operand_shuffle) - { - rtx (*gen) (rtx, rtx, rtx) = NULL; - switch (mode) - { - case E_V16SImode: - gen =gen_avx512f_permvarv16si; - break; - case E_V16SFmode: - gen = gen_avx512f_permvarv16sf; - break; - case E_V8DImode: - gen = gen_avx512f_permvarv8di; - break; - case E_V8DFmode: - gen = gen_avx512f_permvarv8df; - break; - default: - break; - } - if (gen != NULL) - { - emit_insn (gen (target, op0, mask)); - return; - } - } - - if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL)) - return; - - if (TARGET_AVX2) - { - if (mode == V4DImode || mode == V4DFmode || mode == V16HImode) - { - /* Unfortunately, the VPERMQ and VPERMPD instructions only support - an constant shuffle operand. 
With a tiny bit of effort we can - use VPERMD instead. A re-interpretation stall for V4DFmode is - unfortunate but there's no avoiding it. - Similarly for V16HImode we don't have instructions for variable - shuffling, while for V32QImode we can use after preparing suitable - masks vpshufb; vpshufb; vpermq; vpor. */ - - if (mode == V16HImode) - { - maskmode = mode = V32QImode; - w = 32; - e = 1; - } - else - { - maskmode = mode = V8SImode; - w = 8; - e = 4; - } - t1 = gen_reg_rtx (maskmode); - - /* Replicate the low bits of the V4DImode mask into V8SImode: - mask = { A B C D } - t1 = { A A B B C C D D }. */ - for (i = 0; i < w / 2; ++i) - vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2); - vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); - vt = force_reg (maskmode, vt); - mask = gen_lowpart (maskmode, mask); - if (maskmode == V8SImode) - emit_insn (gen_avx2_permvarv8si (t1, mask, vt)); - else - emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt)); - - /* Multiply the shuffle indicies by two. */ - t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1, - OPTAB_DIRECT); - - /* Add one to the odd shuffle indicies: - t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */ - for (i = 0; i < w / 2; ++i) - { - vec[i * 2] = const0_rtx; - vec[i * 2 + 1] = const1_rtx; - } - vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); - vt = validize_mem (force_const_mem (maskmode, vt)); - t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1, - OPTAB_DIRECT); - - /* Continue as if V8SImode (resp. V32QImode) was used initially. */ - operands[3] = mask = t1; - target = gen_reg_rtx (mode); - op0 = gen_lowpart (mode, op0); - op1 = gen_lowpart (mode, op1); - } - - switch (mode) - { - case E_V8SImode: - /* The VPERMD and VPERMPS instructions already properly ignore - the high bits of the shuffle elements. No need for us to - perform an AND ourselves. */ - if (one_operand_shuffle) - { - emit_insn (gen_avx2_permvarv8si (target, op0, mask)); - if (target != operands[0]) - emit_move_insn (operands[0], - gen_lowpart (GET_MODE (operands[0]), target)); - } - else - { - t1 = gen_reg_rtx (V8SImode); - t2 = gen_reg_rtx (V8SImode); - emit_insn (gen_avx2_permvarv8si (t1, op0, mask)); - emit_insn (gen_avx2_permvarv8si (t2, op1, mask)); - goto merge_two; - } - return; - - case E_V8SFmode: - mask = gen_lowpart (V8SImode, mask); - if (one_operand_shuffle) - emit_insn (gen_avx2_permvarv8sf (target, op0, mask)); - else - { - t1 = gen_reg_rtx (V8SFmode); - t2 = gen_reg_rtx (V8SFmode); - emit_insn (gen_avx2_permvarv8sf (t1, op0, mask)); - emit_insn (gen_avx2_permvarv8sf (t2, op1, mask)); - goto merge_two; - } - return; - - case E_V4SImode: - /* By combining the two 128-bit input vectors into one 256-bit - input vector, we can use VPERMD and VPERMPS for the full - two-operand shuffle. 
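The control widening performed a few lines above (replicate each V4DImode index, double it, add one in the odd slots) turns a qword permutation into the equivalent dword permutation, so vpermd can stand in for the missing variable vpermq. A little-endian scalar check of that equivalence, as an illustration only:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int
main (void)
{
  const uint64_t src64[4] = { 0x1111111100000000ull, 0x3333333322222222ull,
			      0x5555555544444444ull, 0x7777777766666666ull };
  const unsigned mask64[4] = { 3, 0, 2, 1 };	/* arbitrary qword shuffle.  */

  /* Qword index q becomes the dword pair { 2q, 2q + 1 }.  */
  unsigned mask32[8];
  for (int i = 0; i < 8; i++)
    mask32[i] = 2 * mask64[i / 2] + (i & 1);

  uint64_t out64[4];
  for (int i = 0; i < 4; i++)
    out64[i] = src64[mask64[i]];		/* the qword permutation.  */

  uint32_t src32[8], out32[8], flat[8];
  memcpy (src32, src64, sizeof src32);
  for (int i = 0; i < 8; i++)
    out32[i] = src32[mask32[i]];		/* the widened dword one.  */

  memcpy (flat, out64, sizeof flat);
  for (int i = 0; i < 8; i++)
    assert (flat[i] == out32[i]);
  return 0;
}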
*/ - t1 = gen_reg_rtx (V8SImode); - t2 = gen_reg_rtx (V8SImode); - emit_insn (gen_avx_vec_concatv8si (t1, op0, op1)); - emit_insn (gen_avx_vec_concatv8si (t2, mask, mask)); - emit_insn (gen_avx2_permvarv8si (t1, t1, t2)); - emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx)); - return; - - case E_V4SFmode: - t1 = gen_reg_rtx (V8SFmode); - t2 = gen_reg_rtx (V8SImode); - mask = gen_lowpart (V4SImode, mask); - emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1)); - emit_insn (gen_avx_vec_concatv8si (t2, mask, mask)); - emit_insn (gen_avx2_permvarv8sf (t1, t1, t2)); - emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx)); - return; - - case E_V32QImode: - t1 = gen_reg_rtx (V32QImode); - t2 = gen_reg_rtx (V32QImode); - t3 = gen_reg_rtx (V32QImode); - vt2 = GEN_INT (-128); - vt = gen_const_vec_duplicate (V32QImode, vt2); - vt = force_reg (V32QImode, vt); - for (i = 0; i < 32; i++) - vec[i] = i < 16 ? vt2 : const0_rtx; - vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec)); - vt2 = force_reg (V32QImode, vt2); - /* From mask create two adjusted masks, which contain the same - bits as mask in the low 7 bits of each vector element. - The first mask will have the most significant bit clear - if it requests element from the same 128-bit lane - and MSB set if it requests element from the other 128-bit lane. - The second mask will have the opposite values of the MSB, - and additionally will have its 128-bit lanes swapped. - E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have - t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and - t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ... - stands for other 12 bytes. */ - /* The bit whether element is from the same lane or the other - lane is bit 4, so shift it up by 3 to the MSB position. */ - t5 = gen_reg_rtx (V4DImode); - emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask), - GEN_INT (3))); - /* Clear MSB bits from the mask just in case it had them set. */ - emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask)); - /* After this t1 will have MSB set for elements from other lane. */ - emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2)); - /* Clear bits other than MSB. */ - emit_insn (gen_andv32qi3 (t1, t1, vt)); - /* Or in the lower bits from mask into t3. */ - emit_insn (gen_iorv32qi3 (t3, t1, t2)); - /* And invert MSB bits in t1, so MSB is set for elements from the same - lane. */ - emit_insn (gen_xorv32qi3 (t1, t1, vt)); - /* Swap 128-bit lanes in t3. */ - t6 = gen_reg_rtx (V4DImode); - emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3), - const2_rtx, GEN_INT (3), - const0_rtx, const1_rtx)); - /* And or in the lower bits from mask into t1. */ - emit_insn (gen_iorv32qi3 (t1, t1, t2)); - if (one_operand_shuffle) - { - /* Each of these shuffles will put 0s in places where - element from the other 128-bit lane is needed, otherwise - will shuffle in the requested value. */ - emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, - gen_lowpart (V32QImode, t6))); - emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1)); - /* For t3 the 128-bit lanes are swapped again. */ - t7 = gen_reg_rtx (V4DImode); - emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3), - const2_rtx, GEN_INT (3), - const0_rtx, const1_rtx)); - /* And oring both together leads to the result. 
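The cross-lane byte shuffle assembled above can be modelled in scalar terms: vpshufb indexes only within each 128-bit lane and zeroes any byte whose control has the MSB set, so a full 32-byte shuffle is the OR of an in-lane shuffle of the operand and an in-lane shuffle of the operand with its lanes swapped (vpermq with order 2,3,0,1). The sketch below follows that idea rather than the exact mask choreography of the expander; all names are invented for illustration.

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* In-lane byte shuffle: MSB-set control bytes produce 0.  */
static void
pshufb256 (uint8_t *dst, const uint8_t *src, const uint8_t *ctl)
{
  for (int i = 0; i < 32; i++)
    {
      int lane = i & 16;
      dst[i] = (ctl[i] & 0x80) ? 0 : src[lane + (ctl[i] & 15)];
    }
}

/* Swap the two 128-bit lanes (vpermq qword order 2,3,0,1).  */
static void
lane_swap (uint8_t *dst, const uint8_t *src)
{
  memcpy (dst, src + 16, 16);
  memcpy (dst + 16, src, 16);
}

int
main (void)
{
  uint8_t op[32], swapped[32], c_same[32], c_other[32], a[32], b[32], res[32];
  uint8_t mask[32];
  int i;

  for (i = 0; i < 32; i++)
    {
      op[i] = (uint8_t) (100 + i);
      mask[i] = (uint8_t) ((i * 7 + 3) & 31);	/* indices 0..31.  */
    }

  /* One control keeps only the bytes wanted from the same lane, the other
     only the bytes wanted from the opposite lane.  */
  for (i = 0; i < 32; i++)
    {
      int same = ((mask[i] ^ i) & 16) == 0;
      c_same[i] = same ? (uint8_t) (mask[i] & 15) : 0x80;
      c_other[i] = same ? 0x80 : (uint8_t) (mask[i] & 15);
    }

  lane_swap (swapped, op);
  pshufb256 (a, op, c_same);
  pshufb256 (b, swapped, c_other);
  for (i = 0; i < 32; i++)
    res[i] = a[i] | b[i];			/* vpor merges the halves.  */

  for (i = 0; i < 32; i++)
    assert (res[i] == op[mask[i]]);
  return 0;
}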
*/ - emit_insn (gen_iorv32qi3 (target, t1, - gen_lowpart (V32QImode, t7))); - if (target != operands[0]) - emit_move_insn (operands[0], - gen_lowpart (GET_MODE (operands[0]), target)); - return; - } - - t4 = gen_reg_rtx (V32QImode); - /* Similarly to the above one_operand_shuffle code, - just for repeated twice for each operand. merge_two: - code will merge the two results together. */ - emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, - gen_lowpart (V32QImode, t6))); - emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, - gen_lowpart (V32QImode, t6))); - emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1)); - emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1)); - t7 = gen_reg_rtx (V4DImode); - emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4), - const2_rtx, GEN_INT (3), - const0_rtx, const1_rtx)); - t8 = gen_reg_rtx (V4DImode); - emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3), - const2_rtx, GEN_INT (3), - const0_rtx, const1_rtx)); - emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7))); - emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8))); - t1 = t4; - t2 = t3; - goto merge_two; - - default: - gcc_assert (GET_MODE_SIZE (mode) <= 16); - break; - } - } - - if (TARGET_XOP) - { - /* The XOP VPPERM insn supports three inputs. By ignoring the - one_operand_shuffle special case, we avoid creating another - set of constant vectors in memory. */ - one_operand_shuffle = false; - - /* mask = mask & {2*w-1, ...} */ - vt = GEN_INT (2*w - 1); - } - else - { - /* mask = mask & {w-1, ...} */ - vt = GEN_INT (w - 1); - } - - vt = gen_const_vec_duplicate (maskmode, vt); - mask = expand_simple_binop (maskmode, AND, mask, vt, - NULL_RTX, 0, OPTAB_DIRECT); - - /* For non-QImode operations, convert the word permutation control - into a byte permutation control. */ - if (mode != V16QImode) - { - mask = expand_simple_binop (maskmode, ASHIFT, mask, - GEN_INT (exact_log2 (e)), - NULL_RTX, 0, OPTAB_DIRECT); - - /* Convert mask to vector of chars. */ - mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask)); - - /* Replicate each of the input bytes into byte positions: - (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8} - (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12} - (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */ - for (i = 0; i < 16; ++i) - vec[i] = GEN_INT (i/e * e); - vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); - vt = validize_mem (force_const_mem (V16QImode, vt)); - if (TARGET_XOP) - emit_insn (gen_xop_pperm (mask, mask, mask, vt)); - else - emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt)); - - /* Convert it into the byte positions by doing - mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */ - for (i = 0; i < 16; ++i) - vec[i] = GEN_INT (i % e); - vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); - vt = validize_mem (force_const_mem (V16QImode, vt)); - emit_insn (gen_addv16qi3 (mask, mask, vt)); - } - - /* The actual shuffle operations all operate on V16QImode. 
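The control conversion done just above for the SSSE3/XOP path maps a word-level index k, for elements of e bytes, onto the e consecutive byte indices k*e .. k*e + e-1 (mask the index, shift by log2(e), replicate, then add the 0..e-1 pattern). A scalar check of that mapping for V8HImode (e = 2), purely illustrative:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int
main (void)
{
  enum { W = 8, E = 2 };
  const uint16_t src[W] = { 0x1100, 0x3322, 0x5544, 0x7766,
			    0x9988, 0xbbaa, 0xddcc, 0xffee };
  const unsigned widx[W] = { 6, 6, 0, 3, 1, 7, 2, 5 };	/* word shuffle.  */

  /* Byte control: (widx & (W-1)) * E + { 0, 1, 0, 1, ... }.  */
  unsigned bidx[W * E];
  for (int j = 0; j < W * E; j++)
    bidx[j] = (widx[j / E] & (W - 1)) * E + (j % E);

  /* A byte shuffle with bidx must equal the word-level shuffle.  */
  uint8_t srcb[W * E], shuffled[W * E];
  uint16_t expect[W], got[W];
  memcpy (srcb, src, sizeof srcb);
  for (int j = 0; j < W * E; j++)
    shuffled[j] = srcb[bidx[j]];
  for (int k = 0; k < W; k++)
    expect[k] = src[widx[k]];
  memcpy (got, shuffled, sizeof got);
  for (int k = 0; k < W; k++)
    assert (got[k] == expect[k]);
  return 0;
}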
*/ - op0 = gen_lowpart (V16QImode, op0); - op1 = gen_lowpart (V16QImode, op1); - - if (TARGET_XOP) - { - if (GET_MODE (target) != V16QImode) - target = gen_reg_rtx (V16QImode); - emit_insn (gen_xop_pperm (target, op0, op1, mask)); - if (target != operands[0]) - emit_move_insn (operands[0], - gen_lowpart (GET_MODE (operands[0]), target)); - } - else if (one_operand_shuffle) - { - if (GET_MODE (target) != V16QImode) - target = gen_reg_rtx (V16QImode); - emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask)); - if (target != operands[0]) - emit_move_insn (operands[0], - gen_lowpart (GET_MODE (operands[0]), target)); - } - else - { - rtx xops[6]; - bool ok; - - /* Shuffle the two input vectors independently. */ - t1 = gen_reg_rtx (V16QImode); - t2 = gen_reg_rtx (V16QImode); - emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask)); - emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask)); - - merge_two: - /* Then merge them together. The key is whether any given control - element contained a bit set that indicates the second word. */ - mask = operands[3]; - vt = GEN_INT (w); - if (maskmode == V2DImode && !TARGET_SSE4_1) - { - /* Without SSE4.1, we don't have V2DImode EQ. Perform one - more shuffle to convert the V2DI input mask into a V4SI - input mask. At which point the masking that expand_int_vcond - will work as desired. */ - rtx t3 = gen_reg_rtx (V4SImode); - emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask), - const0_rtx, const0_rtx, - const2_rtx, const2_rtx)); - mask = t3; - maskmode = V4SImode; - e = w = 4; - } - - vt = gen_const_vec_duplicate (maskmode, vt); - vt = force_reg (maskmode, vt); - mask = expand_simple_binop (maskmode, AND, mask, vt, - NULL_RTX, 0, OPTAB_DIRECT); - - if (GET_MODE (target) != mode) - target = gen_reg_rtx (mode); - xops[0] = target; - xops[1] = gen_lowpart (mode, t2); - xops[2] = gen_lowpart (mode, t1); - xops[3] = gen_rtx_EQ (maskmode, mask, vt); - xops[4] = mask; - xops[5] = vt; - ok = ix86_expand_int_vcond (xops); - gcc_assert (ok); - if (target != operands[0]) - emit_move_insn (operands[0], - gen_lowpart (GET_MODE (operands[0]), target)); - } -} - -/* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is - true if we should do zero extension, else sign extension. HIGH_P is - true if we want the N/2 high elements, else the low elements. */ - -void -ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) -{ - machine_mode imode = GET_MODE (src); - rtx tmp; - - if (TARGET_SSE4_1) - { - rtx (*unpack)(rtx, rtx); - rtx (*extract)(rtx, rtx) = NULL; - machine_mode halfmode = BLKmode; - - switch (imode) - { - case E_V64QImode: - if (unsigned_p) - unpack = gen_avx512bw_zero_extendv32qiv32hi2; - else - unpack = gen_avx512bw_sign_extendv32qiv32hi2; - halfmode = V32QImode; - extract - = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi; - break; - case E_V32QImode: - if (unsigned_p) - unpack = gen_avx2_zero_extendv16qiv16hi2; - else - unpack = gen_avx2_sign_extendv16qiv16hi2; - halfmode = V16QImode; - extract - = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi; - break; - case E_V32HImode: - if (unsigned_p) - unpack = gen_avx512f_zero_extendv16hiv16si2; - else - unpack = gen_avx512f_sign_extendv16hiv16si2; - halfmode = V16HImode; - extract - = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi; - break; - case E_V16HImode: - if (unsigned_p) - unpack = gen_avx2_zero_extendv8hiv8si2; - else - unpack = gen_avx2_sign_extendv8hiv8si2; - halfmode = V8HImode; - extract - = high_p ? 
gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi; - break; - case E_V16SImode: - if (unsigned_p) - unpack = gen_avx512f_zero_extendv8siv8di2; - else - unpack = gen_avx512f_sign_extendv8siv8di2; - halfmode = V8SImode; - extract - = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si; - break; - case E_V8SImode: - if (unsigned_p) - unpack = gen_avx2_zero_extendv4siv4di2; - else - unpack = gen_avx2_sign_extendv4siv4di2; - halfmode = V4SImode; - extract - = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si; - break; - case E_V16QImode: - if (unsigned_p) - unpack = gen_sse4_1_zero_extendv8qiv8hi2; - else - unpack = gen_sse4_1_sign_extendv8qiv8hi2; - break; - case E_V8HImode: - if (unsigned_p) - unpack = gen_sse4_1_zero_extendv4hiv4si2; - else - unpack = gen_sse4_1_sign_extendv4hiv4si2; - break; - case E_V4SImode: - if (unsigned_p) - unpack = gen_sse4_1_zero_extendv2siv2di2; - else - unpack = gen_sse4_1_sign_extendv2siv2di2; - break; - default: - gcc_unreachable (); - } - - if (GET_MODE_SIZE (imode) >= 32) - { - tmp = gen_reg_rtx (halfmode); - emit_insn (extract (tmp, src)); - } - else if (high_p) - { - /* Shift higher 8 bytes to lower 8 bytes. */ - tmp = gen_reg_rtx (V1TImode); - emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src), - GEN_INT (64))); - tmp = gen_lowpart (imode, tmp); - } - else - tmp = src; - - emit_insn (unpack (dest, tmp)); - } - else - { - rtx (*unpack)(rtx, rtx, rtx); - - switch (imode) - { - case E_V16QImode: - if (high_p) - unpack = gen_vec_interleave_highv16qi; - else - unpack = gen_vec_interleave_lowv16qi; - break; - case E_V8HImode: - if (high_p) - unpack = gen_vec_interleave_highv8hi; - else - unpack = gen_vec_interleave_lowv8hi; - break; - case E_V4SImode: - if (high_p) - unpack = gen_vec_interleave_highv4si; - else - unpack = gen_vec_interleave_lowv4si; - break; - default: - gcc_unreachable (); - } - - if (unsigned_p) - tmp = force_reg (imode, CONST0_RTX (imode)); - else - tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode), - src, pc_rtx, pc_rtx); - - rtx tmp2 = gen_reg_rtx (imode); - emit_insn (unpack (tmp2, src, tmp)); - emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2)); - } -} - -/* Expand conditional increment or decrement using adb/sbb instructions. - The default case using setcc followed by the conditional move can be - done by generic code. */ -bool -ix86_expand_int_addcc (rtx operands[]) -{ - enum rtx_code code = GET_CODE (operands[1]); - rtx flags; - rtx (*insn)(rtx, rtx, rtx, rtx, rtx); - rtx compare_op; - rtx val = const0_rtx; - bool fpcmp = false; - machine_mode mode; - rtx op0 = XEXP (operands[1], 0); - rtx op1 = XEXP (operands[1], 1); - - if (operands[3] != const1_rtx - && operands[3] != constm1_rtx) - return false; - if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op)) - return false; - code = GET_CODE (compare_op); - - flags = XEXP (compare_op, 0); - - if (GET_MODE (flags) == CCFPmode) - { - fpcmp = true; - code = ix86_fp_compare_code_to_integer (code); - } - - if (code != LTU) - { - val = constm1_rtx; - if (fpcmp) - PUT_CODE (compare_op, - reverse_condition_maybe_unordered - (GET_CODE (compare_op))); - else - PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op))); - } - - mode = GET_MODE (operands[0]); - - /* Construct either adc or sbb insn. 
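In source terms the pattern matched here is a conditional increment or decrement by exactly 1; a standalone C sketch of the idiom (illustrative only, the exact instructions emitted depend on the comparison):

#include <stdint.h>

/* Conditional increment: with an unsigned '<' the comparison result is
   just the carry flag, so this is roughly 'cmp a, b; adc counter, 0'.  */
uint32_t count_if_less (uint32_t counter, uint32_t a, uint32_t b)
{
  return counter + (a < b);
}

/* Conditional decrement: roughly 'cmp a, b; sbb counter, 0'.  */
uint32_t drop_if_less (uint32_t counter, uint32_t a, uint32_t b)
{
  return counter - (a < b);
}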
*/ - if ((code == LTU) == (operands[3] == constm1_rtx)) - { - switch (mode) - { - case E_QImode: - insn = gen_subqi3_carry; - break; - case E_HImode: - insn = gen_subhi3_carry; - break; - case E_SImode: - insn = gen_subsi3_carry; - break; - case E_DImode: - insn = gen_subdi3_carry; - break; - default: - gcc_unreachable (); - } - } - else - { - switch (mode) - { - case E_QImode: - insn = gen_addqi3_carry; - break; - case E_HImode: - insn = gen_addhi3_carry; - break; - case E_SImode: - insn = gen_addsi3_carry; - break; - case E_DImode: - insn = gen_adddi3_carry; - break; - default: - gcc_unreachable (); - } - } - emit_insn (insn (operands[0], operands[2], val, flags, compare_op)); - - return true; -} - - -/* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode, - but works for floating pointer parameters and nonoffsetable memories. - For pushes, it returns just stack offsets; the values will be saved - in the right order. Maximally three parts are generated. */ - -static int -ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode) -{ - int size; - - if (!TARGET_64BIT) - size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4; - else - size = (GET_MODE_SIZE (mode) + 4) / 8; - - gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand))); - gcc_assert (size >= 2 && size <= 4); - - /* Optimize constant pool reference to immediates. This is used by fp - moves, that force all constants to memory to allow combining. */ - if (MEM_P (operand) && MEM_READONLY_P (operand)) - operand = avoid_constant_pool_reference (operand); - - if (MEM_P (operand) && !offsettable_memref_p (operand)) - { - /* The only non-offsetable memories we handle are pushes. */ - int ok = push_operand (operand, VOIDmode); - - gcc_assert (ok); - - operand = copy_rtx (operand); - PUT_MODE (operand, word_mode); - parts[0] = parts[1] = parts[2] = parts[3] = operand; - return size; - } - - if (GET_CODE (operand) == CONST_VECTOR) - { - scalar_int_mode imode = int_mode_for_mode (mode).require (); - /* Caution: if we looked through a constant pool memory above, - the operand may actually have a different mode now. That's - ok, since we want to pun this all the way back to an integer. */ - operand = simplify_subreg (imode, operand, GET_MODE (operand), 0); - gcc_assert (operand != NULL); - mode = imode; - } - - if (!TARGET_64BIT) - { - if (mode == DImode) - split_double_mode (mode, &operand, 1, &parts[0], &parts[1]); - else - { - int i; - - if (REG_P (operand)) - { - gcc_assert (reload_completed); - for (i = 0; i < size; i++) - parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i); - } - else if (offsettable_memref_p (operand)) - { - operand = adjust_address (operand, SImode, 0); - parts[0] = operand; - for (i = 1; i < size; i++) - parts[i] = adjust_address (operand, SImode, 4 * i); - } - else if (CONST_DOUBLE_P (operand)) - { - const REAL_VALUE_TYPE *r; - long l[4]; - - r = CONST_DOUBLE_REAL_VALUE (operand); - switch (mode) - { - case E_TFmode: - real_to_target (l, r, mode); - parts[3] = gen_int_mode (l[3], SImode); - parts[2] = gen_int_mode (l[2], SImode); - break; - case E_XFmode: - /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since - long double may not be 80-bit. 
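For the simpler DFmode case handled just below, the effect of the constant splitting can be pictured with ordinary host C (a sketch; the types and byte handling here are assumptions of the example, not target logic):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main (void)
{
  double d = 1.5;
  uint64_t image;
  memcpy (&image, &d, sizeof image);            /* bit image of the constant */

  uint32_t lo = (uint32_t) image;               /* low  SImode part */
  uint32_t hi = (uint32_t) (image >> 32);       /* high SImode part */
  printf ("parts: 0x%08x 0x%08x\n", lo, hi);
  return 0;
}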
*/ - real_to_target (l, r, mode); - parts[2] = gen_int_mode (l[2], SImode); - break; - case E_DFmode: - REAL_VALUE_TO_TARGET_DOUBLE (*r, l); - break; - default: - gcc_unreachable (); - } - parts[1] = gen_int_mode (l[1], SImode); - parts[0] = gen_int_mode (l[0], SImode); - } - else - gcc_unreachable (); - } - } - else - { - if (mode == TImode) - split_double_mode (mode, &operand, 1, &parts[0], &parts[1]); - if (mode == XFmode || mode == TFmode) - { - machine_mode upper_mode = mode==XFmode ? SImode : DImode; - if (REG_P (operand)) - { - gcc_assert (reload_completed); - parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0); - parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1); - } - else if (offsettable_memref_p (operand)) - { - operand = adjust_address (operand, DImode, 0); - parts[0] = operand; - parts[1] = adjust_address (operand, upper_mode, 8); - } - else if (CONST_DOUBLE_P (operand)) - { - long l[4]; - - real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode); - - /* real_to_target puts 32-bit pieces in each long. */ - parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff)) - | ((l[1] & HOST_WIDE_INT_C (0xffffffff)) - << 32), DImode); - - if (upper_mode == SImode) - parts[1] = gen_int_mode (l[2], SImode); - else - parts[1] - = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff)) - | ((l[3] & HOST_WIDE_INT_C (0xffffffff)) - << 32), DImode); - } - else - gcc_unreachable (); - } - } - - return size; -} - -/* Emit insns to perform a move or push of DI, DF, XF, and TF values. - Return false when normal moves are needed; true when all required - insns have been emitted. Operands 2-4 contain the input values - int the correct order; operands 5-7 contain the output values. */ - -void -ix86_split_long_move (rtx operands[]) -{ - rtx part[2][4]; - int nparts, i, j; - int push = 0; - int collisions = 0; - machine_mode mode = GET_MODE (operands[0]); - bool collisionparts[4]; - - /* The DFmode expanders may ask us to move double. - For 64bit target this is single move. By hiding the fact - here we simplify i386.md splitters. */ - if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8) - { - /* Optimize constant pool reference to immediates. This is used by - fp moves, that force all constants to memory to allow combining. */ - - if (MEM_P (operands[1]) - && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF - && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0))) - operands[1] = get_pool_constant (XEXP (operands[1], 0)); - if (push_operand (operands[0], VOIDmode)) - { - operands[0] = copy_rtx (operands[0]); - PUT_MODE (operands[0], word_mode); - } - else - operands[0] = gen_lowpart (DImode, operands[0]); - operands[1] = gen_lowpart (DImode, operands[1]); - emit_move_insn (operands[0], operands[1]); - return; - } - - /* The only non-offsettable memory we handle is push. */ - if (push_operand (operands[0], VOIDmode)) - push = 1; - else - gcc_assert (!MEM_P (operands[0]) - || offsettable_memref_p (operands[0])); - - nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0])); - ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0])); - - /* When emitting push, take care for source operands on the stack. */ - if (push && MEM_P (operands[1]) - && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1])) - { - rtx src_base = XEXP (part[1][nparts - 1], 0); - - /* Compensate for the stack decrement by 4. 
*/ - if (!TARGET_64BIT && nparts == 3 - && mode == XFmode && TARGET_128BIT_LONG_DOUBLE) - src_base = plus_constant (Pmode, src_base, 4); - - /* src_base refers to the stack pointer and is - automatically decreased by emitted push. */ - for (i = 0; i < nparts; i++) - part[1][i] = change_address (part[1][i], - GET_MODE (part[1][i]), src_base); - } - - /* We need to do copy in the right order in case an address register - of the source overlaps the destination. */ - if (REG_P (part[0][0]) && MEM_P (part[1][0])) - { - rtx tmp; - - for (i = 0; i < nparts; i++) - { - collisionparts[i] - = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0)); - if (collisionparts[i]) - collisions++; - } - - /* Collision in the middle part can be handled by reordering. */ - if (collisions == 1 && nparts == 3 && collisionparts [1]) - { - std::swap (part[0][1], part[0][2]); - std::swap (part[1][1], part[1][2]); - } - else if (collisions == 1 - && nparts == 4 - && (collisionparts [1] || collisionparts [2])) - { - if (collisionparts [1]) - { - std::swap (part[0][1], part[0][2]); - std::swap (part[1][1], part[1][2]); - } - else - { - std::swap (part[0][2], part[0][3]); - std::swap (part[1][2], part[1][3]); - } - } - - /* If there are more collisions, we can't handle it by reordering. - Do an lea to the last part and use only one colliding move. */ - else if (collisions > 1) - { - rtx base, addr; - - collisions = 1; - - base = part[0][nparts - 1]; - - /* Handle the case when the last part isn't valid for lea. - Happens in 64-bit mode storing the 12-byte XFmode. */ - if (GET_MODE (base) != Pmode) - base = gen_rtx_REG (Pmode, REGNO (base)); - - addr = XEXP (part[1][0], 0); - if (TARGET_TLS_DIRECT_SEG_REFS) - { - struct ix86_address parts; - int ok = ix86_decompose_address (addr, &parts); - gcc_assert (ok); - /* It is not valid to use %gs: or %fs: in lea. */ - gcc_assert (parts.seg == ADDR_SPACE_GENERIC); - } - emit_insn (gen_rtx_SET (base, addr)); - part[1][0] = replace_equiv_address (part[1][0], base); - for (i = 1; i < nparts; i++) - { - tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i); - part[1][i] = replace_equiv_address (part[1][i], tmp); - } - } - } - - if (push) - { - if (!TARGET_64BIT) - { - if (nparts == 3) - { - if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode) - emit_insn (ix86_gen_add3 (stack_pointer_rtx, - stack_pointer_rtx, GEN_INT (-4))); - emit_move_insn (part[0][2], part[1][2]); - } - else if (nparts == 4) - { - emit_move_insn (part[0][3], part[1][3]); - emit_move_insn (part[0][2], part[1][2]); - } - } - else - { - /* In 64bit mode we don't have 32bit push available. In case this is - register, it is OK - we will just use larger counterpart. We also - retype memory - these comes from attempt to avoid REX prefix on - moving of second half of TFmode value. */ - if (GET_MODE (part[1][1]) == SImode) - { - switch (GET_CODE (part[1][1])) - { - case MEM: - part[1][1] = adjust_address (part[1][1], DImode, 0); - break; - - case REG: - part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1])); - break; - - default: - gcc_unreachable (); - } - - if (GET_MODE (part[1][0]) == SImode) - part[1][0] = part[1][1]; - } - } - emit_move_insn (part[0][1], part[1][1]); - emit_move_insn (part[0][0], part[1][0]); - return; - } - - /* Choose correct order to not overwrite the source before it is copied. 
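A minimal sketch of that decision in plain C (not GCC internals): when the first destination part is also a register the remaining source parts still need for addressing, the word-sized moves are simply emitted in reverse.

#include <stdbool.h>

/* Fill ORDER with the indices in which the NPARTS word-sized moves should
   be emitted: reversed when the low destination would clobber state the
   remaining source parts still depend on.  */
static void choose_move_order (int nparts, bool dest0_clobbers_src,
                               int order[4])
{
  for (int i = 0; i < nparts; i++)
    order[i] = dest0_clobbers_src ? nparts - 1 - i : i;
}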
*/ - if ((REG_P (part[0][0]) - && REG_P (part[1][1]) - && (REGNO (part[0][0]) == REGNO (part[1][1]) - || (nparts == 3 - && REGNO (part[0][0]) == REGNO (part[1][2])) - || (nparts == 4 - && REGNO (part[0][0]) == REGNO (part[1][3])))) - || (collisions > 0 - && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0)))) - { - for (i = 0, j = nparts - 1; i < nparts; i++, j--) - { - operands[2 + i] = part[0][j]; - operands[6 + i] = part[1][j]; - } - } - else - { - for (i = 0; i < nparts; i++) - { - operands[2 + i] = part[0][i]; - operands[6 + i] = part[1][i]; - } - } - - /* If optimizing for size, attempt to locally unCSE nonzero constants. */ - if (optimize_insn_for_size_p ()) - { - for (j = 0; j < nparts - 1; j++) - if (CONST_INT_P (operands[6 + j]) - && operands[6 + j] != const0_rtx - && REG_P (operands[2 + j])) - for (i = j; i < nparts - 1; i++) - if (CONST_INT_P (operands[7 + i]) - && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j])) - operands[7 + i] = operands[2 + j]; - } - - for (i = 0; i < nparts; i++) - emit_move_insn (operands[2 + i], operands[6 + i]); - - return; -} - -/* Helper function of ix86_split_ashl used to generate an SImode/DImode - left shift by a constant, either using a single shift or - a sequence of add instructions. */ - -static void -ix86_expand_ashl_const (rtx operand, int count, machine_mode mode) -{ - rtx (*insn)(rtx, rtx, rtx); - - if (count == 1 - || (count * ix86_cost->add <= ix86_cost->shift_const - && !optimize_insn_for_size_p ())) - { - insn = mode == DImode ? gen_addsi3 : gen_adddi3; - while (count-- > 0) - emit_insn (insn (operand, operand, operand)); - } - else - { - insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3; - emit_insn (insn (operand, operand, GEN_INT (count))); - } -} - -void -ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode) -{ - rtx (*gen_ashl3)(rtx, rtx, rtx); - rtx (*gen_shld)(rtx, rtx, rtx); - int half_width = GET_MODE_BITSIZE (mode) >> 1; - - rtx low[2], high[2]; - int count; - - if (CONST_INT_P (operands[2])) - { - split_double_mode (mode, operands, 2, low, high); - count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); - - if (count >= half_width) - { - emit_move_insn (high[0], low[1]); - emit_move_insn (low[0], const0_rtx); - - if (count > half_width) - ix86_expand_ashl_const (high[0], count - half_width, mode); - } - else - { - gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld; - - if (!rtx_equal_p (operands[0], operands[1])) - emit_move_insn (operands[0], operands[1]); - - emit_insn (gen_shld (high[0], low[0], GEN_INT (count))); - ix86_expand_ashl_const (low[0], count, mode); - } - return; - } - - split_double_mode (mode, operands, 1, low, high); - - gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3; - - if (operands[1] == const1_rtx) - { - /* Assuming we've chosen a QImode capable registers, then 1 << N - can be done with two 32/64-bit shifts, no branches, no cmoves. 
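A standalone C version of that branch-free sequence for the DImode-on-32-bit case (illustrative only):

#include <stdint.h>

/* 1 << n for 0 <= n <= 63 using only 32-bit shifts: bit 5 of the count
   decides which half receives the 1, then both halves are shifted by the
   low five bits (which is all a 32-bit shift looks at anyway).  */
uint64_t one_shl (unsigned n)
{
  uint32_t lo = (n & 32) == 0;     /* setcc on ZF of 'test n, 32' */
  uint32_t hi = (n & 32) != 0;
  lo <<= (n & 31);
  hi <<= (n & 31);
  return (uint64_t) lo | ((uint64_t) hi << 32);
}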
*/ - if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0])) - { - rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG); - - ix86_expand_clear (low[0]); - ix86_expand_clear (high[0]); - emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width))); - - d = gen_lowpart (QImode, low[0]); - d = gen_rtx_STRICT_LOW_PART (VOIDmode, d); - s = gen_rtx_EQ (QImode, flags, const0_rtx); - emit_insn (gen_rtx_SET (d, s)); - - d = gen_lowpart (QImode, high[0]); - d = gen_rtx_STRICT_LOW_PART (VOIDmode, d); - s = gen_rtx_NE (QImode, flags, const0_rtx); - emit_insn (gen_rtx_SET (d, s)); - } - - /* Otherwise, we can get the same results by manually performing - a bit extract operation on bit 5/6, and then performing the two - shifts. The two methods of getting 0/1 into low/high are exactly - the same size. Avoiding the shift in the bit extract case helps - pentium4 a bit; no one else seems to care much either way. */ - else - { - machine_mode half_mode; - rtx (*gen_lshr3)(rtx, rtx, rtx); - rtx (*gen_and3)(rtx, rtx, rtx); - rtx (*gen_xor3)(rtx, rtx, rtx); - HOST_WIDE_INT bits; - rtx x; - - if (mode == DImode) - { - half_mode = SImode; - gen_lshr3 = gen_lshrsi3; - gen_and3 = gen_andsi3; - gen_xor3 = gen_xorsi3; - bits = 5; - } - else - { - half_mode = DImode; - gen_lshr3 = gen_lshrdi3; - gen_and3 = gen_anddi3; - gen_xor3 = gen_xordi3; - bits = 6; - } - - if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ()) - x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]); - else - x = gen_lowpart (half_mode, operands[2]); - emit_insn (gen_rtx_SET (high[0], x)); - - emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits))); - emit_insn (gen_and3 (high[0], high[0], const1_rtx)); - emit_move_insn (low[0], high[0]); - emit_insn (gen_xor3 (low[0], low[0], const1_rtx)); - } - - emit_insn (gen_ashl3 (low[0], low[0], operands[2])); - emit_insn (gen_ashl3 (high[0], high[0], operands[2])); - return; - } - - if (operands[1] == constm1_rtx) - { - /* For -1 << N, we can avoid the shld instruction, because we - know that we're shifting 0...31/63 ones into a -1. */ - emit_move_insn (low[0], constm1_rtx); - if (optimize_insn_for_size_p ()) - emit_move_insn (high[0], low[0]); - else - emit_move_insn (high[0], constm1_rtx); - } - else - { - gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld; - - if (!rtx_equal_p (operands[0], operands[1])) - emit_move_insn (operands[0], operands[1]); - - split_double_mode (mode, operands, 1, low, high); - emit_insn (gen_shld (high[0], low[0], operands[2])); - } - - emit_insn (gen_ashl3 (low[0], low[0], operands[2])); - - if (TARGET_CMOVE && scratch) - { - rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx) - = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1; - - ix86_expand_clear (scratch); - emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch)); - } - else - { - rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx) - = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2; - - emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2])); - } -} - -void -ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode) -{ - rtx (*gen_ashr3)(rtx, rtx, rtx) - = mode == DImode ? 
gen_ashrsi3 : gen_ashrdi3; - rtx (*gen_shrd)(rtx, rtx, rtx); - int half_width = GET_MODE_BITSIZE (mode) >> 1; - - rtx low[2], high[2]; - int count; - - if (CONST_INT_P (operands[2])) - { - split_double_mode (mode, operands, 2, low, high); - count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); - - if (count == GET_MODE_BITSIZE (mode) - 1) - { - emit_move_insn (high[0], high[1]); - emit_insn (gen_ashr3 (high[0], high[0], - GEN_INT (half_width - 1))); - emit_move_insn (low[0], high[0]); - - } - else if (count >= half_width) - { - emit_move_insn (low[0], high[1]); - emit_move_insn (high[0], low[0]); - emit_insn (gen_ashr3 (high[0], high[0], - GEN_INT (half_width - 1))); - - if (count > half_width) - emit_insn (gen_ashr3 (low[0], low[0], - GEN_INT (count - half_width))); - } - else - { - gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; - - if (!rtx_equal_p (operands[0], operands[1])) - emit_move_insn (operands[0], operands[1]); - - emit_insn (gen_shrd (low[0], high[0], GEN_INT (count))); - emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count))); - } - } - else - { - gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; - - if (!rtx_equal_p (operands[0], operands[1])) - emit_move_insn (operands[0], operands[1]); - - split_double_mode (mode, operands, 1, low, high); - - emit_insn (gen_shrd (low[0], high[0], operands[2])); - emit_insn (gen_ashr3 (high[0], high[0], operands[2])); - - if (TARGET_CMOVE && scratch) - { - rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx) - = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1; - - emit_move_insn (scratch, high[0]); - emit_insn (gen_ashr3 (scratch, scratch, - GEN_INT (half_width - 1))); - emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2], - scratch)); - } - else - { - rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx) - = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3; - - emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2])); - } - } -} - -void -ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode) -{ - rtx (*gen_lshr3)(rtx, rtx, rtx) - = mode == DImode ? gen_lshrsi3 : gen_lshrdi3; - rtx (*gen_shrd)(rtx, rtx, rtx); - int half_width = GET_MODE_BITSIZE (mode) >> 1; - - rtx low[2], high[2]; - int count; - - if (CONST_INT_P (operands[2])) - { - split_double_mode (mode, operands, 2, low, high); - count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1); - - if (count >= half_width) - { - emit_move_insn (low[0], high[1]); - ix86_expand_clear (high[0]); - - if (count > half_width) - emit_insn (gen_lshr3 (low[0], low[0], - GEN_INT (count - half_width))); - } - else - { - gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; - - if (!rtx_equal_p (operands[0], operands[1])) - emit_move_insn (operands[0], operands[1]); - - emit_insn (gen_shrd (low[0], high[0], GEN_INT (count))); - emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count))); - } - } - else - { - gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd; - - if (!rtx_equal_p (operands[0], operands[1])) - emit_move_insn (operands[0], operands[1]); - - split_double_mode (mode, operands, 1, low, high); - - emit_insn (gen_shrd (low[0], high[0], operands[2])); - emit_insn (gen_lshr3 (high[0], high[0], operands[2])); - - if (TARGET_CMOVE && scratch) - { - rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx) - = mode == DImode ? 
gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1; - - ix86_expand_clear (scratch); - emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2], - scratch)); - } - else - { - rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx) - = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2; - - emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2])); - } - } -} - -/* Predict just emitted jump instruction to be taken with probability PROB. */ -static void -predict_jump (int prob) -{ - rtx_insn *insn = get_last_insn (); - gcc_assert (JUMP_P (insn)); - add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob)); -} - -/* Helper function for the string operations below. Dest VARIABLE whether - it is aligned to VALUE bytes. If true, jump to the label. */ -static rtx_code_label * -ix86_expand_aligntest (rtx variable, int value, bool epilogue) -{ - rtx_code_label *label = gen_label_rtx (); - rtx tmpcount = gen_reg_rtx (GET_MODE (variable)); - if (GET_MODE (variable) == DImode) - emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value))); - else - emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value))); - emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable), - 1, label); - if (epilogue) - predict_jump (REG_BR_PROB_BASE * 50 / 100); - else - predict_jump (REG_BR_PROB_BASE * 90 / 100); - return label; -} - -/* Adjust COUNTER by the VALUE. */ -static void -ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value) -{ - rtx (*gen_add)(rtx, rtx, rtx) - = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3; - - emit_insn (gen_add (countreg, countreg, GEN_INT (-value))); -} - -/* Zero extend possibly SImode EXP to Pmode register. */ -rtx -ix86_zero_extend_to_Pmode (rtx exp) -{ - return force_reg (Pmode, convert_to_mode (Pmode, exp, 1)); -} - -/* Divide COUNTREG by SCALE. */ -static rtx -scale_counter (rtx countreg, int scale) -{ - rtx sc; - - if (scale == 1) - return countreg; - if (CONST_INT_P (countreg)) - return GEN_INT (INTVAL (countreg) / scale); - gcc_assert (REG_P (countreg)); - - sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg, - GEN_INT (exact_log2 (scale)), - NULL, 1, OPTAB_DIRECT); - return sc; -} - -/* Return mode for the memcpy/memset loop counter. Prefer SImode over - DImode for constant loop counts. */ - -static machine_mode -counter_mode (rtx count_exp) -{ - if (GET_MODE (count_exp) != VOIDmode) - return GET_MODE (count_exp); - if (!CONST_INT_P (count_exp)) - return Pmode; - if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff)) - return DImode; - return SImode; -} - -/* Copy the address to a Pmode register. This is used for x32 to - truncate DImode TLS address to a SImode register. */ - -static rtx -ix86_copy_addr_to_reg (rtx addr) -{ - rtx reg; - if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode) - { - reg = copy_addr_to_reg (addr); - REG_POINTER (reg) = 1; - return reg; - } - else - { - gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode); - reg = copy_to_mode_reg (DImode, addr); - REG_POINTER (reg) = 1; - return gen_rtx_SUBREG (SImode, reg, 0); - } -} - -/* When ISSETMEM is FALSE, output simple loop to move memory pointer to SRCPTR - to DESTPTR via chunks of MODE unrolled UNROLL times, overall size is COUNT - specified in bytes. When ISSETMEM is TRUE, output the equivalent loop to set - memory by VALUE (supposed to be in MODE). - - The size is rounded down to whole number of chunk size moved at once. - SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. 
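As a rough standalone C picture of the loop shape this helper emits for the copy case (memcpy stands in for the unrolled register moves; the names are just for the example):

#include <stddef.h>
#include <string.h>

static void copy_via_loop (char *dest, const char *src, size_t count)
{
  const size_t piece = 4 * sizeof (long);   /* GET_MODE_SIZE (mode) * unroll */
  size_t size = count & ~(piece - 1);       /* whole pieces only */

  for (size_t iter = 0; iter < size; iter += piece)
    memcpy (dest + iter, src + iter, piece);

  /* dest += size; src += size;  -- the remainder goes to the epilogue.  */
}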
*/ - - -static void -expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem, - rtx destptr, rtx srcptr, rtx value, - rtx count, machine_mode mode, int unroll, - int expected_size, bool issetmem) -{ - rtx_code_label *out_label, *top_label; - rtx iter, tmp; - machine_mode iter_mode = counter_mode (count); - int piece_size_n = GET_MODE_SIZE (mode) * unroll; - rtx piece_size = GEN_INT (piece_size_n); - rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1)); - rtx size; - int i; - - top_label = gen_label_rtx (); - out_label = gen_label_rtx (); - iter = gen_reg_rtx (iter_mode); - - size = expand_simple_binop (iter_mode, AND, count, piece_size_mask, - NULL, 1, OPTAB_DIRECT); - /* Those two should combine. */ - if (piece_size == const1_rtx) - { - emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode, - true, out_label); - predict_jump (REG_BR_PROB_BASE * 10 / 100); - } - emit_move_insn (iter, const0_rtx); - - emit_label (top_label); - - tmp = convert_modes (Pmode, iter_mode, iter, true); - - /* This assert could be relaxed - in this case we'll need to compute - smallest power of two, containing in PIECE_SIZE_N and pass it to - offset_address. */ - gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0); - destmem = offset_address (destmem, tmp, piece_size_n); - destmem = adjust_address (destmem, mode, 0); - - if (!issetmem) - { - srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n); - srcmem = adjust_address (srcmem, mode, 0); - - /* When unrolling for chips that reorder memory reads and writes, - we can save registers by using single temporary. - Also using 4 temporaries is overkill in 32bit mode. */ - if (!TARGET_64BIT && 0) - { - for (i = 0; i < unroll; i++) - { - if (i) - { - destmem = adjust_address (copy_rtx (destmem), mode, - GET_MODE_SIZE (mode)); - srcmem = adjust_address (copy_rtx (srcmem), mode, - GET_MODE_SIZE (mode)); - } - emit_move_insn (destmem, srcmem); - } - } - else - { - rtx tmpreg[4]; - gcc_assert (unroll <= 4); - for (i = 0; i < unroll; i++) - { - tmpreg[i] = gen_reg_rtx (mode); - if (i) - srcmem = adjust_address (copy_rtx (srcmem), mode, - GET_MODE_SIZE (mode)); - emit_move_insn (tmpreg[i], srcmem); - } - for (i = 0; i < unroll; i++) - { - if (i) - destmem = adjust_address (copy_rtx (destmem), mode, - GET_MODE_SIZE (mode)); - emit_move_insn (destmem, tmpreg[i]); - } - } - } - else - for (i = 0; i < unroll; i++) - { - if (i) - destmem = adjust_address (copy_rtx (destmem), mode, - GET_MODE_SIZE (mode)); - emit_move_insn (destmem, value); - } - - tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter, - true, OPTAB_LIB_WIDEN); - if (tmp != iter) - emit_move_insn (iter, tmp); - - emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode, - true, top_label); - if (expected_size != -1) - { - expected_size /= GET_MODE_SIZE (mode) * unroll; - if (expected_size == 0) - predict_jump (0); - else if (expected_size > REG_BR_PROB_BASE) - predict_jump (REG_BR_PROB_BASE - 1); - else - predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) - / expected_size); - } - else - predict_jump (REG_BR_PROB_BASE * 80 / 100); - iter = ix86_zero_extend_to_Pmode (iter); - tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr, - true, OPTAB_LIB_WIDEN); - if (tmp != destptr) - emit_move_insn (destptr, tmp); - if (!issetmem) - { - tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr, - true, OPTAB_LIB_WIDEN); - if (tmp != srcptr) - emit_move_insn (srcptr, tmp); - } - emit_label (out_label); -} - -/* Output "rep; mov" or 
"rep; stos" instruction depending on ISSETMEM argument. - When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored. - When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored. - For setmem case, VALUE is a promoted to a wider size ORIG_VALUE. - ORIG_VALUE is the original value passed to memset to fill the memory with. - Other arguments have same meaning as for previous function. */ - -static void -expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem, - rtx destptr, rtx srcptr, rtx value, rtx orig_value, - rtx count, - machine_mode mode, bool issetmem) -{ - rtx destexp; - rtx srcexp; - rtx countreg; - HOST_WIDE_INT rounded_count; - - /* If possible, it is shorter to use rep movs. - TODO: Maybe it is better to move this logic to decide_alg. */ - if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3) - && (!issetmem || orig_value == const0_rtx)) - mode = SImode; - - if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode) - destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0); - - countreg = ix86_zero_extend_to_Pmode (scale_counter (count, - GET_MODE_SIZE (mode))); - if (mode != QImode) - { - destexp = gen_rtx_ASHIFT (Pmode, countreg, - GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); - destexp = gen_rtx_PLUS (Pmode, destexp, destptr); - } - else - destexp = gen_rtx_PLUS (Pmode, destptr, countreg); - if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count)) - { - rounded_count - = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode)); - destmem = shallow_copy_rtx (destmem); - set_mem_size (destmem, rounded_count); - } - else if (MEM_SIZE_KNOWN_P (destmem)) - clear_mem_size (destmem); - - if (issetmem) - { - value = force_reg (mode, gen_lowpart (mode, value)); - emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp)); - } - else - { - if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode) - srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0); - if (mode != QImode) - { - srcexp = gen_rtx_ASHIFT (Pmode, countreg, - GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); - srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr); - } - else - srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg); - if (CONST_INT_P (count)) - { - rounded_count - = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode)); - srcmem = shallow_copy_rtx (srcmem); - set_mem_size (srcmem, rounded_count); - } - else - { - if (MEM_SIZE_KNOWN_P (srcmem)) - clear_mem_size (srcmem); - } - emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg, - destexp, srcexp)); - } -} - -/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to - DESTMEM. - SRC is passed by pointer to be updated on return. - Return value is updated DST. */ -static rtx -emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr, - HOST_WIDE_INT size_to_move) -{ - rtx dst = destmem, src = *srcmem, adjust, tempreg; - enum insn_code code; - machine_mode move_mode; - int piece_size, i; - - /* Find the widest mode in which we could perform moves. - Start with the biggest power of 2 less than SIZE_TO_MOVE and half - it until move of such size is supported. */ - piece_size = 1 << floor_log2 (size_to_move); - while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode) - || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing) - { - gcc_assert (piece_size > 1); - piece_size >>= 1; - } - - /* Find the corresponding vector mode with the same size as MOVE_MODE. 
- MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */ - if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode)) - { - int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode); - if (!mode_for_vector (word_mode, nunits).exists (&move_mode) - || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing) - { - move_mode = word_mode; - piece_size = GET_MODE_SIZE (move_mode); - code = optab_handler (mov_optab, move_mode); - } - } - gcc_assert (code != CODE_FOR_nothing); - - dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0); - src = adjust_automodify_address_nv (src, move_mode, srcptr, 0); - - /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */ - gcc_assert (size_to_move % piece_size == 0); - adjust = GEN_INT (piece_size); - for (i = 0; i < size_to_move; i += piece_size) - { - /* We move from memory to memory, so we'll need to do it via - a temporary register. */ - tempreg = gen_reg_rtx (move_mode); - emit_insn (GEN_FCN (code) (tempreg, src)); - emit_insn (GEN_FCN (code) (dst, tempreg)); - - emit_move_insn (destptr, - gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust)); - emit_move_insn (srcptr, - gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust)); - - dst = adjust_automodify_address_nv (dst, move_mode, destptr, - piece_size); - src = adjust_automodify_address_nv (src, move_mode, srcptr, - piece_size); - } - - /* Update DST and SRC rtx. */ - *srcmem = src; - return dst; -} - -/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */ -static void -expand_movmem_epilogue (rtx destmem, rtx srcmem, - rtx destptr, rtx srcptr, rtx count, int max_size) -{ - rtx src, dest; - if (CONST_INT_P (count)) - { - HOST_WIDE_INT countval = INTVAL (count); - HOST_WIDE_INT epilogue_size = countval % max_size; - int i; - - /* For now MAX_SIZE should be a power of 2. This assert could be - relaxed, but it'll require a bit more complicated epilogue - expanding. */ - gcc_assert ((max_size & (max_size - 1)) == 0); - for (i = max_size; i >= 1; i >>= 1) - { - if (epilogue_size & i) - destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i); - } - return; - } - if (max_size > 8) - { - count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1), - count, 1, OPTAB_DIRECT); - expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL, - count, QImode, 1, 4, false); - return; - } - - /* When there are stringops, we can cheaply increase dest and src pointers. - Otherwise we save code size by maintaining offset (zero is readily - available from preceding rep operation) and using x86 addressing modes. 
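For the constant-count branch above, the epilogue boils down to one move per set bit of the remaining byte count; a standalone C equivalent (illustrative only):

#include <stddef.h>
#include <string.h>

static void copy_epilogue (char *dest, const char *src, size_t count)
{
  size_t rem = count % 16;                  /* max_size == 16 in this example */
  size_t off = 0;

  for (size_t piece = 8; piece >= 1; piece >>= 1)
    if (rem & piece)
      {
        memcpy (dest + off, src + off, piece);
        off += piece;
      }
}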
- */ - if (TARGET_SINGLE_STRINGOP) - { - if (max_size > 4) - { - rtx_code_label *label = ix86_expand_aligntest (count, 4, true); - src = change_address (srcmem, SImode, srcptr); - dest = change_address (destmem, SImode, destptr); - emit_insn (gen_strmov (destptr, dest, srcptr, src)); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (max_size > 2) - { - rtx_code_label *label = ix86_expand_aligntest (count, 2, true); - src = change_address (srcmem, HImode, srcptr); - dest = change_address (destmem, HImode, destptr); - emit_insn (gen_strmov (destptr, dest, srcptr, src)); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (max_size > 1) - { - rtx_code_label *label = ix86_expand_aligntest (count, 1, true); - src = change_address (srcmem, QImode, srcptr); - dest = change_address (destmem, QImode, destptr); - emit_insn (gen_strmov (destptr, dest, srcptr, src)); - emit_label (label); - LABEL_NUSES (label) = 1; - } - } - else - { - rtx offset = force_reg (Pmode, const0_rtx); - rtx tmp; - - if (max_size > 4) - { - rtx_code_label *label = ix86_expand_aligntest (count, 4, true); - src = change_address (srcmem, SImode, srcptr); - dest = change_address (destmem, SImode, destptr); - emit_move_insn (dest, src); - tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL, - true, OPTAB_LIB_WIDEN); - if (tmp != offset) - emit_move_insn (offset, tmp); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (max_size > 2) - { - rtx_code_label *label = ix86_expand_aligntest (count, 2, true); - tmp = gen_rtx_PLUS (Pmode, srcptr, offset); - src = change_address (srcmem, HImode, tmp); - tmp = gen_rtx_PLUS (Pmode, destptr, offset); - dest = change_address (destmem, HImode, tmp); - emit_move_insn (dest, src); - tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp, - true, OPTAB_LIB_WIDEN); - if (tmp != offset) - emit_move_insn (offset, tmp); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (max_size > 1) - { - rtx_code_label *label = ix86_expand_aligntest (count, 1, true); - tmp = gen_rtx_PLUS (Pmode, srcptr, offset); - src = change_address (srcmem, QImode, tmp); - tmp = gen_rtx_PLUS (Pmode, destptr, offset); - dest = change_address (destmem, QImode, tmp); - emit_move_insn (dest, src); - emit_label (label); - LABEL_NUSES (label) = 1; - } - } -} - -/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM - with value PROMOTED_VAL. - SRC is passed by pointer to be updated on return. - Return value is updated DST. */ -static rtx -emit_memset (rtx destmem, rtx destptr, rtx promoted_val, - HOST_WIDE_INT size_to_move) -{ - rtx dst = destmem, adjust; - enum insn_code code; - machine_mode move_mode; - int piece_size, i; - - /* Find the widest mode in which we could perform moves. - Start with the biggest power of 2 less than SIZE_TO_MOVE and half - it until move of such size is supported. */ - move_mode = GET_MODE (promoted_val); - if (move_mode == VOIDmode) - move_mode = QImode; - if (size_to_move < GET_MODE_SIZE (move_mode)) - { - unsigned int move_bits = size_to_move * BITS_PER_UNIT; - move_mode = int_mode_for_size (move_bits, 0).require (); - promoted_val = gen_lowpart (move_mode, promoted_val); - } - piece_size = GET_MODE_SIZE (move_mode); - code = optab_handler (mov_optab, move_mode); - gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX); - - dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0); - - /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. 
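The promoted fill value consumed here is produced elsewhere (see promote_duplicated_reg below); in standalone C the widening amounts to roughly:

#include <stdint.h>

/* Widen a QImode fill byte 0xXY to 0xXYXYXYXY (or the DImode equivalent)
   so each store covers several bytes at once.  */
static uint64_t promote_fill_byte (uint8_t byte, int width_in_bytes)
{
  uint64_t v = byte;
  v |= v << 8;
  v |= v << 16;
  if (width_in_bytes == 8)
    v |= v << 32;
  return v;
}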
*/ - gcc_assert (size_to_move % piece_size == 0); - adjust = GEN_INT (piece_size); - for (i = 0; i < size_to_move; i += piece_size) - { - if (piece_size <= GET_MODE_SIZE (word_mode)) - { - emit_insn (gen_strset (destptr, dst, promoted_val)); - dst = adjust_automodify_address_nv (dst, move_mode, destptr, - piece_size); - continue; - } - - emit_insn (GEN_FCN (code) (dst, promoted_val)); - - emit_move_insn (destptr, - gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust)); - - dst = adjust_automodify_address_nv (dst, move_mode, destptr, - piece_size); - } - - /* Update DST rtx. */ - return dst; -} -/* Output code to set at most count & (max_size - 1) bytes starting by DEST. */ -static void -expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value, - rtx count, int max_size) -{ - count = expand_simple_binop (counter_mode (count), AND, count, - GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT); - expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL, - gen_lowpart (QImode, value), count, QImode, - 1, max_size / 2, true); -} - -/* Output code to set at most count & (max_size - 1) bytes starting by DEST. */ -static void -expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value, - rtx count, int max_size) -{ - rtx dest; - - if (CONST_INT_P (count)) - { - HOST_WIDE_INT countval = INTVAL (count); - HOST_WIDE_INT epilogue_size = countval % max_size; - int i; - - /* For now MAX_SIZE should be a power of 2. This assert could be - relaxed, but it'll require a bit more complicated epilogue - expanding. */ - gcc_assert ((max_size & (max_size - 1)) == 0); - for (i = max_size; i >= 1; i >>= 1) - { - if (epilogue_size & i) - { - if (vec_value && i > GET_MODE_SIZE (GET_MODE (value))) - destmem = emit_memset (destmem, destptr, vec_value, i); - else - destmem = emit_memset (destmem, destptr, value, i); - } - } - return; - } - if (max_size > 32) - { - expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size); - return; - } - if (max_size > 16) - { - rtx_code_label *label = ix86_expand_aligntest (count, 16, true); - if (TARGET_64BIT) - { - dest = change_address (destmem, DImode, destptr); - emit_insn (gen_strset (destptr, dest, value)); - dest = adjust_automodify_address_nv (dest, DImode, destptr, 8); - emit_insn (gen_strset (destptr, dest, value)); - } - else - { - dest = change_address (destmem, SImode, destptr); - emit_insn (gen_strset (destptr, dest, value)); - dest = adjust_automodify_address_nv (dest, SImode, destptr, 4); - emit_insn (gen_strset (destptr, dest, value)); - dest = adjust_automodify_address_nv (dest, SImode, destptr, 8); - emit_insn (gen_strset (destptr, dest, value)); - dest = adjust_automodify_address_nv (dest, SImode, destptr, 12); - emit_insn (gen_strset (destptr, dest, value)); - } - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (max_size > 8) - { - rtx_code_label *label = ix86_expand_aligntest (count, 8, true); - if (TARGET_64BIT) - { - dest = change_address (destmem, DImode, destptr); - emit_insn (gen_strset (destptr, dest, value)); - } - else - { - dest = change_address (destmem, SImode, destptr); - emit_insn (gen_strset (destptr, dest, value)); - dest = adjust_automodify_address_nv (dest, SImode, destptr, 4); - emit_insn (gen_strset (destptr, dest, value)); - } - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (max_size > 4) - { - rtx_code_label *label = ix86_expand_aligntest (count, 4, true); - dest = change_address (destmem, SImode, destptr); - emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value))); - 
emit_label (label); - LABEL_NUSES (label) = 1; - } - if (max_size > 2) - { - rtx_code_label *label = ix86_expand_aligntest (count, 2, true); - dest = change_address (destmem, HImode, destptr); - emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value))); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (max_size > 1) - { - rtx_code_label *label = ix86_expand_aligntest (count, 1, true); - dest = change_address (destmem, QImode, destptr); - emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value))); - emit_label (label); - LABEL_NUSES (label) = 1; - } -} - -/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to - DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN. - Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are - ignored. - Return value is updated DESTMEM. */ -static rtx -expand_set_or_movmem_prologue (rtx destmem, rtx srcmem, - rtx destptr, rtx srcptr, rtx value, - rtx vec_value, rtx count, int align, - int desired_alignment, bool issetmem) -{ - int i; - for (i = 1; i < desired_alignment; i <<= 1) - { - if (align <= i) - { - rtx_code_label *label = ix86_expand_aligntest (destptr, i, false); - if (issetmem) - { - if (vec_value && i > GET_MODE_SIZE (GET_MODE (value))) - destmem = emit_memset (destmem, destptr, vec_value, i); - else - destmem = emit_memset (destmem, destptr, value, i); - } - else - destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i); - ix86_adjust_counter (count, i); - emit_label (label); - LABEL_NUSES (label) = 1; - set_mem_align (destmem, i * 2 * BITS_PER_UNIT); - } - } - return destmem; -} - -/* Test if COUNT&SIZE is nonzero and if so, expand movme - or setmem sequence that is valid for SIZE..2*SIZE-1 bytes - and jump to DONE_LABEL. */ -static void -expand_small_movmem_or_setmem (rtx destmem, rtx srcmem, - rtx destptr, rtx srcptr, - rtx value, rtx vec_value, - rtx count, int size, - rtx done_label, bool issetmem) -{ - rtx_code_label *label = ix86_expand_aligntest (count, size, false); - machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk (); - rtx modesize; - int n; - - /* If we do not have vector value to copy, we must reduce size. */ - if (issetmem) - { - if (!vec_value) - { - if (GET_MODE (value) == VOIDmode && size > 8) - mode = Pmode; - else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value))) - mode = GET_MODE (value); - } - else - mode = GET_MODE (vec_value), value = vec_value; - } - else - { - /* Choose appropriate vector mode. */ - if (size >= 32) - mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode; - else if (size >= 16) - mode = TARGET_SSE ? 
V16QImode : DImode; - srcmem = change_address (srcmem, mode, srcptr); - } - destmem = change_address (destmem, mode, destptr); - modesize = GEN_INT (GET_MODE_SIZE (mode)); - gcc_assert (GET_MODE_SIZE (mode) <= size); - for (n = 0; n * GET_MODE_SIZE (mode) < size; n++) - { - if (issetmem) - emit_move_insn (destmem, gen_lowpart (mode, value)); - else - { - emit_move_insn (destmem, srcmem); - srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode)); - } - destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode)); - } - - destmem = offset_address (destmem, count, 1); - destmem = offset_address (destmem, GEN_INT (-2 * size), - GET_MODE_SIZE (mode)); - if (!issetmem) - { - srcmem = offset_address (srcmem, count, 1); - srcmem = offset_address (srcmem, GEN_INT (-2 * size), - GET_MODE_SIZE (mode)); - } - for (n = 0; n * GET_MODE_SIZE (mode) < size; n++) - { - if (issetmem) - emit_move_insn (destmem, gen_lowpart (mode, value)); - else - { - emit_move_insn (destmem, srcmem); - srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode)); - } - destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode)); - } - emit_jump_insn (gen_jump (done_label)); - emit_barrier (); - - emit_label (label); - LABEL_NUSES (label) = 1; -} - -/* Handle small memcpy (up to SIZE that is supposed to be small power of 2. - and get ready for the main memcpy loop by copying iniital DESIRED_ALIGN-ALIGN - bytes and last SIZE bytes adjusitng DESTPTR/SRCPTR/COUNT in a way we can - proceed with an loop copying SIZE bytes at once. Do moves in MODE. - DONE_LABEL is a label after the whole copying sequence. The label is created - on demand if *DONE_LABEL is NULL. - MIN_SIZE is minimal size of block copied. This value gets adjusted for new - bounds after the initial copies. - - DESTMEM/SRCMEM are memory expressions pointing to the copies block, - DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicate whether - we will dispatch to a library call for large blocks. - - In pseudocode we do: - - if (COUNT < SIZE) - { - Assume that SIZE is 4. Bigger sizes are handled analogously - if (COUNT & 4) - { - copy 4 bytes from SRCPTR to DESTPTR - copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4 - goto done_label - } - if (!COUNT) - goto done_label; - copy 1 byte from SRCPTR to DESTPTR - if (COUNT & 2) - { - copy 2 bytes from SRCPTR to DESTPTR - copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2 - } - } - else - { - copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR - copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE - - OLD_DESPTR = DESTPTR; - Align DESTPTR up to DESIRED_ALIGN - SRCPTR += DESTPTR - OLD_DESTPTR - COUNT -= DEST_PTR - OLD_DESTPTR - if (DYNAMIC_CHECK) - Round COUNT down to multiple of SIZE - << optional caller supplied zero size guard is here >> - << optional caller supplied dynamic check is here >> - << caller supplied main copy loop is here >> - } - done_label: - */ -static void -expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem, - rtx *destptr, rtx *srcptr, - machine_mode mode, - rtx value, rtx vec_value, - rtx *count, - rtx_code_label **done_label, - int size, - int desired_align, - int align, - unsigned HOST_WIDE_INT *min_size, - bool dynamic_check, - bool issetmem) -{ - rtx_code_label *loop_label = NULL, *label; - int n; - rtx modesize; - int prolog_size = 0; - rtx mode_value; - - /* Chose proper value to copy. 
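The pseudocode above relies on the overlapping head/tail trick of expand_small_movmem_or_setmem; a standalone C illustration for SIZE == 4 (illustrative only, assuming the two buffers themselves do not overlap):

#include <stddef.h>
#include <string.h>

/* Valid for 4 <= count < 8: one 4-byte copy from the start and one 4-byte
   copy ending at the last byte cover the whole block; the two destination
   writes may overlap in the middle, which is harmless.  */
static void copy_4_to_7 (char *dest, const char *src, size_t count)
{
  memcpy (dest, src, 4);
  memcpy (dest + count - 4, src + count - 4, 4);
}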
*/ - if (issetmem && VECTOR_MODE_P (mode)) - mode_value = vec_value; - else - mode_value = value; - gcc_assert (GET_MODE_SIZE (mode) <= size); - - /* See if block is big or small, handle small blocks. */ - if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size) - { - int size2 = size; - loop_label = gen_label_rtx (); - - if (!*done_label) - *done_label = gen_label_rtx (); - - emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count), - 1, loop_label); - size2 >>= 1; - - /* Handle sizes > 3. */ - for (;size2 > 2; size2 >>= 1) - expand_small_movmem_or_setmem (destmem, srcmem, - *destptr, *srcptr, - value, vec_value, - *count, - size2, *done_label, issetmem); - /* Nothing to copy? Jump to DONE_LABEL if so */ - emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count), - 1, *done_label); - - /* Do a byte copy. */ - destmem = change_address (destmem, QImode, *destptr); - if (issetmem) - emit_move_insn (destmem, gen_lowpart (QImode, value)); - else - { - srcmem = change_address (srcmem, QImode, *srcptr); - emit_move_insn (destmem, srcmem); - } - - /* Handle sizes 2 and 3. */ - label = ix86_expand_aligntest (*count, 2, false); - destmem = change_address (destmem, HImode, *destptr); - destmem = offset_address (destmem, *count, 1); - destmem = offset_address (destmem, GEN_INT (-2), 2); - if (issetmem) - emit_move_insn (destmem, gen_lowpart (HImode, value)); - else - { - srcmem = change_address (srcmem, HImode, *srcptr); - srcmem = offset_address (srcmem, *count, 1); - srcmem = offset_address (srcmem, GEN_INT (-2), 2); - emit_move_insn (destmem, srcmem); - } - - emit_label (label); - LABEL_NUSES (label) = 1; - emit_jump_insn (gen_jump (*done_label)); - emit_barrier (); - } - else - gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size - || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size); - - /* Start memcpy for COUNT >= SIZE. */ - if (loop_label) - { - emit_label (loop_label); - LABEL_NUSES (loop_label) = 1; - } - - /* Copy first desired_align bytes. */ - if (!issetmem) - srcmem = change_address (srcmem, mode, *srcptr); - destmem = change_address (destmem, mode, *destptr); - modesize = GEN_INT (GET_MODE_SIZE (mode)); - for (n = 0; prolog_size < desired_align - align; n++) - { - if (issetmem) - emit_move_insn (destmem, mode_value); - else - { - emit_move_insn (destmem, srcmem); - srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode)); - } - destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode)); - prolog_size += GET_MODE_SIZE (mode); - } - - - /* Copy last SIZE bytes. */ - destmem = offset_address (destmem, *count, 1); - destmem = offset_address (destmem, - GEN_INT (-size - prolog_size), - 1); - if (issetmem) - emit_move_insn (destmem, mode_value); - else - { - srcmem = offset_address (srcmem, *count, 1); - srcmem = offset_address (srcmem, - GEN_INT (-size - prolog_size), - 1); - emit_move_insn (destmem, srcmem); - } - for (n = 1; n * GET_MODE_SIZE (mode) < size; n++) - { - destmem = offset_address (destmem, modesize, 1); - if (issetmem) - emit_move_insn (destmem, mode_value); - else - { - srcmem = offset_address (srcmem, modesize, 1); - emit_move_insn (destmem, srcmem); - } - } - - /* Align destination. */ - if (desired_align > 1 && desired_align > align) - { - rtx saveddest = *destptr; - - gcc_assert (desired_align <= size); - /* Align destptr up, place it to new register. 
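In standalone C (pointers treated as integers; illustrative only), the adjustment that follows is:

#include <stdint.h>
#include <stddef.h>

static void align_dest (uintptr_t *destp, uintptr_t *srcp, size_t *count,
                        size_t prolog_size, size_t desired_align)
{
  uintptr_t old_dest = *destp;

  /* Step past the bytes the prologue already covered, then round down
     to the desired alignment (desired_align is a power of two).  */
  *destp = (*destp + prolog_size) & ~(uintptr_t) (desired_align - 1);

  size_t skipped = (size_t) (*destp - old_dest);
  *srcp += skipped;           /* keep the source pointer in step */
  *count -= skipped;          /* those bytes were handled by the prologue */
}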
*/ - *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr, - GEN_INT (prolog_size), - NULL_RTX, 1, OPTAB_DIRECT); - if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest)) - REG_POINTER (*destptr) = 1; - *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr, - GEN_INT (-desired_align), - *destptr, 1, OPTAB_DIRECT); - /* See how many bytes we skipped. */ - saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest, - *destptr, - saveddest, 1, OPTAB_DIRECT); - /* Adjust srcptr and count. */ - if (!issetmem) - *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr, - saveddest, *srcptr, 1, OPTAB_DIRECT); - *count = expand_simple_binop (GET_MODE (*count), PLUS, *count, - saveddest, *count, 1, OPTAB_DIRECT); - /* We copied at most size + prolog_size. */ - if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size)) - *min_size - = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size); - else - *min_size = 0; - - /* Our loops always round down the block size, but for dispatch to - library we need precise value. */ - if (dynamic_check) - *count = expand_simple_binop (GET_MODE (*count), AND, *count, - GEN_INT (-size), *count, 1, OPTAB_DIRECT); - } - else - { - gcc_assert (prolog_size == 0); - /* Decrease count, so we won't end up copying last word twice. */ - if (!CONST_INT_P (*count)) - *count = expand_simple_binop (GET_MODE (*count), PLUS, *count, - constm1_rtx, *count, 1, OPTAB_DIRECT); - else - *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1, - (unsigned HOST_WIDE_INT)size)); - if (*min_size) - *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size); - } -} - - -/* This function is like the previous one, except here we know how many bytes - need to be copied. That allows us to update alignment not only of DST, which - is returned, but also of SRC, which is passed as a pointer for that - reason. 
*/ -static rtx -expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg, - rtx srcreg, rtx value, rtx vec_value, - int desired_align, int align_bytes, - bool issetmem) -{ - rtx src = NULL; - rtx orig_dst = dst; - rtx orig_src = NULL; - int piece_size = 1; - int copied_bytes = 0; - - if (!issetmem) - { - gcc_assert (srcp != NULL); - src = *srcp; - orig_src = src; - } - - for (piece_size = 1; - piece_size <= desired_align && copied_bytes < align_bytes; - piece_size <<= 1) - { - if (align_bytes & piece_size) - { - if (issetmem) - { - if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value))) - dst = emit_memset (dst, destreg, vec_value, piece_size); - else - dst = emit_memset (dst, destreg, value, piece_size); - } - else - dst = emit_memmov (dst, &src, destreg, srcreg, piece_size); - copied_bytes += piece_size; - } - } - if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT) - set_mem_align (dst, desired_align * BITS_PER_UNIT); - if (MEM_SIZE_KNOWN_P (orig_dst)) - set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes); - - if (!issetmem) - { - int src_align_bytes = get_mem_align_offset (src, desired_align - * BITS_PER_UNIT); - if (src_align_bytes >= 0) - src_align_bytes = desired_align - src_align_bytes; - if (src_align_bytes >= 0) - { - unsigned int src_align; - for (src_align = desired_align; src_align >= 2; src_align >>= 1) - { - if ((src_align_bytes & (src_align - 1)) - == (align_bytes & (src_align - 1))) - break; - } - if (src_align > (unsigned int) desired_align) - src_align = desired_align; - if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT) - set_mem_align (src, src_align * BITS_PER_UNIT); - } - if (MEM_SIZE_KNOWN_P (orig_src)) - set_mem_size (src, MEM_SIZE (orig_src) - align_bytes); - *srcp = src; - } - - return dst; -} - -/* Return true if ALG can be used in current context. - Assume we expand memset if MEMSET is true. */ -static bool -alg_usable_p (enum stringop_alg alg, bool memset, bool have_as) -{ - if (alg == no_stringop) - return false; - if (alg == vector_loop) - return TARGET_SSE || TARGET_AVX; - /* Algorithms using the rep prefix want at least edi and ecx; - additionally, memset wants eax and memcpy wants esi. Don't - consider such algorithms if the user has appropriated those - registers for their own purposes, or if we have a non-default - address space, since some string insns cannot override the segment. */ - if (alg == rep_prefix_1_byte - || alg == rep_prefix_4_byte - || alg == rep_prefix_8_byte) - { - if (have_as) - return false; - if (fixed_regs[CX_REG] - || fixed_regs[DI_REG] - || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG])) - return false; - } - return true; -} - -/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */ -static enum stringop_alg -decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, - unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size, - bool memset, bool zero_memset, bool have_as, - int *dynamic_check, bool *noalign, bool recur) -{ - const struct stringop_algs *algs; - bool optimize_for_speed; - int max = 0; - const struct processor_costs *cost; - int i; - bool any_alg_usable_p = false; - - *noalign = false; - *dynamic_check = -1; - - /* Even if the string operation call is cold, we still might spend a lot - of time processing large blocks. 
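The size-table walk further down can be summarized by a simplified standalone sketch; the table layout, the names, and the omitted usability and alignment checks are assumptions of the example, not the real stringop_algs structure:

#include <stddef.h>

enum alg { libcall, loop_1_byte, loop_alg, rep_prefix_4_byte, vector_loop };

struct size_bucket { long max; enum alg alg; };   /* max == -1: any size */

static enum alg pick_alg (const struct size_bucket *table, size_t n,
                          long expected_size)
{
  for (size_t i = 0; i < n; i++)
    if (table[i].max == -1 || table[i].max >= expected_size)
      return table[i].alg;
  return libcall;
}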
*/ - if (optimize_function_for_size_p (cfun) - || (optimize_insn_for_size_p () - && (max_size < 256 - || (expected_size != -1 && expected_size < 256)))) - optimize_for_speed = false; - else - optimize_for_speed = true; - - cost = optimize_for_speed ? ix86_cost : &ix86_size_cost; - if (memset) - algs = &cost->memset[TARGET_64BIT != 0]; - else - algs = &cost->memcpy[TARGET_64BIT != 0]; - - /* See maximal size for user defined algorithm. */ - for (i = 0; i < MAX_STRINGOP_ALGS; i++) - { - enum stringop_alg candidate = algs->size[i].alg; - bool usable = alg_usable_p (candidate, memset, have_as); - any_alg_usable_p |= usable; - - if (candidate != libcall && candidate && usable) - max = algs->size[i].max; - } - - /* If expected size is not known but max size is small enough - so inline version is a win, set expected size into - the range. */ - if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1) - && expected_size == -1) - expected_size = min_size / 2 + max_size / 2; - - /* If user specified the algorithm, honor it if possible. */ - if (ix86_stringop_alg != no_stringop - && alg_usable_p (ix86_stringop_alg, memset, have_as)) - return ix86_stringop_alg; - /* rep; movq or rep; movl is the smallest variant. */ - else if (!optimize_for_speed) - { - *noalign = true; - if (!count || (count & 3) || (memset && !zero_memset)) - return alg_usable_p (rep_prefix_1_byte, memset, have_as) - ? rep_prefix_1_byte : loop_1_byte; - else - return alg_usable_p (rep_prefix_4_byte, memset, have_as) - ? rep_prefix_4_byte : loop; - } - /* Very tiny blocks are best handled via the loop, REP is expensive to - setup. */ - else if (expected_size != -1 && expected_size < 4) - return loop_1_byte; - else if (expected_size != -1) - { - enum stringop_alg alg = libcall; - bool alg_noalign = false; - for (i = 0; i < MAX_STRINGOP_ALGS; i++) - { - /* We get here if the algorithms that were not libcall-based - were rep-prefix based and we are unable to use rep prefixes - based on global register usage. Break out of the loop and - use the heuristic below. */ - if (algs->size[i].max == 0) - break; - if (algs->size[i].max >= expected_size || algs->size[i].max == -1) - { - enum stringop_alg candidate = algs->size[i].alg; - - if (candidate != libcall - && alg_usable_p (candidate, memset, have_as)) - { - alg = candidate; - alg_noalign = algs->size[i].noalign; - } - /* Honor TARGET_INLINE_ALL_STRINGOPS by picking - last non-libcall inline algorithm. */ - if (TARGET_INLINE_ALL_STRINGOPS) - { - /* When the current size is best to be copied by a libcall, - but we are still forced to inline, run the heuristic below - that will pick code for medium sized blocks. */ - if (alg != libcall) - { - *noalign = alg_noalign; - return alg; - } - else if (!any_alg_usable_p) - break; - } - else if (alg_usable_p (candidate, memset, have_as)) - { - *noalign = algs->size[i].noalign; - return candidate; - } - } - } - } - /* When asked to inline the call anyway, try to pick meaningful choice. - We look for maximal size of block that is faster to copy by hand and - take blocks of at most of that size guessing that average size will - be roughly half of the block. - - If this turns out to be bad, we might simply specify the preferred - choice in ix86_costs. */ - if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY) - && (algs->unknown_size == libcall - || !alg_usable_p (algs->unknown_size, memset, have_as))) - { - enum stringop_alg alg; - HOST_WIDE_INT new_expected_size = (max > 0 ? 
max : 4096) / 2; - - /* If there aren't any usable algorithms or if recursing already, - then recursing on smaller sizes or same size isn't going to - find anything. Just return the simple byte-at-a-time copy loop. */ - if (!any_alg_usable_p || recur) - { - /* Pick something reasonable. */ - if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur) - *dynamic_check = 128; - return loop_1_byte; - } - alg = decide_alg (count, new_expected_size, min_size, max_size, memset, - zero_memset, have_as, dynamic_check, noalign, true); - gcc_assert (*dynamic_check == -1); - if (TARGET_INLINE_STRINGOPS_DYNAMICALLY) - *dynamic_check = max; - else - gcc_assert (alg != libcall); - return alg; - } - return (alg_usable_p (algs->unknown_size, memset, have_as) - ? algs->unknown_size : libcall); -} - -/* Decide on alignment. We know that the operand is already aligned to ALIGN - (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */ -static int -decide_alignment (int align, - enum stringop_alg alg, - int expected_size, - machine_mode move_mode) -{ - int desired_align = 0; - - gcc_assert (alg != no_stringop); - - if (alg == libcall) - return 0; - if (move_mode == VOIDmode) - return 0; - - desired_align = GET_MODE_SIZE (move_mode); - /* PentiumPro has special logic triggering for 8 byte aligned blocks. - copying whole cacheline at once. */ - if (TARGET_PENTIUMPRO - && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte)) - desired_align = 8; - - if (optimize_size) - desired_align = 1; - if (desired_align < align) - desired_align = align; - if (expected_size != -1 && expected_size < 4) - desired_align = align; - - return desired_align; -} - - -/* Helper function for memcpy. For QImode value 0xXY produce - 0xXYXYXYXY of wide specified by MODE. This is essentially - a * 0x10101010, but we can do slightly better than - synth_mult by unwinding the sequence by hand on CPUs with - slow multiply. */ -static rtx -promote_duplicated_reg (machine_mode mode, rtx val) -{ - machine_mode valmode = GET_MODE (val); - rtx tmp; - int nops = mode == DImode ? 3 : 2; - - gcc_assert (mode == SImode || mode == DImode || val == const0_rtx); - if (val == const0_rtx) - return copy_to_mode_reg (mode, CONST0_RTX (mode)); - if (CONST_INT_P (val)) - { - HOST_WIDE_INT v = INTVAL (val) & 255; - - v |= v << 8; - v |= v << 16; - if (mode == DImode) - v |= (v << 16) << 16; - return copy_to_mode_reg (mode, gen_int_mode (v, mode)); - } - - if (valmode == VOIDmode) - valmode = QImode; - if (valmode != QImode) - val = gen_lowpart (QImode, val); - if (mode == QImode) - return val; - if (!TARGET_PARTIAL_REG_STALL) - nops--; - if (ix86_cost->mult_init[mode == DImode ? 3 : 2] - + ix86_cost->mult_bit * (mode == DImode ? 
8 : 4) - <= (ix86_cost->shift_const + ix86_cost->add) * nops - + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0))) - { - rtx reg = convert_modes (mode, QImode, val, true); - tmp = promote_duplicated_reg (mode, const1_rtx); - return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1, - OPTAB_DIRECT); - } - else - { - rtx reg = convert_modes (mode, QImode, val, true); - - if (!TARGET_PARTIAL_REG_STALL) - if (mode == SImode) - emit_insn (gen_insvsi_1 (reg, reg)); - else - emit_insn (gen_insvdi_1 (reg, reg)); - else - { - tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8), - NULL, 1, OPTAB_DIRECT); - reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, - OPTAB_DIRECT); - } - tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16), - NULL, 1, OPTAB_DIRECT); - reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); - if (mode == SImode) - return reg; - tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32), - NULL, 1, OPTAB_DIRECT); - reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); - return reg; - } -} - -/* Duplicate value VAL using promote_duplicated_reg into maximal size that will - be needed by main loop copying SIZE_NEEDED chunks and prologue getting - alignment from ALIGN to DESIRED_ALIGN. */ -static rtx -promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, - int align) -{ - rtx promoted_val; - - if (TARGET_64BIT - && (size_needed > 4 || (desired_align > align && desired_align > 4))) - promoted_val = promote_duplicated_reg (DImode, val); - else if (size_needed > 2 || (desired_align > align && desired_align > 2)) - promoted_val = promote_duplicated_reg (SImode, val); - else if (size_needed > 1 || (desired_align > align && desired_align > 1)) - promoted_val = promote_duplicated_reg (HImode, val); - else - promoted_val = val; - - return promoted_val; -} - -/* Expand string move (memcpy) ot store (memset) operation. Use i386 string - operations when profitable. The code depends upon architecture, block size - and alignment, but always has one of the following overall structures: - - Aligned move sequence: - - 1) Prologue guard: Conditional that jumps up to epilogues for small - blocks that can be handled by epilogue alone. This is faster - but also needed for correctness, since prologue assume the block - is larger than the desired alignment. - - Optional dynamic check for size and libcall for large - blocks is emitted here too, with -minline-stringops-dynamically. - - 2) Prologue: copy first few bytes in order to get destination - aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less - than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be - copied. We emit either a jump tree on power of two sized - blocks, or a byte loop. - - 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks - with specified algorithm. - - 4) Epilogue: code copying tail of the block that is too small to be - handled by main body (or up to size guarded by prologue guard). 
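promote_duplicated_reg above broadcasts a byte value 0xXY into 0xXYXY...XY, either by multiplying with the replication constant 0x01010101 (0x0101010101010101 for DImode) or, on CPUs with slow multiply, by the shift/or ladder it unwinds by hand. A short, runnable illustration of both forms:

#include <stdint.h>
#include <stdio.h>

/* Broadcast one byte into every byte of a 64-bit word.  */
static uint64_t
broadcast_byte (uint8_t b)
{
  uint64_t v = b;
  v |= v << 8;                    /* 0x00XY -> 0xXYXY */
  v |= v << 16;                   /* -> 0xXYXYXYXY */
  v |= v << 32;                   /* -> 0xXYXYXYXYXYXYXYXY */
  return v;                       /* equals (uint64_t) b * 0x0101010101010101 */
}

int
main (void)
{
  printf ("%016llx\n", (unsigned long long) broadcast_byte (0xab));
  return 0;                       /* prints abababababababab */
}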
- - Misaligned move sequence - - 1) missaligned move prologue/epilogue containing: - a) Prologue handling small memory blocks and jumping to done_label - (skipped if blocks are known to be large enough) - b) Signle move copying first DESIRED_ALIGN-ALIGN bytes if alignment is - needed by single possibly misaligned move - (skipped if alignment is not needed) - c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves - - 2) Zero size guard dispatching to done_label, if needed - - 3) dispatch to library call, if needed, - - 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks - with specified algorithm. */ -bool -ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp, - rtx align_exp, rtx expected_align_exp, - rtx expected_size_exp, rtx min_size_exp, - rtx max_size_exp, rtx probable_max_size_exp, - bool issetmem) -{ - rtx destreg; - rtx srcreg = NULL; - rtx_code_label *label = NULL; - rtx tmp; - rtx_code_label *jump_around_label = NULL; - HOST_WIDE_INT align = 1; - unsigned HOST_WIDE_INT count = 0; - HOST_WIDE_INT expected_size = -1; - int size_needed = 0, epilogue_size_needed; - int desired_align = 0, align_bytes = 0; - enum stringop_alg alg; - rtx promoted_val = NULL; - rtx vec_promoted_val = NULL; - bool force_loopy_epilogue = false; - int dynamic_check; - bool need_zero_guard = false; - bool noalign; - machine_mode move_mode = VOIDmode; - machine_mode wider_mode; - int unroll_factor = 1; - /* TODO: Once value ranges are available, fill in proper data. */ - unsigned HOST_WIDE_INT min_size = 0; - unsigned HOST_WIDE_INT max_size = -1; - unsigned HOST_WIDE_INT probable_max_size = -1; - bool misaligned_prologue_used = false; - bool have_as; - - if (CONST_INT_P (align_exp)) - align = INTVAL (align_exp); - /* i386 can do misaligned access on reasonably increased cost. */ - if (CONST_INT_P (expected_align_exp) - && INTVAL (expected_align_exp) > align) - align = INTVAL (expected_align_exp); - /* ALIGN is the minimum of destination and source alignment, but we care here - just about destination alignment. */ - else if (!issetmem - && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT) - align = MEM_ALIGN (dst) / BITS_PER_UNIT; - - if (CONST_INT_P (count_exp)) - { - min_size = max_size = probable_max_size = count = expected_size - = INTVAL (count_exp); - /* When COUNT is 0, there is nothing to do. */ - if (!count) - return true; - } - else - { - if (min_size_exp) - min_size = INTVAL (min_size_exp); - if (max_size_exp) - max_size = INTVAL (max_size_exp); - if (probable_max_size_exp) - probable_max_size = INTVAL (probable_max_size_exp); - if (CONST_INT_P (expected_size_exp)) - expected_size = INTVAL (expected_size_exp); - } - - /* Make sure we don't need to care about overflow later on. */ - if (count > (HOST_WIDE_INT_1U << 30)) - return false; - - have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst)); - if (!issetmem) - have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)); - - /* Step 0: Decide on preferred algorithm, desired alignment and - size of chunks to be copied by main loop. 
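Steps 1b/1c of the misaligned sequence boil down to a familiar trick: cover the first and the last chunk of the block with possibly misaligned, possibly overlapping moves, so only the middle needs the aligned main loop. A portable sketch of that shape (helper name hypothetical; small blocks are assumed to be handled elsewhere, as in the prologue described above):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Copy the head and tail of an n-byte block with unaligned word moves;
   the two moves may overlap each other and the middle of the block.  */
static void
copy_head_and_tail (unsigned char *dst, const unsigned char *src, size_t n)
{
  uint64_t head, tail;

  if (n < sizeof head)
    return;                            /* small-block path not shown */

  memcpy (&head, src, sizeof head);
  memcpy (&tail, src + n - sizeof tail, sizeof tail);
  memcpy (dst, &head, sizeof head);
  memcpy (dst + n - sizeof tail, &tail, sizeof tail);
  /* an aligned main loop would now cover dst[8 .. n - 8) */
}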
*/ - alg = decide_alg (count, expected_size, min_size, probable_max_size, - issetmem, - issetmem && val_exp == const0_rtx, have_as, - &dynamic_check, &noalign, false); - - if (dump_file) - fprintf (dump_file, "Selected stringop expansion strategy: %s\n", - stringop_alg_names[alg]); - - if (alg == libcall) - return false; - gcc_assert (alg != no_stringop); - - /* For now vector-version of memset is generated only for memory zeroing, as - creating of promoted vector value is very cheap in this case. */ - if (issetmem && alg == vector_loop && val_exp != const0_rtx) - alg = unrolled_loop; - - if (!count) - count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp); - destreg = ix86_copy_addr_to_reg (XEXP (dst, 0)); - if (!issetmem) - srcreg = ix86_copy_addr_to_reg (XEXP (src, 0)); - - unroll_factor = 1; - move_mode = word_mode; - switch (alg) - { - case libcall: - case no_stringop: - case last_alg: - gcc_unreachable (); - case loop_1_byte: - need_zero_guard = true; - move_mode = QImode; - break; - case loop: - need_zero_guard = true; - break; - case unrolled_loop: - need_zero_guard = true; - unroll_factor = (TARGET_64BIT ? 4 : 2); - break; - case vector_loop: - need_zero_guard = true; - unroll_factor = 4; - /* Find the widest supported mode. */ - move_mode = word_mode; - while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode) - && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing) - move_mode = wider_mode; - - if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128) - move_mode = TImode; - - /* Find the corresponding vector mode with the same size as MOVE_MODE. - MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */ - if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode)) - { - int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode); - if (!mode_for_vector (word_mode, nunits).exists (&move_mode) - || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing) - move_mode = word_mode; - } - gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing); - break; - case rep_prefix_8_byte: - move_mode = DImode; - break; - case rep_prefix_4_byte: - move_mode = SImode; - break; - case rep_prefix_1_byte: - move_mode = QImode; - break; - } - size_needed = GET_MODE_SIZE (move_mode) * unroll_factor; - epilogue_size_needed = size_needed; - - /* If we are going to call any library calls conditionally, make sure any - pending stack adjustment happen before the first conditional branch, - otherwise they will be emitted before the library call only and won't - happen from the other branches. */ - if (dynamic_check != -1) - do_pending_stack_adjust (); - - desired_align = decide_alignment (align, alg, expected_size, move_mode); - if (!TARGET_ALIGN_STRINGOPS || noalign) - align = desired_align; - - /* Step 1: Prologue guard. */ - - /* Alignment code needs count to be in register. */ - if (CONST_INT_P (count_exp) && desired_align > align) - { - if (INTVAL (count_exp) > desired_align - && INTVAL (count_exp) > size_needed) - { - align_bytes - = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT); - if (align_bytes <= 0) - align_bytes = 0; - else - align_bytes = desired_align - align_bytes; - } - if (align_bytes == 0) - count_exp = force_reg (counter_mode (count_exp), count_exp); - } - gcc_assert (desired_align >= 1 && align >= 1); - - /* Misaligned move sequences handle both prologue and epilogue at once. - Default code generation results in a smaller code for large alignments - and also avoids redundant job when sizes are known precisely. 
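For the vector_loop case, the chunk size everything below is measured against is simply the widest supported move width times the unroll factor. A simplified sketch of that selection; the have() callback stands in for the optab_handler checks, and the width list and fallback are illustrative only:

#include <stdbool.h>
#include <stddef.h>

/* Pick the widest usable move width and scale it by the unroll factor;
   e.g. 16-byte SSE moves unrolled 4x give a 64-byte main-loop chunk.  */
static size_t
pick_chunk_size (bool (*have) (size_t), int unroll_factor)
{
  static const size_t widths[] = { 8, 16, 32, 64 };  /* word, SSE, AVX, AVX-512 */
  size_t best = widths[0];                           /* fall back to word size */

  for (size_t i = 0; i < sizeof widths / sizeof widths[0]; i++)
    if (have (widths[i]))
      best = widths[i];

  return best * (size_t) unroll_factor;
}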
*/ - misaligned_prologue_used - = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES - && MAX (desired_align, epilogue_size_needed) <= 32 - && desired_align <= epilogue_size_needed - && ((desired_align > align && !align_bytes) - || (!count && epilogue_size_needed > 1))); - - /* Do the cheap promotion to allow better CSE across the - main loop and epilogue (ie one load of the big constant in the - front of all code. - For now the misaligned move sequences do not have fast path - without broadcasting. */ - if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used))) - { - if (alg == vector_loop) - { - gcc_assert (val_exp == const0_rtx); - vec_promoted_val = promote_duplicated_reg (move_mode, val_exp); - promoted_val = promote_duplicated_reg_to_size (val_exp, - GET_MODE_SIZE (word_mode), - desired_align, align); - } - else - { - promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, - desired_align, align); - } - } - /* Misaligned move sequences handles both prologues and epilogues at once. - Default code generation results in smaller code for large alignments and - also avoids redundant job when sizes are known precisely. */ - if (misaligned_prologue_used) - { - /* Misaligned move prologue handled small blocks by itself. */ - expand_set_or_movmem_prologue_epilogue_by_misaligned_moves - (dst, src, &destreg, &srcreg, - move_mode, promoted_val, vec_promoted_val, - &count_exp, - &jump_around_label, - desired_align < align - ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed, - desired_align, align, &min_size, dynamic_check, issetmem); - if (!issetmem) - src = change_address (src, BLKmode, srcreg); - dst = change_address (dst, BLKmode, destreg); - set_mem_align (dst, desired_align * BITS_PER_UNIT); - epilogue_size_needed = 0; - if (need_zero_guard - && min_size < (unsigned HOST_WIDE_INT) size_needed) - { - /* It is possible that we copied enough so the main loop will not - execute. */ - gcc_assert (size_needed > 1); - if (jump_around_label == NULL_RTX) - jump_around_label = gen_label_rtx (); - emit_cmp_and_jump_insns (count_exp, - GEN_INT (size_needed), - LTU, 0, counter_mode (count_exp), 1, jump_around_label); - if (expected_size == -1 - || expected_size < (desired_align - align) / 2 + size_needed) - predict_jump (REG_BR_PROB_BASE * 20 / 100); - else - predict_jump (REG_BR_PROB_BASE * 60 / 100); - } - } - /* Ensure that alignment prologue won't copy past end of block. */ - else if (size_needed > 1 || (desired_align > 1 && desired_align > align)) - { - epilogue_size_needed = MAX (size_needed - 1, desired_align - align); - /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes. - Make sure it is power of 2. */ - epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1); - - /* To improve performance of small blocks, we jump around the VAL - promoting mode. This mean that if the promoted VAL is not constant, - we might not use it in the epilogue and have to use byte - loop variant. */ - if (issetmem && epilogue_size_needed > 2 && !promoted_val) - force_loopy_epilogue = true; - if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed) - || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed) - { - /* If main algorithm works on QImode, no epilogue is needed. - For small sizes just don't align anything. 
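Because the epilogue selects the leftover bytes with a mask, the bound computed above has to be a power of two; 1 << (floor_log2 (x) + 1) rounds it up to the next one, so 24 becomes 32 and an exact power of two is bumped to the following one. The same rounding as a tiny C helper:

#include <stddef.h>

/* Smallest power of two strictly greater than x;
   equivalent to 1 << (floor_log2 (x) + 1).  */
static size_t
next_pow2_above (size_t x)
{
  size_t p = 1;
  while (p <= x)
    p <<= 1;
  return p;                       /* 24 -> 32, 32 -> 64, 0 -> 1 */
}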
*/ - if (size_needed == 1) - desired_align = align; - else - goto epilogue; - } - else if (!count - && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed) - { - label = gen_label_rtx (); - emit_cmp_and_jump_insns (count_exp, - GEN_INT (epilogue_size_needed), - LTU, 0, counter_mode (count_exp), 1, label); - if (expected_size == -1 || expected_size < epilogue_size_needed) - predict_jump (REG_BR_PROB_BASE * 60 / 100); - else - predict_jump (REG_BR_PROB_BASE * 20 / 100); - } - } - - /* Emit code to decide on runtime whether library call or inline should be - used. */ - if (dynamic_check != -1) - { - if (!issetmem && CONST_INT_P (count_exp)) - { - if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check) - { - emit_block_copy_via_libcall (dst, src, count_exp); - count_exp = const0_rtx; - goto epilogue; - } - } - else - { - rtx_code_label *hot_label = gen_label_rtx (); - if (jump_around_label == NULL_RTX) - jump_around_label = gen_label_rtx (); - emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1), - LEU, 0, counter_mode (count_exp), - 1, hot_label); - predict_jump (REG_BR_PROB_BASE * 90 / 100); - if (issetmem) - set_storage_via_libcall (dst, count_exp, val_exp); - else - emit_block_copy_via_libcall (dst, src, count_exp); - emit_jump (jump_around_label); - emit_label (hot_label); - } - } - - /* Step 2: Alignment prologue. */ - /* Do the expensive promotion once we branched off the small blocks. */ - if (issetmem && !promoted_val) - promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, - desired_align, align); - - if (desired_align > align && !misaligned_prologue_used) - { - if (align_bytes == 0) - { - /* Except for the first move in prologue, we no longer know - constant offset in aliasing info. It don't seems to worth - the pain to maintain it for the first move, so throw away - the info early. */ - dst = change_address (dst, BLKmode, destreg); - if (!issetmem) - src = change_address (src, BLKmode, srcreg); - dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg, - promoted_val, vec_promoted_val, - count_exp, align, desired_align, - issetmem); - /* At most desired_align - align bytes are copied. */ - if (min_size < (unsigned)(desired_align - align)) - min_size = 0; - else - min_size -= desired_align - align; - } - else - { - /* If we know how many bytes need to be stored before dst is - sufficiently aligned, maintain aliasing info accurately. */ - dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg, - srcreg, - promoted_val, - vec_promoted_val, - desired_align, - align_bytes, - issetmem); - - count_exp = plus_constant (counter_mode (count_exp), - count_exp, -align_bytes); - count -= align_bytes; - min_size -= align_bytes; - max_size -= align_bytes; - } - if (need_zero_guard - && min_size < (unsigned HOST_WIDE_INT) size_needed - && (count < (unsigned HOST_WIDE_INT) size_needed - || (align_bytes == 0 - && count < ((unsigned HOST_WIDE_INT) size_needed - + desired_align - align)))) - { - /* It is possible that we copied enough so the main loop will not - execute. 
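The dynamic check emitted here is, conceptually, nothing more than a size threshold that routes large blocks to the library and keeps small ones inline. A hypothetical C rendering of what -minline-stringops-dynamically arranges at run time; the threshold value is made up, the real one comes from decide_alg and the cost tables:

#include <stddef.h>
#include <string.h>

enum { DYNAMIC_CHECK = 8192 };       /* hypothetical threshold */

static void
copy_with_dynamic_check (void *dst, const void *src, size_t n)
{
  if (n >= DYNAMIC_CHECK)
    {
      memcpy (dst, src, n);          /* library-call path for big blocks */
      return;
    }

  unsigned char *d = dst;            /* stand-in for the inline expansion */
  const unsigned char *s = src;
  while (n--)
    *d++ = *s++;
}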
*/ - gcc_assert (size_needed > 1); - if (label == NULL_RTX) - label = gen_label_rtx (); - emit_cmp_and_jump_insns (count_exp, - GEN_INT (size_needed), - LTU, 0, counter_mode (count_exp), 1, label); - if (expected_size == -1 - || expected_size < (desired_align - align) / 2 + size_needed) - predict_jump (REG_BR_PROB_BASE * 20 / 100); - else - predict_jump (REG_BR_PROB_BASE * 60 / 100); - } - } - if (label && size_needed == 1) - { - emit_label (label); - LABEL_NUSES (label) = 1; - label = NULL; - epilogue_size_needed = 1; - if (issetmem) - promoted_val = val_exp; - } - else if (label == NULL_RTX && !misaligned_prologue_used) - epilogue_size_needed = size_needed; - - /* Step 3: Main loop. */ - - switch (alg) - { - case libcall: - case no_stringop: - case last_alg: - gcc_unreachable (); - case loop_1_byte: - case loop: - case unrolled_loop: - expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val, - count_exp, move_mode, unroll_factor, - expected_size, issetmem); - break; - case vector_loop: - expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, - vec_promoted_val, count_exp, move_mode, - unroll_factor, expected_size, issetmem); - break; - case rep_prefix_8_byte: - case rep_prefix_4_byte: - case rep_prefix_1_byte: - expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val, - val_exp, count_exp, move_mode, issetmem); - break; - } - /* Adjust properly the offset of src and dest memory for aliasing. */ - if (CONST_INT_P (count_exp)) - { - if (!issetmem) - src = adjust_automodify_address_nv (src, BLKmode, srcreg, - (count / size_needed) * size_needed); - dst = adjust_automodify_address_nv (dst, BLKmode, destreg, - (count / size_needed) * size_needed); - } - else - { - if (!issetmem) - src = change_address (src, BLKmode, srcreg); - dst = change_address (dst, BLKmode, destreg); - } - - /* Step 4: Epilogue to copy the remaining bytes. */ - epilogue: - if (label) - { - /* When the main loop is done, COUNT_EXP might hold original count, - while we want to copy only COUNT_EXP & SIZE_NEEDED bytes. - Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED - bytes. Compensate if needed. */ - - if (size_needed < epilogue_size_needed) - { - tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp, - GEN_INT (size_needed - 1), count_exp, 1, - OPTAB_DIRECT); - if (tmp != count_exp) - emit_move_insn (count_exp, tmp); - } - emit_label (label); - LABEL_NUSES (label) = 1; - } - - if (count_exp != const0_rtx && epilogue_size_needed > 1) - { - if (force_loopy_epilogue) - expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp, - epilogue_size_needed); - else - { - if (issetmem) - expand_setmem_epilogue (dst, destreg, promoted_val, - vec_promoted_val, count_exp, - epilogue_size_needed); - else - expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp, - epilogue_size_needed); - } - } - if (jump_around_label) - emit_label (jump_around_label); - return true; -} - - -/* Expand the appropriate insns for doing strlen if not just doing - repnz; scasb - - out = result, initialized with the start address - align_rtx = alignment of the address. - scratch = scratch register, initialized with the startaddress when - not aligned, otherwise undefined - - This is just the body. It needs the initializations mentioned above and - some address computing at the end. These things are done in i386.md. 
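The main loop always rounds the block size down to a multiple of SIZE_NEEDED, so the epilogue only has to handle COUNT masked by SIZE_NEEDED - 1. The same split in plain C, with size_needed assumed to be a power of two:

#include <stddef.h>
#include <string.h>

static void
copy_in_chunks (unsigned char *dst, const unsigned char *src,
                size_t count, size_t size_needed)
{
  size_t main_bytes = count & ~(size_needed - 1);

  for (size_t off = 0; off < main_bytes; off += size_needed)
    memcpy (dst + off, src + off, size_needed);       /* main loop */

  memcpy (dst + main_bytes, src + main_bytes,
          count & (size_needed - 1));                 /* epilogue */
}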
*/ - -static void -ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx) -{ - int align; - rtx tmp; - rtx_code_label *align_2_label = NULL; - rtx_code_label *align_3_label = NULL; - rtx_code_label *align_4_label = gen_label_rtx (); - rtx_code_label *end_0_label = gen_label_rtx (); - rtx mem; - rtx tmpreg = gen_reg_rtx (SImode); - rtx scratch = gen_reg_rtx (SImode); - rtx cmp; - - align = 0; - if (CONST_INT_P (align_rtx)) - align = INTVAL (align_rtx); - - /* Loop to check 1..3 bytes for null to get an aligned pointer. */ - - /* Is there a known alignment and is it less than 4? */ - if (align < 4) - { - rtx scratch1 = gen_reg_rtx (Pmode); - emit_move_insn (scratch1, out); - /* Is there a known alignment and is it not 2? */ - if (align != 2) - { - align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */ - align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */ - - /* Leave just the 3 lower bits. */ - align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3), - NULL_RTX, 0, OPTAB_WIDEN); - - emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL, - Pmode, 1, align_4_label); - emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL, - Pmode, 1, align_2_label); - emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL, - Pmode, 1, align_3_label); - } - else - { - /* Since the alignment is 2, we have to check 2 or 0 bytes; - check if is aligned to 4 - byte. */ - - align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx, - NULL_RTX, 0, OPTAB_WIDEN); - - emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL, - Pmode, 1, align_4_label); - } - - mem = change_address (src, QImode, out); - - /* Now compare the bytes. */ - - /* Compare the first n unaligned byte on a byte per byte basis. */ - emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, - QImode, 1, end_0_label); - - /* Increment the address. */ - emit_insn (ix86_gen_add3 (out, out, const1_rtx)); - - /* Not needed with an alignment of 2 */ - if (align != 2) - { - emit_label (align_2_label); - - emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1, - end_0_label); - - emit_insn (ix86_gen_add3 (out, out, const1_rtx)); - - emit_label (align_3_label); - } - - emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1, - end_0_label); - - emit_insn (ix86_gen_add3 (out, out, const1_rtx)); - } - - /* Generate loop to check 4 bytes at a time. It is not a good idea to - align this loop. It gives only huge programs, but does not help to - speed up. */ - emit_label (align_4_label); - - mem = change_address (src, SImode, out); - emit_move_insn (scratch, mem); - emit_insn (ix86_gen_add3 (out, out, GEN_INT (4))); - - /* This formula yields a nonzero result iff one of the bytes is zero. - This saves three branches inside loop and many cycles. */ - - emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101))); - emit_insn (gen_one_cmplsi2 (scratch, scratch)); - emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch)); - emit_insn (gen_andsi3 (tmpreg, tmpreg, - gen_int_mode (0x80808080, SImode))); - emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1, - align_4_label); - - if (TARGET_CMOVE) - { - rtx reg = gen_reg_rtx (SImode); - rtx reg2 = gen_reg_rtx (Pmode); - emit_move_insn (reg, tmpreg); - emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16))); - - /* If zero is not in the first two bytes, move two bytes forward. 
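The "formula" referred to above is the classic zero-byte test: (v - 0x01010101) & ~v & 0x80808080 is nonzero exactly when one of the four bytes of v is zero, which is what lets the loop scan a word at a time with a single branch. A runnable check:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool
has_zero_byte (uint32_t v)
{
  return ((v - 0x01010101u) & ~v & 0x80808080u) != 0;
}

int
main (void)
{
  assert (has_zero_byte (0x11220033));    /* one byte is 0x00 */
  assert (!has_zero_byte (0x11223344));   /* no zero byte */
  return 0;
}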
*/ - emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080))); - tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); - tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx); - emit_insn (gen_rtx_SET (tmpreg, - gen_rtx_IF_THEN_ELSE (SImode, tmp, - reg, - tmpreg))); - /* Emit lea manually to avoid clobbering of flags. */ - emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx))); - - tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); - tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx); - emit_insn (gen_rtx_SET (out, - gen_rtx_IF_THEN_ELSE (Pmode, tmp, - reg2, - out))); - } - else - { - rtx_code_label *end_2_label = gen_label_rtx (); - /* Is zero in the first two bytes? */ - - emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080))); - tmp = gen_rtx_REG (CCNOmode, FLAGS_REG); - tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx); - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, - gen_rtx_LABEL_REF (VOIDmode, end_2_label), - pc_rtx); - tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - JUMP_LABEL (tmp) = end_2_label; - - /* Not in the first two. Move two bytes forward. */ - emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16))); - emit_insn (ix86_gen_add3 (out, out, const2_rtx)); - - emit_label (end_2_label); - - } - - /* Avoid branch in fixing the byte. */ - tmpreg = gen_lowpart (QImode, tmpreg); - emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg)); - tmp = gen_rtx_REG (CCmode, FLAGS_REG); - cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx); - emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp)); - - emit_label (end_0_label); -} - -/* Expand strlen. */ - -bool -ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align) -{ -if (TARGET_UNROLL_STRLEN - && TARGET_INLINE_ALL_STRINGOPS - && eoschar == const0_rtx - && optimize > 1) - { - /* The generic case of strlen expander is long. Avoid it's - expanding unless TARGET_INLINE_ALL_STRINGOPS. */ - rtx addr = force_reg (Pmode, XEXP (src, 0)); - /* Well it seems that some optimizer does not combine a call like - foo(strlen(bar), strlen(bar)); - when the move and the subtraction is done here. It does calculate - the length just once when these instructions are done inside of - output_strlen_unroll(). But I think since &bar[strlen(bar)] is - often used and I use one fewer register for the lifetime of - output_strlen_unroll() this is better. */ - - emit_move_insn (out, addr); - - ix86_expand_strlensi_unroll_1 (out, src, align); - - /* strlensi_unroll_1 returns the address of the zero at the end of - the string, like memchr(), so compute the length by subtracting - the start address. */ - emit_insn (ix86_gen_sub3 (out, out, addr)); - return true; - } - else - return false; -} - -/* For given symbol (function) construct code to compute address of it's PLT - entry in large x86-64 PIC model. 
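Putting the pieces of the unrolled strlen together in portable terms: compare bytes until the pointer is 4-byte aligned, scan a word at a time with the zero-byte test, then return end minus start, just as the expander does after strlensi_unroll_1. The sketch below assumes the bytes up to the next 4-byte boundary past the NUL are readable, which the aligned machine loads guarantee but plain C does not:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static size_t
strlen_wordwise (const char *s)
{
  const char *p = s;

  while (((uintptr_t) p & 3) != 0)      /* byte loop up to 4-byte alignment */
    {
      if (*p == '\0')
        return (size_t) (p - s);
      p++;
    }

  for (;;)                              /* four bytes at a time */
    {
      uint32_t v;
      memcpy (&v, p, 4);                /* aligned 4-byte load */
      if ((v - 0x01010101u) & ~v & 0x80808080u)
        break;                          /* this word contains the NUL */
      p += 4;
    }

  while (*p != '\0')                    /* pin down the exact byte */
    p++;
  return (size_t) (p - s);              /* length = end - start */
}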
*/ -static rtx -construct_plt_address (rtx symbol) -{ - rtx tmp, unspec; - - gcc_assert (GET_CODE (symbol) == SYMBOL_REF); - gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF); - gcc_assert (Pmode == DImode); - - tmp = gen_reg_rtx (Pmode); - unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF); - - emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec)); - emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx)); - return tmp; -} - -rtx_insn * -ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1, - rtx callarg2, - rtx pop, bool sibcall) -{ - rtx vec[3]; - rtx use = NULL, call; - unsigned int vec_len = 0; - tree fndecl; - - if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) - { - fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0)); - if (fndecl - && (lookup_attribute ("interrupt", - TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))) - error ("interrupt service routine can%'t be called directly"); - } - else - fndecl = NULL_TREE; - - if (pop == const0_rtx) - pop = NULL; - gcc_assert (!TARGET_64BIT || !pop); - - if (TARGET_MACHO && !TARGET_64BIT) - { -#if TARGET_MACHO - if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) - fnaddr = machopic_indirect_call_target (fnaddr); -#endif - } - else - { - /* Static functions and indirect calls don't need the pic register. Also, - check if PLT was explicitly avoided via no-plt or "noplt" attribute, making - it an indirect call. */ - rtx addr = XEXP (fnaddr, 0); - if (flag_pic - && GET_CODE (addr) == SYMBOL_REF - && !SYMBOL_REF_LOCAL_P (addr)) - { - if (flag_plt - && (SYMBOL_REF_DECL (addr) == NULL_TREE - || !lookup_attribute ("noplt", - DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr))))) - { - if (!TARGET_64BIT - || (ix86_cmodel == CM_LARGE_PIC - && DEFAULT_ABI != MS_ABI)) - { - use_reg (&use, gen_rtx_REG (Pmode, - REAL_PIC_OFFSET_TABLE_REGNUM)); - if (ix86_use_pseudo_pic_reg ()) - emit_move_insn (gen_rtx_REG (Pmode, - REAL_PIC_OFFSET_TABLE_REGNUM), - pic_offset_table_rtx); - } - } - else if (!TARGET_PECOFF && !TARGET_MACHO) - { - if (TARGET_64BIT) - { - fnaddr = gen_rtx_UNSPEC (Pmode, - gen_rtvec (1, addr), - UNSPEC_GOTPCREL); - fnaddr = gen_rtx_CONST (Pmode, fnaddr); - } - else - { - fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), - UNSPEC_GOT); - fnaddr = gen_rtx_CONST (Pmode, fnaddr); - fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, - fnaddr); - } - fnaddr = gen_const_mem (Pmode, fnaddr); - /* Pmode may not be the same as word_mode for x32, which - doesn't support indirect branch via 32-bit memory slot. - Since x32 GOT slot is 64 bit with zero upper 32 bits, - indirect branch via x32 GOT slot is OK. */ - if (GET_MODE (fnaddr) != word_mode) - fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr); - fnaddr = gen_rtx_MEM (QImode, fnaddr); - } - } - } - - /* Skip setting up RAX register for -mskip-rax-setup when there are no - parameters passed in vector registers. */ - if (TARGET_64BIT - && (INTVAL (callarg2) > 0 - || (INTVAL (callarg2) == 0 - && (TARGET_SSE || !flag_skip_rax_setup)))) - { - rtx al = gen_rtx_REG (QImode, AX_REG); - emit_move_insn (al, callarg2); - use_reg (&use, al); - } - - if (ix86_cmodel == CM_LARGE_PIC - && !TARGET_PECOFF - && MEM_P (fnaddr) - && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF - && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode)) - fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0))); - /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect - branch via x32 GOT slot is OK. 
*/ - else if (!(TARGET_X32 - && MEM_P (fnaddr) - && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND - && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode)) - && (sibcall - ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode) - : !call_insn_operand (XEXP (fnaddr, 0), word_mode))) - { - fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1); - fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr)); - } - - call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1); - - if (retval) - call = gen_rtx_SET (retval, call); - vec[vec_len++] = call; - - if (pop) - { - pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop); - pop = gen_rtx_SET (stack_pointer_rtx, pop); - vec[vec_len++] = pop; - } - - if (cfun->machine->no_caller_saved_registers - && (!fndecl - || (!TREE_THIS_VOLATILE (fndecl) - && !lookup_attribute ("no_caller_saved_registers", - TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))) - { - static const char ix86_call_used_regs[] = CALL_USED_REGISTERS; - bool is_64bit_ms_abi = (TARGET_64BIT - && ix86_function_abi (fndecl) == MS_ABI); - char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi); - - /* If there are no caller-saved registers, add all registers - that are clobbered by the call which returns. */ - for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++) - if (!fixed_regs[i] - && (ix86_call_used_regs[i] == 1 - || (ix86_call_used_regs[i] & c_mask)) - && !STACK_REGNO_P (i) - && !MMX_REGNO_P (i)) - clobber_reg (&use, - gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i)); - } - else if (TARGET_64BIT_MS_ABI - && (!callarg2 || INTVAL (callarg2) != -2)) - { - unsigned i; - - for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++) - { - int regno = x86_64_ms_sysv_extra_clobbered_registers[i]; - machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode; - - clobber_reg (&use, gen_rtx_REG (mode, regno)); - } - - /* Set here, but it may get cleared later. */ - if (TARGET_CALL_MS2SYSV_XLOGUES) - { - if (!TARGET_SSE) - ; - - /* Don't break hot-patched functions. */ - else if (ix86_function_ms_hook_prologue (current_function_decl)) - ; - - /* TODO: Cases not yet examined. */ - else if (flag_split_stack) - warn_once_call_ms2sysv_xlogues ("-fsplit-stack"); - - else - { - gcc_assert (!reload_completed); - cfun->machine->call_ms2sysv = true; - } - } - } - - if (vec_len > 1) - call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec)); - rtx_insn *call_insn = emit_call_insn (call); - if (use) - CALL_INSN_FUNCTION_USAGE (call_insn) = use; - - return call_insn; -} - -/* Return true if the function being called was marked with attribute - "noplt" or using -fno-plt and we are compiling for non-PIC. We need - to handle the non-PIC case in the backend because there is no easy - interface for the front-end to force non-PLT calls to use the GOT. - This is currently used only with 64-bit or 32-bit GOT32X ELF targets - to call the function marked "noplt" indirectly. */ - -static bool -ix86_nopic_noplt_attribute_p (rtx call_op) -{ - if (flag_pic || ix86_cmodel == CM_LARGE - || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X) - || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF - || SYMBOL_REF_LOCAL_P (call_op)) - return false; - - tree symbol_decl = SYMBOL_REF_DECL (call_op); - - if (!flag_plt - || (symbol_decl != NULL_TREE - && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl)))) - return true; - - return false; -} - -/* Output indirect branch via a call and return thunk. CALL_OP is a - register which contains the branch target. XASM is the assembly - template for CALL_OP. Branch is a tail call if SIBCALL_P is true. 
- A normal call is converted to: - - call __x86_indirect_thunk_reg - - and a tail call is converted to: - - jmp __x86_indirect_thunk_reg - */ - -static void -ix86_output_indirect_branch_via_reg (rtx call_op, bool sibcall_p) -{ - char thunk_name_buf[32]; - char *thunk_name; - enum indirect_thunk_prefix need_prefix - = indirect_thunk_need_prefix (current_output_insn); - int regno = REGNO (call_op); - - if (cfun->machine->indirect_branch_type - != indirect_branch_thunk_inline) - { - if (cfun->machine->indirect_branch_type == indirect_branch_thunk) - { - int i = regno; - if (i >= FIRST_REX_INT_REG) - i -= (FIRST_REX_INT_REG - LAST_INT_REG - 1); - indirect_thunks_used |= 1 << i; - } - indirect_thunk_name (thunk_name_buf, regno, need_prefix, false); - thunk_name = thunk_name_buf; - } - else - thunk_name = NULL; - - if (sibcall_p) - { - if (thunk_name != NULL) - fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name); - else - output_indirect_thunk (regno); - } - else - { - if (thunk_name != NULL) - { - fprintf (asm_out_file, "\tcall\t%s\n", thunk_name); - return; - } - - char indirectlabel1[32]; - char indirectlabel2[32]; - - ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, - INDIRECT_LABEL, - indirectlabelno++); - ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, - INDIRECT_LABEL, - indirectlabelno++); - - /* Jump. */ - fputs ("\tjmp\t", asm_out_file); - assemble_name_raw (asm_out_file, indirectlabel2); - fputc ('\n', asm_out_file); - - ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1); - - if (thunk_name != NULL) - fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name); - else - output_indirect_thunk (regno); - - ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2); - - /* Call. */ - fputs ("\tcall\t", asm_out_file); - assemble_name_raw (asm_out_file, indirectlabel1); - fputc ('\n', asm_out_file); - } -} - -/* Output indirect branch via a call and return thunk. CALL_OP is - the branch target. XASM is the assembly template for CALL_OP. - Branch is a tail call if SIBCALL_P is true. A normal call is - converted to: - - jmp L2 - L1: - push CALL_OP - jmp __x86_indirect_thunk - L2: - call L1 - - and a tail call is converted to: - - push CALL_OP - jmp __x86_indirect_thunk - */ - -static void -ix86_output_indirect_branch_via_push (rtx call_op, const char *xasm, - bool sibcall_p) -{ - char thunk_name_buf[32]; - char *thunk_name; - char push_buf[64]; - enum indirect_thunk_prefix need_prefix - = indirect_thunk_need_prefix (current_output_insn); - int regno = -1; - - if (cfun->machine->indirect_branch_type - != indirect_branch_thunk_inline) - { - if (cfun->machine->indirect_branch_type == indirect_branch_thunk) - indirect_thunk_needed = true; - indirect_thunk_name (thunk_name_buf, regno, need_prefix, false); - thunk_name = thunk_name_buf; - } - else - thunk_name = NULL; - - snprintf (push_buf, sizeof (push_buf), "push{%c}\t%s", - TARGET_64BIT ? 'q' : 'l', xasm); - - if (sibcall_p) - { - output_asm_insn (push_buf, &call_op); - if (thunk_name != NULL) - fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name); - else - output_indirect_thunk (regno); - } - else - { - char indirectlabel1[32]; - char indirectlabel2[32]; - - ASM_GENERATE_INTERNAL_LABEL (indirectlabel1, - INDIRECT_LABEL, - indirectlabelno++); - ASM_GENERATE_INTERNAL_LABEL (indirectlabel2, - INDIRECT_LABEL, - indirectlabelno++); - - /* Jump. 
*/ - fputs ("\tjmp\t", asm_out_file); - assemble_name_raw (asm_out_file, indirectlabel2); - fputc ('\n', asm_out_file); - - ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel1); - - /* An external function may be called via GOT, instead of PLT. */ - if (MEM_P (call_op)) - { - struct ix86_address parts; - rtx addr = XEXP (call_op, 0); - if (ix86_decompose_address (addr, &parts) - && parts.base == stack_pointer_rtx) - { - /* Since call will adjust stack by -UNITS_PER_WORD, - we must convert "disp(stack, index, scale)" to - "disp+UNITS_PER_WORD(stack, index, scale)". */ - if (parts.index) - { - addr = gen_rtx_MULT (Pmode, parts.index, - GEN_INT (parts.scale)); - addr = gen_rtx_PLUS (Pmode, stack_pointer_rtx, - addr); - } - else - addr = stack_pointer_rtx; - - rtx disp; - if (parts.disp != NULL_RTX) - disp = plus_constant (Pmode, parts.disp, - UNITS_PER_WORD); - else - disp = GEN_INT (UNITS_PER_WORD); - - addr = gen_rtx_PLUS (Pmode, addr, disp); - call_op = gen_rtx_MEM (GET_MODE (call_op), addr); - } - } - - output_asm_insn (push_buf, &call_op); - - if (thunk_name != NULL) - fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name); - else - output_indirect_thunk (regno); - - ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, indirectlabel2); - - /* Call. */ - fputs ("\tcall\t", asm_out_file); - assemble_name_raw (asm_out_file, indirectlabel1); - fputc ('\n', asm_out_file); - } -} - -/* Output indirect branch via a call and return thunk. CALL_OP is - the branch target. XASM is the assembly template for CALL_OP. - Branch is a tail call if SIBCALL_P is true. */ - -static void -ix86_output_indirect_branch (rtx call_op, const char *xasm, - bool sibcall_p) -{ - if (REG_P (call_op)) - ix86_output_indirect_branch_via_reg (call_op, sibcall_p); - else - ix86_output_indirect_branch_via_push (call_op, xasm, sibcall_p); -} - -/* Output indirect jump. CALL_OP is the jump target. */ - -const char * -ix86_output_indirect_jmp (rtx call_op) -{ - if (cfun->machine->indirect_branch_type != indirect_branch_keep) - { - /* We can't have red-zone since "call" in the indirect thunk - pushes the return address onto stack, destroying red-zone. */ - if (ix86_red_zone_size != 0) - gcc_unreachable (); - - ix86_output_indirect_branch (call_op, "%0", true); - return ""; - } - else - return "%!jmp\t%A0"; -} - -/* Output return instrumentation for current function if needed. */ - -static void -output_return_instrumentation (void) -{ - if (ix86_instrument_return != instrument_return_none - && flag_fentry - && !DECL_NO_INSTRUMENT_FUNCTION_ENTRY_EXIT (cfun->decl)) - { - if (ix86_flag_record_return) - fprintf (asm_out_file, "1:\n"); - switch (ix86_instrument_return) - { - case instrument_return_call: - fprintf (asm_out_file, "\tcall\t__return__\n"); - break; - case instrument_return_nop5: - /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */ - fprintf (asm_out_file, ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n"); - break; - case instrument_return_none: - break; - } - - if (ix86_flag_record_return) - { - fprintf (asm_out_file, "\t.section __return_loc, \"a\",@progbits\n"); - fprintf (asm_out_file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long"); - fprintf (asm_out_file, "\t.previous\n"); - } - } -} - -/* Output function return. CALL_OP is the jump target. Add a REP - prefix to RET if LONG_P is true and function return is kept. 
*/ - -const char * -ix86_output_function_return (bool long_p) -{ - output_return_instrumentation (); - - if (cfun->machine->function_return_type != indirect_branch_keep) - { - char thunk_name[32]; - enum indirect_thunk_prefix need_prefix - = indirect_thunk_need_prefix (current_output_insn); - - if (cfun->machine->function_return_type - != indirect_branch_thunk_inline) - { - bool need_thunk = (cfun->machine->function_return_type - == indirect_branch_thunk); - indirect_thunk_name (thunk_name, INVALID_REGNUM, need_prefix, - true); - indirect_return_needed |= need_thunk; - fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name); - } - else - output_indirect_thunk (INVALID_REGNUM); - - return ""; - } - - if (!long_p) - return "%!ret"; - - return "rep%; ret"; -} - -/* Output indirect function return. RET_OP is the function return - target. */ - -const char * -ix86_output_indirect_function_return (rtx ret_op) -{ - if (cfun->machine->function_return_type != indirect_branch_keep) - { - char thunk_name[32]; - enum indirect_thunk_prefix need_prefix - = indirect_thunk_need_prefix (current_output_insn); - unsigned int regno = REGNO (ret_op); - gcc_assert (regno == CX_REG); - - if (cfun->machine->function_return_type - != indirect_branch_thunk_inline) - { - bool need_thunk = (cfun->machine->function_return_type - == indirect_branch_thunk); - indirect_thunk_name (thunk_name, regno, need_prefix, true); - - if (need_thunk) - { - indirect_return_via_cx = true; - indirect_thunks_used |= 1 << CX_REG; - } - fprintf (asm_out_file, "\tjmp\t%s\n", thunk_name); - } - else - output_indirect_thunk (regno); - - return ""; - } - else - return "%!jmp\t%A0"; -} - -/* Split simple return with popping POPC bytes from stack to indirect - branch with stack adjustment . */ - -void -ix86_split_simple_return_pop_internal (rtx popc) -{ - struct machine_function *m = cfun->machine; - rtx ecx = gen_rtx_REG (SImode, CX_REG); - rtx_insn *insn; - - /* There is no "pascal" calling convention in any 64bit ABI. */ - gcc_assert (!TARGET_64BIT); - - insn = emit_insn (gen_pop (ecx)); - m->fs.cfa_offset -= UNITS_PER_WORD; - m->fs.sp_offset -= UNITS_PER_WORD; - - rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD); - x = gen_rtx_SET (stack_pointer_rtx, x); - add_reg_note (insn, REG_CFA_ADJUST_CFA, x); - add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx)); - RTX_FRAME_RELATED_P (insn) = 1; - - x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc); - x = gen_rtx_SET (stack_pointer_rtx, x); - insn = emit_insn (x); - add_reg_note (insn, REG_CFA_ADJUST_CFA, x); - RTX_FRAME_RELATED_P (insn) = 1; - - /* Now return address is in ECX. */ - emit_jump_insn (gen_simple_return_indirect_internal (ecx)); -} - -/* Output the assembly for a call instruction. 
*/ - -const char * -ix86_output_call_insn (rtx_insn *insn, rtx call_op) -{ - bool direct_p = constant_call_address_operand (call_op, VOIDmode); - bool output_indirect_p - = (!TARGET_SEH - && cfun->machine->indirect_branch_type != indirect_branch_keep); - bool seh_nop_p = false; - const char *xasm; - - if (SIBLING_CALL_P (insn)) - { - output_return_instrumentation (); - if (direct_p) - { - if (ix86_nopic_noplt_attribute_p (call_op)) - { - direct_p = false; - if (TARGET_64BIT) - { - if (output_indirect_p) - xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; - else - xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; - } - else - { - if (output_indirect_p) - xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}"; - else - xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}"; - } - } - else - xasm = "%!jmp\t%P0"; - } - /* SEH epilogue detection requires the indirect branch case - to include REX.W. */ - else if (TARGET_SEH) - xasm = "%!rex.W jmp\t%A0"; - else - { - if (output_indirect_p) - xasm = "%0"; - else - xasm = "%!jmp\t%A0"; - } - - if (output_indirect_p && !direct_p) - ix86_output_indirect_branch (call_op, xasm, true); - else - output_asm_insn (xasm, &call_op); - return ""; - } - - /* SEH unwinding can require an extra nop to be emitted in several - circumstances. Determine if we have one of those. */ - if (TARGET_SEH) - { - rtx_insn *i; - - for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i)) - { - /* Prevent a catch region from being adjacent to a jump that would - be interpreted as an epilogue sequence by the unwinder. */ - if (JUMP_P(i) && CROSSING_JUMP_P (i)) - { - seh_nop_p = true; - break; - } - - /* If we get to another real insn, we don't need the nop. */ - if (INSN_P (i)) - break; - - /* If we get to the epilogue note, prevent a catch region from - being adjacent to the standard epilogue sequence. If non- - call-exceptions, we'll have done this during epilogue emission. */ - if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG - && !flag_non_call_exceptions - && !can_throw_internal (insn)) - { - seh_nop_p = true; - break; - } - } - - /* If we didn't find a real insn following the call, prevent the - unwinder from looking into the next function. */ - if (i == NULL) - seh_nop_p = true; - } - - if (direct_p) - { - if (ix86_nopic_noplt_attribute_p (call_op)) - { - direct_p = false; - if (TARGET_64BIT) - { - if (output_indirect_p) - xasm = "{%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; - else - xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}"; - } - else - { - if (output_indirect_p) - xasm = "{%p0@GOT|[DWORD PTR %p0@GOT]}"; - else - xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}"; - } - } - else - xasm = "%!call\t%P0"; - } - else - { - if (output_indirect_p) - xasm = "%0"; - else - xasm = "%!call\t%A0"; - } - - if (output_indirect_p && !direct_p) - ix86_output_indirect_branch (call_op, xasm, false); - else - output_asm_insn (xasm, &call_op); - - if (seh_nop_p) - return "nop"; - - return ""; -} - -/* Clear stack slot assignments remembered from previous functions. - This is called from INIT_EXPANDERS once before RTL is emitted for each - function. */ - -static struct machine_function * -ix86_init_machine_status (void) -{ - struct machine_function *f; - - f = ggc_cleared_alloc (); - f->call_abi = ix86_abi; - - return f; -} - -/* Return a MEM corresponding to a stack slot with mode MODE. - Allocate a new slot if necessary. - - The RTL for a function can have several slots available: N is - which slot to use. 
*/ - -rtx -assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n) -{ - struct stack_local_entry *s; - - gcc_assert (n < MAX_386_STACK_LOCALS); - - for (s = ix86_stack_locals; s; s = s->next) - if (s->mode == mode && s->n == n) - return validize_mem (copy_rtx (s->rtl)); - - s = ggc_alloc (); - s->n = n; - s->mode = mode; - s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0); - - s->next = ix86_stack_locals; - ix86_stack_locals = s; - return validize_mem (copy_rtx (s->rtl)); -} - -static void -ix86_instantiate_decls (void) -{ - struct stack_local_entry *s; - - for (s = ix86_stack_locals; s; s = s->next) - if (s->rtl != NULL_RTX) - instantiate_decl_rtl (s->rtl); -} - -/* Check whether x86 address PARTS is a pc-relative address. */ - -bool -ix86_rip_relative_addr_p (struct ix86_address *parts) -{ - rtx base, index, disp; - - base = parts->base; - index = parts->index; - disp = parts->disp; - - if (disp && !base && !index) - { - if (TARGET_64BIT) - { - rtx symbol = disp; - - if (GET_CODE (disp) == CONST) - symbol = XEXP (disp, 0); - if (GET_CODE (symbol) == PLUS - && CONST_INT_P (XEXP (symbol, 1))) - symbol = XEXP (symbol, 0); - - if (GET_CODE (symbol) == LABEL_REF - || (GET_CODE (symbol) == SYMBOL_REF - && SYMBOL_REF_TLS_MODEL (symbol) == 0) - || (GET_CODE (symbol) == UNSPEC - && (XINT (symbol, 1) == UNSPEC_GOTPCREL - || XINT (symbol, 1) == UNSPEC_PCREL - || XINT (symbol, 1) == UNSPEC_GOTNTPOFF))) - return true; - } - } - return false; -} - -/* Calculate the length of the memory address in the instruction encoding. - Includes addr32 prefix, does not include the one-byte modrm, opcode, - or other prefixes. We never generate addr32 prefix for LEA insn. */ - -int -memory_address_length (rtx addr, bool lea) -{ - struct ix86_address parts; - rtx base, index, disp; - int len; - int ok; - - if (GET_CODE (addr) == PRE_DEC - || GET_CODE (addr) == POST_INC - || GET_CODE (addr) == PRE_MODIFY - || GET_CODE (addr) == POST_MODIFY) - return 0; - - ok = ix86_decompose_address (addr, &parts); - gcc_assert (ok); - - len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1; - - /* If this is not LEA instruction, add the length of addr32 prefix. */ - if (TARGET_64BIT && !lea - && (SImode_address_operand (addr, VOIDmode) - || (parts.base && GET_MODE (parts.base) == SImode) - || (parts.index && GET_MODE (parts.index) == SImode))) - len++; - - base = parts.base; - index = parts.index; - disp = parts.disp; - - if (base && SUBREG_P (base)) - base = SUBREG_REG (base); - if (index && SUBREG_P (index)) - index = SUBREG_REG (index); - - gcc_assert (base == NULL_RTX || REG_P (base)); - gcc_assert (index == NULL_RTX || REG_P (index)); - - /* Rule of thumb: - - esp as the base always wants an index, - - ebp as the base always wants a displacement, - - r12 as the base always wants an index, - - r13 as the base always wants a displacement. */ - - /* Register Indirect. */ - if (base && !index && !disp) - { - /* esp (for its index) and ebp (for its displacement) need - the two-byte modrm form. Similarly for r12 and r13 in 64-bit - code. */ - if (base == arg_pointer_rtx - || base == frame_pointer_rtx - || REGNO (base) == SP_REG - || REGNO (base) == BP_REG - || REGNO (base) == R12_REG - || REGNO (base) == R13_REG) - len++; - } - - /* Direct Addressing. In 64-bit mode mod 00 r/m 5 - is not disp32, but disp32(%rip), so for disp32 - SIB byte is needed, unless print_operand_address - optimizes it into disp32(%rip) or (%rip) is implied - by UNSPEC. 
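The rules of thumb above translate into a small amount of arithmetic over the address parts. A deliberately simplified sketch, counting only the SIB and displacement bytes on top of the mandatory ModRM byte for 64-bit code, with the rip-relative special case and all prefixes left out; sp_like stands for esp/r12 and bp_like for ebp/r13:

#include <stdbool.h>

static int
addr_extra_bytes (bool has_base, bool sp_like, bool bp_like,
                  bool has_index, bool has_disp, bool disp_is_8bit)
{
  int len = 0;

  if (has_base && !has_index && !has_disp)
    /* SIB for esp/r12, forced disp8 for ebp/r13.  */
    return (sp_like || bp_like) ? 1 : 0;

  if (!has_base && !has_index)
    /* Absolute disp32 needs a SIB byte in 64-bit code.  */
    return 4 + 1;

  if (has_disp)
    len += (has_base && disp_is_8bit) ? 1 : 4;
  else if (bp_like)
    len += 1;                           /* ebp/r13 always want a displacement */

  if (has_index || sp_like)
    len += 1;                           /* SIB byte */

  return len;
}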
*/ - else if (disp && !base && !index) - { - len += 4; - if (!ix86_rip_relative_addr_p (&parts)) - len++; - } - else - { - /* Find the length of the displacement constant. */ - if (disp) - { - if (base && satisfies_constraint_K (disp)) - len += 1; - else - len += 4; - } - /* ebp always wants a displacement. Similarly r13. */ - else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG)) - len++; - - /* An index requires the two-byte modrm form.... */ - if (index - /* ...like esp (or r12), which always wants an index. */ - || base == arg_pointer_rtx - || base == frame_pointer_rtx - || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG))) - len++; - } - - return len; -} - -/* Compute default value for "length_immediate" attribute. When SHORTFORM - is set, expect that insn have 8bit immediate alternative. */ -int -ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform) -{ - int len = 0; - int i; - extract_insn_cached (insn); - for (i = recog_data.n_operands - 1; i >= 0; --i) - if (CONSTANT_P (recog_data.operand[i])) - { - enum attr_mode mode = get_attr_mode (insn); - - gcc_assert (!len); - if (shortform && CONST_INT_P (recog_data.operand[i])) - { - HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]); - switch (mode) - { - case MODE_QI: - len = 1; - continue; - case MODE_HI: - ival = trunc_int_for_mode (ival, HImode); - break; - case MODE_SI: - ival = trunc_int_for_mode (ival, SImode); - break; - default: - break; - } - if (IN_RANGE (ival, -128, 127)) - { - len = 1; - continue; - } - } - switch (mode) - { - case MODE_QI: - len = 1; - break; - case MODE_HI: - len = 2; - break; - case MODE_SI: - len = 4; - break; - /* Immediates for DImode instructions are encoded - as 32bit sign extended values. */ - case MODE_DI: - len = 4; - break; - default: - fatal_insn ("unknown insn mode", insn); - } - } - return len; -} - -/* Compute default value for "length_address" attribute. */ -int -ix86_attr_length_address_default (rtx_insn *insn) -{ - int i; - - if (get_attr_type (insn) == TYPE_LEA) - { - rtx set = PATTERN (insn), addr; - - if (GET_CODE (set) == PARALLEL) - set = XVECEXP (set, 0, 0); - - gcc_assert (GET_CODE (set) == SET); - - addr = SET_SRC (set); - - return memory_address_length (addr, true); - } - - extract_insn_cached (insn); - for (i = recog_data.n_operands - 1; i >= 0; --i) - { - rtx op = recog_data.operand[i]; - if (MEM_P (op)) - { - constrain_operands_cached (insn, reload_completed); - if (which_alternative != -1) - { - const char *constraints = recog_data.constraints[i]; - int alt = which_alternative; - - while (*constraints == '=' || *constraints == '+') - constraints++; - while (alt-- > 0) - while (*constraints++ != ',') - ; - /* Skip ignored operands. */ - if (*constraints == 'X') - continue; - } - - int len = memory_address_length (XEXP (op, 0), false); - - /* Account for segment prefix for non-default addr spaces. */ - if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op))) - len++; - - return len; - } - } - return 0; -} - -/* Compute default value for "length_vex" attribute. It includes - 2 or 3 byte VEX prefix and 1 opcode byte. */ - -int -ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode, - bool has_vex_w) -{ - int i; - - /* Only 0f opcode can use 2 byte VEX prefix and VEX W bit uses 3 - byte VEX prefix. */ - if (!has_0f_opcode || has_vex_w) - return 3 + 1; - - /* We can always use 2 byte VEX prefix in 32bit. 
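ix86_attr_length_immediate_default boils down to a mode-based table with one shortcut: when the insn has an imm8 alternative and the constant fits in signed 8 bits, a single byte suffices; otherwise QI/HI/SI immediates take 1/2/4 bytes and DImode immediates are encoded as sign-extended 32-bit values. A simplified rendering (the truncation of the constant to the operand mode is omitted here):

#include <stdbool.h>

static int
imm_length (int mode_bytes, long long ival, bool has_imm8_form)
{
  if (has_imm8_form && ival >= -128 && ival <= 127)
    return 1;                       /* short form: sign-extended imm8 */
  if (mode_bytes >= 4)
    return 4;                       /* SImode and DImode both use imm32 */
  return mode_bytes;                /* 1 for QImode, 2 for HImode */
}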
*/ - if (!TARGET_64BIT) - return 2 + 1; - - extract_insn_cached (insn); - - for (i = recog_data.n_operands - 1; i >= 0; --i) - if (REG_P (recog_data.operand[i])) - { - /* REX.W bit uses 3 byte VEX prefix. */ - if (GET_MODE (recog_data.operand[i]) == DImode - && GENERAL_REG_P (recog_data.operand[i])) - return 3 + 1; - } - else - { - /* REX.X or REX.B bits use 3 byte VEX prefix. */ - if (MEM_P (recog_data.operand[i]) - && x86_extended_reg_mentioned_p (recog_data.operand[i])) - return 3 + 1; - } - - return 2 + 1; -} - - -static bool -ix86_class_likely_spilled_p (reg_class_t); - -/* Returns true if lhs of insn is HW function argument register and set up - is_spilled to true if it is likely spilled HW register. */ -static bool -insn_is_function_arg (rtx insn, bool* is_spilled) -{ - rtx dst; - - if (!NONDEBUG_INSN_P (insn)) - return false; - /* Call instructions are not movable, ignore it. */ - if (CALL_P (insn)) - return false; - insn = PATTERN (insn); - if (GET_CODE (insn) == PARALLEL) - insn = XVECEXP (insn, 0, 0); - if (GET_CODE (insn) != SET) - return false; - dst = SET_DEST (insn); - if (REG_P (dst) && HARD_REGISTER_P (dst) - && ix86_function_arg_regno_p (REGNO (dst))) - { - /* Is it likely spilled HW register? */ - if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst)) - && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst)))) - *is_spilled = true; - return true; - } - return false; -} - -/* Add output dependencies for chain of function adjacent arguments if only - there is a move to likely spilled HW register. Return first argument - if at least one dependence was added or NULL otherwise. */ -static rtx_insn * -add_parameter_dependencies (rtx_insn *call, rtx_insn *head) -{ - rtx_insn *insn; - rtx_insn *last = call; - rtx_insn *first_arg = NULL; - bool is_spilled = false; - - head = PREV_INSN (head); - - /* Find nearest to call argument passing instruction. */ - while (true) - { - last = PREV_INSN (last); - if (last == head) - return NULL; - if (!NONDEBUG_INSN_P (last)) - continue; - if (insn_is_function_arg (last, &is_spilled)) - break; - return NULL; - } - - first_arg = last; - while (true) - { - insn = PREV_INSN (last); - if (!INSN_P (insn)) - break; - if (insn == head) - break; - if (!NONDEBUG_INSN_P (insn)) - { - last = insn; - continue; - } - if (insn_is_function_arg (insn, &is_spilled)) - { - /* Add output depdendence between two function arguments if chain - of output arguments contains likely spilled HW registers. */ - if (is_spilled) - add_dependence (first_arg, insn, REG_DEP_OUTPUT); - first_arg = last = insn; - } - else - break; - } - if (!is_spilled) - return NULL; - return first_arg; -} - -/* Add output or anti dependency from insn to first_arg to restrict its code - motion. */ -static void -avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn) -{ - rtx set; - rtx tmp; - - set = single_set (insn); - if (!set) - return; - tmp = SET_DEST (set); - if (REG_P (tmp)) - { - /* Add output dependency to the first function argument. */ - add_dependence (first_arg, insn, REG_DEP_OUTPUT); - return; - } - /* Add anti dependency. */ - add_dependence (first_arg, insn, REG_DEP_ANTI); -} - -/* Avoid cross block motion of function argument through adding dependency - from the first non-jump instruction in bb. 
*/ -static void -add_dependee_for_func_arg (rtx_insn *arg, basic_block bb) -{ - rtx_insn *insn = BB_END (bb); - - while (insn) - { - if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn)) - { - rtx set = single_set (insn); - if (set) - { - avoid_func_arg_motion (arg, insn); - return; - } - } - if (insn == BB_HEAD (bb)) - return; - insn = PREV_INSN (insn); - } -} - -/* Hook for pre-reload schedule - avoid motion of function arguments - passed in likely spilled HW registers. */ -static void -ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail) -{ - rtx_insn *insn; - rtx_insn *first_arg = NULL; - if (reload_completed) - return; - while (head != tail && DEBUG_INSN_P (head)) - head = NEXT_INSN (head); - for (insn = tail; insn != head; insn = PREV_INSN (insn)) - if (INSN_P (insn) && CALL_P (insn)) - { - first_arg = add_parameter_dependencies (insn, head); - if (first_arg) - { - /* Add dependee for first argument to predecessors if only - region contains more than one block. */ - basic_block bb = BLOCK_FOR_INSN (insn); - int rgn = CONTAINING_RGN (bb->index); - int nr_blks = RGN_NR_BLOCKS (rgn); - /* Skip trivial regions and region head blocks that can have - predecessors outside of region. */ - if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0) - { - edge e; - edge_iterator ei; - - /* Regions are SCCs with the exception of selective - scheduling with pipelining of outer blocks enabled. - So also check that immediate predecessors of a non-head - block are in the same region. */ - FOR_EACH_EDGE (e, ei, bb->preds) - { - /* Avoid creating of loop-carried dependencies through - using topological ordering in the region. */ - if (rgn == CONTAINING_RGN (e->src->index) - && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index)) - add_dependee_for_func_arg (first_arg, e->src); - } - } - insn = first_arg; - if (insn == head) - break; - } - } - else if (first_arg) - avoid_func_arg_motion (first_arg, insn); -} - -/* Hook for pre-reload schedule - set priority of moves from likely spilled - HW registers to maximum, to schedule them at soon as possible. These are - moves from function argument registers at the top of the function entry - and moves from function return value registers after call. */ -static int -ix86_adjust_priority (rtx_insn *insn, int priority) -{ - rtx set; - - if (reload_completed) - return priority; - - if (!NONDEBUG_INSN_P (insn)) - return priority; - - set = single_set (insn); - if (set) - { - rtx tmp = SET_SRC (set); - if (REG_P (tmp) - && HARD_REGISTER_P (tmp) - && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp)) - && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp)))) - return current_sched_info->sched_max_insns_priority; - } - - return priority; -} - -/* Prepare for scheduling pass. */ -static void -ix86_sched_init_global (FILE *, int, int) -{ - /* Install scheduling hooks for current CPU. Some of these hooks are used - in time-critical parts of the scheduler, so we only set them up when - they are actually used. */ - switch (ix86_tune) - { - case PROCESSOR_CORE2: - case PROCESSOR_NEHALEM: - case PROCESSOR_SANDYBRIDGE: - case PROCESSOR_HASWELL: - case PROCESSOR_GENERIC: - /* Do not perform multipass scheduling for pre-reload schedule - to save compile time. */ - if (reload_completed) - { - ix86_core2i7_init_hooks (); - break; - } - /* Fall through. 
*/ - default: - targetm.sched.dfa_post_advance_cycle = NULL; - targetm.sched.first_cycle_multipass_init = NULL; - targetm.sched.first_cycle_multipass_begin = NULL; - targetm.sched.first_cycle_multipass_issue = NULL; - targetm.sched.first_cycle_multipass_backtrack = NULL; - targetm.sched.first_cycle_multipass_end = NULL; - targetm.sched.first_cycle_multipass_fini = NULL; - break; - } -} - - -/* Implement TARGET_STATIC_RTX_ALIGNMENT. */ - -static HOST_WIDE_INT -ix86_static_rtx_alignment (machine_mode mode) -{ - if (mode == DFmode) - return 64; - if (ALIGN_MODE_128 (mode)) - return MAX (128, GET_MODE_ALIGNMENT (mode)); - return GET_MODE_ALIGNMENT (mode); -} - -/* Implement TARGET_CONSTANT_ALIGNMENT. */ - -static HOST_WIDE_INT -ix86_constant_alignment (const_tree exp, HOST_WIDE_INT align) -{ - if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST - || TREE_CODE (exp) == INTEGER_CST) - { - machine_mode mode = TYPE_MODE (TREE_TYPE (exp)); - HOST_WIDE_INT mode_align = ix86_static_rtx_alignment (mode); - return MAX (mode_align, align); - } - else if (!optimize_size && TREE_CODE (exp) == STRING_CST - && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD) - return BITS_PER_WORD; - - return align; -} - -/* Implement TARGET_EMPTY_RECORD_P. */ - -static bool -ix86_is_empty_record (const_tree type) -{ - if (!TARGET_64BIT) - return false; - return default_is_empty_record (type); -} - -/* Implement TARGET_WARN_PARAMETER_PASSING_ABI. */ - -static void -ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type) -{ - CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); - - if (!cum->warn_empty) - return; - - if (!TYPE_EMPTY_P (type)) - return; - - /* Don't warn if the function isn't visible outside of the TU. */ - if (cum->decl && !TREE_PUBLIC (cum->decl)) - return; - - const_tree ctx = get_ultimate_context (cum->decl); - if (ctx != NULL_TREE - && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx)) - return; - - /* If the actual size of the type is zero, then there is no change - in how objects of this size are passed. */ - if (int_size_in_bytes (type) == 0) - return; - - warning (OPT_Wabi, "empty class %qT parameter passing ABI " - "changes in %<-fabi-version=12%> (GCC 8)", type); - - /* Only warn once. */ - cum->warn_empty = false; -} - -/* This hook returns name of multilib ABI. */ - -static const char * -ix86_get_multilib_abi_name (void) -{ - if (!(TARGET_64BIT_P (ix86_isa_flags))) - return "i386"; - else if (TARGET_X32_P (ix86_isa_flags)) - return "x32"; - else - return "x86_64"; -} - -/* Compute the alignment for a variable for Intel MCU psABI. TYPE is - the data type, and ALIGN is the alignment that the object would - ordinarily have. */ - -static int -iamcu_alignment (tree type, int align) -{ - machine_mode mode; - - if (align < 32 || TYPE_USER_ALIGN (type)) - return align; - - /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4 - bytes. */ - mode = TYPE_MODE (strip_array_types (type)); - switch (GET_MODE_CLASS (mode)) - { - case MODE_INT: - case MODE_COMPLEX_INT: - case MODE_COMPLEX_FLOAT: - case MODE_FLOAT: - case MODE_DECIMAL_FLOAT: - return 32; - default: - return align; - } -} - -/* Compute the alignment for a static variable. - TYPE is the data type, and ALIGN is the alignment that - the object would ordinarily have. The value of this function is used - instead of that alignment to align the object. 
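   (Editorial example, not part of the patch.)  Roughly what the rules
   implemented below do for file-scope data when compiling for x86-64 at -O2
   with default tuning; the variable names are hypothetical:

     static char small_buf[8];   /* 8 bytes: keeps its natural alignment      */
     static char abi_buf[24];    /* >= 16 bytes: the x86-64 ABI rule below
                                    gives it 16-byte alignment                */
     static long big_table[64];  /* at least a cache line and optimizing:
                                    bumped to the prefetch-block (cache-line)
                                    alignment                                 */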
*/ - -int -ix86_data_alignment (tree type, unsigned int align, bool opt) -{ - /* GCC 4.8 and earlier used to incorrectly assume this alignment even - for symbols from other compilation units or symbols that don't need - to bind locally. In order to preserve some ABI compatibility with - those compilers, ensure we don't decrease alignment from what we - used to assume. */ - - unsigned int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT); - - /* A data structure, equal or greater than the size of a cache line - (64 bytes in the Pentium 4 and other recent Intel processors, including - processors based on Intel Core microarchitecture) should be aligned - so that its base address is a multiple of a cache line size. */ - - unsigned int max_align - = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT); - - if (max_align < BITS_PER_WORD) - max_align = BITS_PER_WORD; - - switch (ix86_align_data_type) - { - case ix86_align_data_type_abi: opt = false; break; - case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break; - case ix86_align_data_type_cacheline: break; - } - - if (TARGET_IAMCU) - align = iamcu_alignment (type, align); - - if (opt - && AGGREGATE_TYPE_P (type) - && TYPE_SIZE (type) - && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST) - { - if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align_compat) - && align < max_align_compat) - align = max_align_compat; - if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align) - && align < max_align) - align = max_align; - } - - /* x86-64 ABI requires arrays greater than 16 bytes to be aligned - to 16byte boundary. */ - if (TARGET_64BIT) - { - if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE) - && TYPE_SIZE (type) - && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST - && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128) - && align < 128) - return 128; - } - - if (!opt) - return align; - - if (TREE_CODE (type) == ARRAY_TYPE) - { - if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64) - return 64; - if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128) - return 128; - } - else if (TREE_CODE (type) == COMPLEX_TYPE) - { - - if (TYPE_MODE (type) == DCmode && align < 64) - return 64; - if ((TYPE_MODE (type) == XCmode - || TYPE_MODE (type) == TCmode) && align < 128) - return 128; - } - else if ((TREE_CODE (type) == RECORD_TYPE - || TREE_CODE (type) == UNION_TYPE - || TREE_CODE (type) == QUAL_UNION_TYPE) - && TYPE_FIELDS (type)) - { - if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64) - return 64; - if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128) - return 128; - } - else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE - || TREE_CODE (type) == INTEGER_TYPE) - { - if (TYPE_MODE (type) == DFmode && align < 64) - return 64; - if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128) - return 128; - } - - return align; -} - -/* Compute the alignment for a local variable or a stack slot. EXP is - the data type or decl itself, MODE is the widest mode available and - ALIGN is the alignment that the object would ordinarily have. The - value of this macro is used instead of that alignment to align the - object. */ - -unsigned int -ix86_local_alignment (tree exp, machine_mode mode, - unsigned int align) -{ - tree type, decl; - - if (exp && DECL_P (exp)) - { - type = TREE_TYPE (exp); - decl = exp; - } - else - { - type = exp; - decl = NULL; - } - - /* Don't do dynamic stack realignment for long long objects with - -mpreferred-stack-boundary=2. 
*/ - if (!TARGET_64BIT - && align == 64 - && ix86_preferred_stack_boundary < 64 - && (mode == DImode || (type && TYPE_MODE (type) == DImode)) - && (!type || !TYPE_USER_ALIGN (type)) - && (!decl || !DECL_USER_ALIGN (decl))) - align = 32; - - /* If TYPE is NULL, we are allocating a stack slot for caller-save - register in MODE. We will return the largest alignment of XF - and DF. */ - if (!type) - { - if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode)) - align = GET_MODE_ALIGNMENT (DFmode); - return align; - } - - /* Don't increase alignment for Intel MCU psABI. */ - if (TARGET_IAMCU) - return align; - - /* x86-64 ABI requires arrays greater than 16 bytes to be aligned - to 16byte boundary. Exact wording is: - - An array uses the same alignment as its elements, except that a local or - global array variable of length at least 16 bytes or - a C99 variable-length array variable always has alignment of at least 16 bytes. - - This was added to allow use of aligned SSE instructions at arrays. This - rule is meant for static storage (where compiler cannot do the analysis - by itself). We follow it for automatic variables only when convenient. - We fully control everything in the function compiled and functions from - other unit cannot rely on the alignment. - - Exclude va_list type. It is the common case of local array where - we cannot benefit from the alignment. - - TODO: Probably one should optimize for size only when var is not escaping. */ - if (TARGET_64BIT && optimize_function_for_speed_p (cfun) - && TARGET_SSE) - { - if (AGGREGATE_TYPE_P (type) - && (va_list_type_node == NULL_TREE - || (TYPE_MAIN_VARIANT (type) - != TYPE_MAIN_VARIANT (va_list_type_node))) - && TYPE_SIZE (type) - && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST - && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128) - && align < 128) - return 128; - } - if (TREE_CODE (type) == ARRAY_TYPE) - { - if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64) - return 64; - if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128) - return 128; - } - else if (TREE_CODE (type) == COMPLEX_TYPE) - { - if (TYPE_MODE (type) == DCmode && align < 64) - return 64; - if ((TYPE_MODE (type) == XCmode - || TYPE_MODE (type) == TCmode) && align < 128) - return 128; - } - else if ((TREE_CODE (type) == RECORD_TYPE - || TREE_CODE (type) == UNION_TYPE - || TREE_CODE (type) == QUAL_UNION_TYPE) - && TYPE_FIELDS (type)) - { - if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64) - return 64; - if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128) - return 128; - } - else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE - || TREE_CODE (type) == INTEGER_TYPE) - { - - if (TYPE_MODE (type) == DFmode && align < 64) - return 64; - if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128) - return 128; - } - return align; -} - -/* Compute the minimum required alignment for dynamic stack realignment - purposes for a local variable, parameter or a stack slot. EXP is - the data type or decl itself, MODE is its mode and ALIGN is the - alignment that the object would ordinarily have. */ - -unsigned int -ix86_minimum_alignment (tree exp, machine_mode mode, - unsigned int align) -{ - tree type, decl; - - if (exp && DECL_P (exp)) - { - type = TREE_TYPE (exp); - decl = exp; - } - else - { - type = exp; - decl = NULL; - } - - if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64) - return align; - - /* Don't do dynamic stack realignment for long long objects with - -mpreferred-stack-boundary=2. 
*/ - if ((mode == DImode || (type && TYPE_MODE (type) == DImode)) - && (!type || !TYPE_USER_ALIGN (type)) - && (!decl || !DECL_USER_ALIGN (decl))) - { - gcc_checking_assert (!TARGET_STV); - return 32; - } - - return align; -} - -/* Find a location for the static chain incoming to a nested function. - This is a register, unless all free registers are used by arguments. */ - -static rtx -ix86_static_chain (const_tree fndecl_or_type, bool incoming_p) -{ - unsigned regno; - - if (TARGET_64BIT) - { - /* We always use R10 in 64-bit mode. */ - regno = R10_REG; - } - else - { - const_tree fntype, fndecl; - unsigned int ccvt; - - /* By default in 32-bit mode we use ECX to pass the static chain. */ - regno = CX_REG; - - if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL) - { - fntype = TREE_TYPE (fndecl_or_type); - fndecl = fndecl_or_type; - } - else - { - fntype = fndecl_or_type; - fndecl = NULL; - } - - ccvt = ix86_get_callcvt (fntype); - if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) - { - /* Fastcall functions use ecx/edx for arguments, which leaves - us with EAX for the static chain. - Thiscall functions use ecx for arguments, which also - leaves us with EAX for the static chain. */ - regno = AX_REG; - } - else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) - { - /* Thiscall functions use ecx for arguments, which leaves - us with EAX and EDX for the static chain. - We are using for abi-compatibility EAX. */ - regno = AX_REG; - } - else if (ix86_function_regparm (fntype, fndecl) == 3) - { - /* For regparm 3, we have no free call-clobbered registers in - which to store the static chain. In order to implement this, - we have the trampoline push the static chain to the stack. - However, we can't push a value below the return address when - we call the nested function directly, so we have to use an - alternate entry point. For this we use ESI, and have the - alternate entry point push ESI, so that things appear the - same once we're executing the nested function. */ - if (incoming_p) - { - if (fndecl == current_function_decl - && !ix86_static_chain_on_stack) - { - gcc_assert (!reload_completed); - ix86_static_chain_on_stack = true; - } - return gen_frame_mem (SImode, - plus_constant (Pmode, - arg_pointer_rtx, -8)); - } - regno = SI_REG; - } - } - - return gen_rtx_REG (Pmode, regno); -} - -/* Emit RTL insns to initialize the variable parts of a trampoline. - FNDECL is the decl of the target address; M_TRAMP is a MEM for - the trampoline, and CHAIN_VALUE is an RTX for the static chain - to be passed to the target function. */ - -static void -ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) -{ - rtx mem, fnaddr; - int opcode; - int offset = 0; - bool need_endbr = (flag_cf_protection & CF_BRANCH); - - fnaddr = XEXP (DECL_RTL (fndecl), 0); - - if (TARGET_64BIT) - { - int size; - - if (need_endbr) - { - /* Insert ENDBR64. */ - mem = adjust_address (m_tramp, SImode, offset); - emit_move_insn (mem, gen_int_mode (0xfa1e0ff3, SImode)); - offset += 4; - } - - /* Load the function address to r11. Try to load address using - the shorter movl instead of movabs. We may want to support - movq for kernel mode, but kernel does not use trampolines at - the moment. FNADDR is a 32bit address and may not be in - DImode when ptr_mode == SImode. Always use movl in this - case. 
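   (Editorial worked example, not in the original.)  For the common LP64 case,
   with no ENDBR and a function address that does not fit the shorter movl form,
   the 24 bytes emitted below decode as follows, where FN and CHAIN stand for
   the 64-bit function address and static-chain value:

     49 bb <FN:8 bytes>      movabs $FN, %r11
     49 ba <CHAIN:8 bytes>   movabs $CHAIN, %r10
     49 ff e3                jmp    *%r11
     90                      nop   (pads the last write to a full 32-bit store)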
*/ - if (ptr_mode == SImode - || x86_64_zext_immediate_operand (fnaddr, VOIDmode)) - { - fnaddr = copy_addr_to_reg (fnaddr); - - mem = adjust_address (m_tramp, HImode, offset); - emit_move_insn (mem, gen_int_mode (0xbb41, HImode)); - - mem = adjust_address (m_tramp, SImode, offset + 2); - emit_move_insn (mem, gen_lowpart (SImode, fnaddr)); - offset += 6; - } - else - { - mem = adjust_address (m_tramp, HImode, offset); - emit_move_insn (mem, gen_int_mode (0xbb49, HImode)); - - mem = adjust_address (m_tramp, DImode, offset + 2); - emit_move_insn (mem, fnaddr); - offset += 10; - } - - /* Load static chain using movabs to r10. Use the shorter movl - instead of movabs when ptr_mode == SImode. */ - if (ptr_mode == SImode) - { - opcode = 0xba41; - size = 6; - } - else - { - opcode = 0xba49; - size = 10; - } - - mem = adjust_address (m_tramp, HImode, offset); - emit_move_insn (mem, gen_int_mode (opcode, HImode)); - - mem = adjust_address (m_tramp, ptr_mode, offset + 2); - emit_move_insn (mem, chain_value); - offset += size; - - /* Jump to r11; the last (unused) byte is a nop, only there to - pad the write out to a single 32-bit store. */ - mem = adjust_address (m_tramp, SImode, offset); - emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode)); - offset += 4; - } - else - { - rtx disp, chain; - - /* Depending on the static chain location, either load a register - with a constant, or push the constant to the stack. All of the - instructions are the same size. */ - chain = ix86_static_chain (fndecl, true); - if (REG_P (chain)) - { - switch (REGNO (chain)) - { - case AX_REG: - opcode = 0xb8; break; - case CX_REG: - opcode = 0xb9; break; - default: - gcc_unreachable (); - } - } - else - opcode = 0x68; - - if (need_endbr) - { - /* Insert ENDBR32. */ - mem = adjust_address (m_tramp, SImode, offset); - emit_move_insn (mem, gen_int_mode (0xfb1e0ff3, SImode)); - offset += 4; - } - - mem = adjust_address (m_tramp, QImode, offset); - emit_move_insn (mem, gen_int_mode (opcode, QImode)); - - mem = adjust_address (m_tramp, SImode, offset + 1); - emit_move_insn (mem, chain_value); - offset += 5; - - mem = adjust_address (m_tramp, QImode, offset); - emit_move_insn (mem, gen_int_mode (0xe9, QImode)); - - mem = adjust_address (m_tramp, SImode, offset + 1); - - /* Compute offset from the end of the jmp to the target function. - In the case in which the trampoline stores the static chain on - the stack, we need to skip the first insn which pushes the - (call-saved) register static chain; this push is 1 byte. */ - offset += 5; - disp = expand_binop (SImode, sub_optab, fnaddr, - plus_constant (Pmode, XEXP (m_tramp, 0), - offset - (MEM_P (chain) ? 1 : 0)), - NULL_RTX, 1, OPTAB_DIRECT); - emit_move_insn (mem, disp); - } - - gcc_assert (offset <= TRAMPOLINE_SIZE); - -#ifdef HAVE_ENABLE_EXECUTE_STACK -#ifdef CHECK_EXECUTE_STACK_ENABLED - if (CHECK_EXECUTE_STACK_ENABLED) -#endif - emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"), - LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode); -#endif -} - -static bool -ix86_allocate_stack_slots_for_args (void) -{ - /* Naked functions should not allocate stack slots for arguments. */ - return !ix86_function_naked (current_function_decl); -} - -static bool -ix86_warn_func_return (tree decl) -{ - /* Naked functions are implemented entirely in assembly, including the - return sequence, so suppress warnings about this. 
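   (Editorial usage example, not in the original; the function below is
   hypothetical user code.)  The naked-function handling in these two hooks is
   what lets the following compile quietly on x86-64: no stack slot is carved
   out for the argument, and no missing-return warning is issued, because the
   body is entirely hand-written assembly:

     __attribute__((naked))
     int add1 (int x)
     {
       /* x86-64 SysV: argument in %edi, result in %eax.  */
       __asm__ ("leal\t1(%rdi), %eax\n\tret");
     }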
*/ - return !ix86_function_naked (decl); -} - -/* The following file contains several enumerations and data structures - built from the definitions in i386-builtin-types.def. */ - -#include "i386-builtin-types.inc" - -/* Table for the ix86 builtin non-function types. */ -static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1]; - -/* Retrieve an element from the above table, building some of - the types lazily. */ - -static tree -ix86_get_builtin_type (enum ix86_builtin_type tcode) -{ - unsigned int index; - tree type, itype; - - gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab)); - - type = ix86_builtin_type_tab[(int) tcode]; - if (type != NULL) - return type; - - gcc_assert (tcode > IX86_BT_LAST_PRIM); - if (tcode <= IX86_BT_LAST_VECT) - { - machine_mode mode; - - index = tcode - IX86_BT_LAST_PRIM - 1; - itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]); - mode = ix86_builtin_type_vect_mode[index]; - - type = build_vector_type_for_mode (itype, mode); - } - else - { - int quals; - - index = tcode - IX86_BT_LAST_VECT - 1; - if (tcode <= IX86_BT_LAST_PTR) - quals = TYPE_UNQUALIFIED; - else - quals = TYPE_QUAL_CONST; - - itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]); - if (quals != TYPE_UNQUALIFIED) - itype = build_qualified_type (itype, quals); - - type = build_pointer_type (itype); - } - - ix86_builtin_type_tab[(int) tcode] = type; - return type; -} - -/* Table for the ix86 builtin function types. */ -static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1]; - -/* Retrieve an element from the above table, building some of - the types lazily. */ - -static tree -ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode) -{ - tree type; - - gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab)); - - type = ix86_builtin_func_type_tab[(int) tcode]; - if (type != NULL) - return type; - - if (tcode <= IX86_BT_LAST_FUNC) - { - unsigned start = ix86_builtin_func_start[(int) tcode]; - unsigned after = ix86_builtin_func_start[(int) tcode + 1]; - tree rtype, atype, args = void_list_node; - unsigned i; - - rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]); - for (i = after - 1; i > start; --i) - { - atype = ix86_get_builtin_type (ix86_builtin_func_args[i]); - args = tree_cons (NULL, atype, args); - } - - type = build_function_type (rtype, args); - } - else - { - unsigned index = tcode - IX86_BT_LAST_FUNC - 1; - enum ix86_builtin_func_type icode; - - icode = ix86_builtin_func_alias_base[index]; - type = ix86_get_builtin_func_type (icode); - } - - ix86_builtin_func_type_tab[(int) tcode] = type; - return type; -} - - -/* Codes for all the SSE/MMX builtins. Builtins not mentioned in any - bdesc_* arrays below should come first, then builtins for each bdesc_* - array in ascending order, so that we can use direct array accesses. 
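   (Editorial sketch, not in the original; the lookup function below is
   hypothetical.)  What the ordering requirement above buys: each bdesc_* block
   of enumerators is contiguous and in the same order as its table, so a builtin
   code maps to its descriptor by plain subtraction, roughly along these lines:

     /* Sketch of how the expanders index the descriptor tables.  */
     static const struct builtin_description *
     lookup_args_bdesc (enum ix86_builtins code)
     {
       gcc_assert (code >= IX86_BUILTIN__BDESC_ARGS_FIRST
                   && code <= IX86_BUILTIN__BDESC_ARGS_LAST);
       return &bdesc_args[code - IX86_BUILTIN__BDESC_ARGS_FIRST];
     }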
*/ -enum ix86_builtins -{ - IX86_BUILTIN_MASKMOVQ, - IX86_BUILTIN_LDMXCSR, - IX86_BUILTIN_STMXCSR, - IX86_BUILTIN_MASKMOVDQU, - IX86_BUILTIN_PSLLDQ128, - IX86_BUILTIN_CLFLUSH, - IX86_BUILTIN_MONITOR, - IX86_BUILTIN_MWAIT, - IX86_BUILTIN_UMONITOR, - IX86_BUILTIN_UMWAIT, - IX86_BUILTIN_TPAUSE, - IX86_BUILTIN_CLZERO, - IX86_BUILTIN_CLDEMOTE, - IX86_BUILTIN_VEC_INIT_V2SI, - IX86_BUILTIN_VEC_INIT_V4HI, - IX86_BUILTIN_VEC_INIT_V8QI, - IX86_BUILTIN_VEC_EXT_V2DF, - IX86_BUILTIN_VEC_EXT_V2DI, - IX86_BUILTIN_VEC_EXT_V4SF, - IX86_BUILTIN_VEC_EXT_V4SI, - IX86_BUILTIN_VEC_EXT_V8HI, - IX86_BUILTIN_VEC_EXT_V2SI, - IX86_BUILTIN_VEC_EXT_V4HI, - IX86_BUILTIN_VEC_EXT_V16QI, - IX86_BUILTIN_VEC_SET_V2DI, - IX86_BUILTIN_VEC_SET_V4SF, - IX86_BUILTIN_VEC_SET_V4SI, - IX86_BUILTIN_VEC_SET_V8HI, - IX86_BUILTIN_VEC_SET_V4HI, - IX86_BUILTIN_VEC_SET_V16QI, - IX86_BUILTIN_GATHERSIV2DF, - IX86_BUILTIN_GATHERSIV4DF, - IX86_BUILTIN_GATHERDIV2DF, - IX86_BUILTIN_GATHERDIV4DF, - IX86_BUILTIN_GATHERSIV4SF, - IX86_BUILTIN_GATHERSIV8SF, - IX86_BUILTIN_GATHERDIV4SF, - IX86_BUILTIN_GATHERDIV8SF, - IX86_BUILTIN_GATHERSIV2DI, - IX86_BUILTIN_GATHERSIV4DI, - IX86_BUILTIN_GATHERDIV2DI, - IX86_BUILTIN_GATHERDIV4DI, - IX86_BUILTIN_GATHERSIV4SI, - IX86_BUILTIN_GATHERSIV8SI, - IX86_BUILTIN_GATHERDIV4SI, - IX86_BUILTIN_GATHERDIV8SI, - IX86_BUILTIN_GATHER3SIV8SF, - IX86_BUILTIN_GATHER3SIV4SF, - IX86_BUILTIN_GATHER3SIV4DF, - IX86_BUILTIN_GATHER3SIV2DF, - IX86_BUILTIN_GATHER3DIV8SF, - IX86_BUILTIN_GATHER3DIV4SF, - IX86_BUILTIN_GATHER3DIV4DF, - IX86_BUILTIN_GATHER3DIV2DF, - IX86_BUILTIN_GATHER3SIV8SI, - IX86_BUILTIN_GATHER3SIV4SI, - IX86_BUILTIN_GATHER3SIV4DI, - IX86_BUILTIN_GATHER3SIV2DI, - IX86_BUILTIN_GATHER3DIV8SI, - IX86_BUILTIN_GATHER3DIV4SI, - IX86_BUILTIN_GATHER3DIV4DI, - IX86_BUILTIN_GATHER3DIV2DI, - IX86_BUILTIN_SCATTERSIV8SF, - IX86_BUILTIN_SCATTERSIV4SF, - IX86_BUILTIN_SCATTERSIV4DF, - IX86_BUILTIN_SCATTERSIV2DF, - IX86_BUILTIN_SCATTERDIV8SF, - IX86_BUILTIN_SCATTERDIV4SF, - IX86_BUILTIN_SCATTERDIV4DF, - IX86_BUILTIN_SCATTERDIV2DF, - IX86_BUILTIN_SCATTERSIV8SI, - IX86_BUILTIN_SCATTERSIV4SI, - IX86_BUILTIN_SCATTERSIV4DI, - IX86_BUILTIN_SCATTERSIV2DI, - IX86_BUILTIN_SCATTERDIV8SI, - IX86_BUILTIN_SCATTERDIV4SI, - IX86_BUILTIN_SCATTERDIV4DI, - IX86_BUILTIN_SCATTERDIV2DI, - /* Alternate 4 and 8 element gather/scatter for the vectorizer - where all operands are 32-byte or 64-byte wide respectively. 
*/ - IX86_BUILTIN_GATHERALTSIV4DF, - IX86_BUILTIN_GATHERALTDIV8SF, - IX86_BUILTIN_GATHERALTSIV4DI, - IX86_BUILTIN_GATHERALTDIV8SI, - IX86_BUILTIN_GATHER3ALTDIV16SF, - IX86_BUILTIN_GATHER3ALTDIV16SI, - IX86_BUILTIN_GATHER3ALTSIV4DF, - IX86_BUILTIN_GATHER3ALTDIV8SF, - IX86_BUILTIN_GATHER3ALTSIV4DI, - IX86_BUILTIN_GATHER3ALTDIV8SI, - IX86_BUILTIN_GATHER3ALTSIV8DF, - IX86_BUILTIN_GATHER3ALTSIV8DI, - IX86_BUILTIN_GATHER3DIV16SF, - IX86_BUILTIN_GATHER3DIV16SI, - IX86_BUILTIN_GATHER3DIV8DF, - IX86_BUILTIN_GATHER3DIV8DI, - IX86_BUILTIN_GATHER3SIV16SF, - IX86_BUILTIN_GATHER3SIV16SI, - IX86_BUILTIN_GATHER3SIV8DF, - IX86_BUILTIN_GATHER3SIV8DI, - IX86_BUILTIN_SCATTERALTSIV8DF, - IX86_BUILTIN_SCATTERALTDIV16SF, - IX86_BUILTIN_SCATTERALTSIV8DI, - IX86_BUILTIN_SCATTERALTDIV16SI, - IX86_BUILTIN_SCATTERALTSIV4DF, - IX86_BUILTIN_SCATTERALTDIV8SF, - IX86_BUILTIN_SCATTERALTSIV4DI, - IX86_BUILTIN_SCATTERALTDIV8SI, - IX86_BUILTIN_SCATTERALTSIV2DF, - IX86_BUILTIN_SCATTERALTDIV4SF, - IX86_BUILTIN_SCATTERALTSIV2DI, - IX86_BUILTIN_SCATTERALTDIV4SI, - IX86_BUILTIN_SCATTERDIV16SF, - IX86_BUILTIN_SCATTERDIV16SI, - IX86_BUILTIN_SCATTERDIV8DF, - IX86_BUILTIN_SCATTERDIV8DI, - IX86_BUILTIN_SCATTERSIV16SF, - IX86_BUILTIN_SCATTERSIV16SI, - IX86_BUILTIN_SCATTERSIV8DF, - IX86_BUILTIN_SCATTERSIV8DI, - IX86_BUILTIN_GATHERPFQPD, - IX86_BUILTIN_GATHERPFDPS, - IX86_BUILTIN_GATHERPFDPD, - IX86_BUILTIN_GATHERPFQPS, - IX86_BUILTIN_SCATTERPFDPD, - IX86_BUILTIN_SCATTERPFDPS, - IX86_BUILTIN_SCATTERPFQPD, - IX86_BUILTIN_SCATTERPFQPS, - IX86_BUILTIN_CLWB, - IX86_BUILTIN_CLFLUSHOPT, - IX86_BUILTIN_INFQ, - IX86_BUILTIN_HUGE_VALQ, - IX86_BUILTIN_NANQ, - IX86_BUILTIN_NANSQ, - IX86_BUILTIN_XABORT, - IX86_BUILTIN_ADDCARRYX32, - IX86_BUILTIN_ADDCARRYX64, - IX86_BUILTIN_SBB32, - IX86_BUILTIN_SBB64, - IX86_BUILTIN_RDRAND16_STEP, - IX86_BUILTIN_RDRAND32_STEP, - IX86_BUILTIN_RDRAND64_STEP, - IX86_BUILTIN_RDSEED16_STEP, - IX86_BUILTIN_RDSEED32_STEP, - IX86_BUILTIN_RDSEED64_STEP, - IX86_BUILTIN_MONITORX, - IX86_BUILTIN_MWAITX, - IX86_BUILTIN_CFSTRING, - IX86_BUILTIN_CPU_INIT, - IX86_BUILTIN_CPU_IS, - IX86_BUILTIN_CPU_SUPPORTS, - IX86_BUILTIN_READ_FLAGS, - IX86_BUILTIN_WRITE_FLAGS, - - /* All the remaining builtins are tracked in bdesc_* arrays in - i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after - this point. */ -#define BDESC(mask, mask2, icode, name, code, comparison, flag) \ - code, -#define BDESC_FIRST(kind, kindu, mask, mask2, icode, name, code, comparison, flag) \ - code, \ - IX86_BUILTIN__BDESC_##kindu##_FIRST = code, -#define BDESC_END(kind, next_kind) - -#include "i386-builtin.def" - -#undef BDESC -#undef BDESC_FIRST -#undef BDESC_END - - IX86_BUILTIN_MAX, - - IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX, - - /* Now just the aliases for bdesc_* start/end. */ -#define BDESC(mask, mask2, icode, name, code, comparison, flag) -#define BDESC_FIRST(kind, kindu, mask, mask2, icode, name, code, comparison, flag) -#define BDESC_END(kind, next_kind) \ - IX86_BUILTIN__BDESC_##kind##_LAST \ - = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1, - -#include "i386-builtin.def" - -#undef BDESC -#undef BDESC_FIRST -#undef BDESC_END - - /* Just to make sure there is no comma after the last enumerator. */ - IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST -}; - -/* Table for the ix86 builtin decls. */ -static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX]; - -/* Table of all of the builtin functions that are possible with different ISA's - but are waiting to be built until a function is declared to use that - ISA. 
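   (Editorial usage illustration, not part of GCC; the user function below is
   hypothetical.)  The deferral scheme described above for def_builtin in
   practice: in a translation unit compiled without -mavx2, the AVX2 builtins
   are only recorded in ix86_builtins_isa, and ix86_add_new_builtins later
   materializes them when the ISA is enabled per function, so an intrinsic that
   maps onto one of the gather builtins registered further down still works:

     #include <immintrin.h>

     __attribute__((target ("avx2")))
     __m256 gather8 (const float *base, __m256i idx)
     {
       /* Expands to __builtin_ia32_gathersiv8sf, deferred until the avx2
          target attribute enabled it for this function.  */
       return _mm256_i32gather_ps (base, idx, 4);
     }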
*/ -struct builtin_isa { - HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */ - HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */ - const char *name; /* function name */ - enum ix86_builtin_func_type tcode; /* type to use in the declaration */ - unsigned char const_p:1; /* true if the declaration is constant */ - unsigned char pure_p:1; /* true if the declaration has pure attribute */ - bool set_and_not_built_p; -}; - -static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX]; - -/* Bits that can still enable any inclusion of a builtin. */ -static HOST_WIDE_INT deferred_isa_values = 0; -static HOST_WIDE_INT deferred_isa_values2 = 0; - -/* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the - MASK and MASK2 of which isa_flags and ix86_isa_flags2 to use in the - ix86_builtins_isa array. Stores the function decl in the ix86_builtins - array. Returns the function decl or NULL_TREE, if the builtin was not - added. - - If the front end has a special hook for builtin functions, delay adding - builtin functions that aren't in the current ISA until the ISA is changed - with function specific optimization. Doing so, can save about 300K for the - default compiler. When the builtin is expanded, check at that time whether - it is valid. - - If the front end doesn't have a special hook, record all builtins, even if - it isn't an instruction set in the current ISA in case the user uses - function specific options for a different ISA, so that we don't get scope - errors if a builtin is added in the middle of a function scope. */ - -static inline tree -def_builtin (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, - const char *name, - enum ix86_builtin_func_type tcode, - enum ix86_builtins code) -{ - tree decl = NULL_TREE; - - /* An instruction may be 64bit only regardless of ISAs. */ - if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT) - { - ix86_builtins_isa[(int) code].isa = mask; - ix86_builtins_isa[(int) code].isa2 = mask2; - - mask &= ~OPTION_MASK_ISA_64BIT; - - /* Filter out the masks most often ored together with others. */ - if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL) - && mask != OPTION_MASK_ISA_AVX512VL) - mask &= ~OPTION_MASK_ISA_AVX512VL; - if ((mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512BW) - && mask != OPTION_MASK_ISA_AVX512BW) - mask &= ~OPTION_MASK_ISA_AVX512BW; - - if (((mask2 == 0 || (mask2 & ix86_isa_flags2) != 0) - && (mask == 0 || (mask & ix86_isa_flags) != 0)) - || (lang_hooks.builtin_function - == lang_hooks.builtin_function_ext_scope)) - { - tree type = ix86_get_builtin_func_type (tcode); - decl = add_builtin_function (name, type, code, BUILT_IN_MD, - NULL, NULL_TREE); - ix86_builtins[(int) code] = decl; - ix86_builtins_isa[(int) code].set_and_not_built_p = false; - } - else - { - /* Just MASK and MASK2 where set_and_not_built_p == true can potentially - include a builtin. */ - deferred_isa_values |= mask; - deferred_isa_values2 |= mask2; - ix86_builtins[(int) code] = NULL_TREE; - ix86_builtins_isa[(int) code].tcode = tcode; - ix86_builtins_isa[(int) code].name = name; - ix86_builtins_isa[(int) code].const_p = false; - ix86_builtins_isa[(int) code].pure_p = false; - ix86_builtins_isa[(int) code].set_and_not_built_p = true; - } - } - - return decl; -} - -/* Like def_builtin, but also marks the function decl "const". 
*/ - -static inline tree -def_builtin_const (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, const char *name, - enum ix86_builtin_func_type tcode, enum ix86_builtins code) -{ - tree decl = def_builtin (mask, mask2, name, tcode, code); - if (decl) - TREE_READONLY (decl) = 1; - else - ix86_builtins_isa[(int) code].const_p = true; - - return decl; -} - -/* Like def_builtin, but also marks the function decl "pure". */ - -static inline tree -def_builtin_pure (HOST_WIDE_INT mask, HOST_WIDE_INT mask2, const char *name, - enum ix86_builtin_func_type tcode, enum ix86_builtins code) -{ - tree decl = def_builtin (mask, mask2, name, tcode, code); - if (decl) - DECL_PURE_P (decl) = 1; - else - ix86_builtins_isa[(int) code].pure_p = true; - - return decl; -} - -/* Add any new builtin functions for a given ISA that may not have been - declared. This saves a bit of space compared to adding all of the - declarations to the tree, even if we didn't use them. */ - -static void -ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2) -{ - isa &= ~OPTION_MASK_ISA_64BIT; - - if ((isa & deferred_isa_values) == 0 - && (isa2 & deferred_isa_values2) == 0) - return; - - /* Bits in ISA value can be removed from potential isa values. */ - deferred_isa_values &= ~isa; - deferred_isa_values2 &= ~isa2; - - int i; - tree saved_current_target_pragma = current_target_pragma; - current_target_pragma = NULL_TREE; - - for (i = 0; i < (int)IX86_BUILTIN_MAX; i++) - { - if (((ix86_builtins_isa[i].isa & isa) != 0 - || (ix86_builtins_isa[i].isa2 & isa2) != 0) - && ix86_builtins_isa[i].set_and_not_built_p) - { - tree decl, type; - - /* Don't define the builtin again. */ - ix86_builtins_isa[i].set_and_not_built_p = false; - - type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode); - decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name, - type, i, BUILT_IN_MD, NULL, - NULL_TREE); - - ix86_builtins[i] = decl; - if (ix86_builtins_isa[i].const_p) - TREE_READONLY (decl) = 1; - } - } - - current_target_pragma = saved_current_target_pragma; -} - -/* Bits for builtin_description.flag. */ - -/* Set when we don't support the comparison natively, and should - swap_comparison in order to support it. 
*/ -#define BUILTIN_DESC_SWAP_OPERANDS 1 - -struct builtin_description -{ - const HOST_WIDE_INT mask; - const HOST_WIDE_INT mask2; - const enum insn_code icode; - const char *const name; - const enum ix86_builtins code; - const enum rtx_code comparison; - const int flag; -}; - -#define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT -#define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT -#define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT -#define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT -#define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF -#define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF -#define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF -#define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF -#define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI -#define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI -#define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI -#define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI -#define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI -#define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI -#define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI -#define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI -#define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI -#define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI -#define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF -#define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF -#define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI -#define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI -#define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI -#define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI -#define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI -#define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI -#define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI -#define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI -#define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP -#define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP -#define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP -#define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP -#define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF -#define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF -#define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF -#define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF -#define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF -#define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF -#define MULTI_ARG_1_SF V4SF_FTYPE_V4SF -#define MULTI_ARG_1_DF V2DF_FTYPE_V2DF -#define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF -#define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF -#define MULTI_ARG_1_DI V2DI_FTYPE_V2DI -#define MULTI_ARG_1_SI V4SI_FTYPE_V4SI -#define MULTI_ARG_1_HI V8HI_FTYPE_V8HI -#define MULTI_ARG_1_QI V16QI_FTYPE_V16QI -#define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI -#define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI -#define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI -#define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI -#define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI -#define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI - -#define BDESC(mask, mask2, icode, name, code, comparison, flag) \ - { mask, mask2, icode, name, code, comparison, flag }, -#define BDESC_FIRST(kind, kindu, mask, mask2, icode, name, code, comparison, flag) \ -static const struct builtin_description bdesc_##kind[] = \ -{ \ - BDESC (mask, mask2, icode, name, code, comparison, flag) -#define BDESC_END(kind, next_kind) \ -}; - -#include "i386-builtin.def" - -#undef BDESC -#undef BDESC_FIRST -#undef BDESC_END - - -/* TM vector builtins. */ - -/* Reuse the existing x86-specific `struct builtin_description' cause - we're lazy. Add casts to make them fit. 
*/ -static const struct builtin_description bdesc_tm[] = -{ - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, - - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, - - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, - - { OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID }, - { OPTION_MASK_ISA_SSE, 0, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID }, - { OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, 
"__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID }, -}; - -/* Initialize the transactional memory vector load/store builtins. */ - -static void -ix86_init_tm_builtins (void) -{ - enum ix86_builtin_func_type ftype; - const struct builtin_description *d; - size_t i; - tree decl; - tree attrs_load, attrs_type_load, attrs_store, attrs_type_store; - tree attrs_log, attrs_type_log; - - if (!flag_tm) - return; - - /* If there are no builtins defined, we must be compiling in a - language without trans-mem support. */ - if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1)) - return; - - /* Use whatever attributes a normal TM load has. */ - decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1); - attrs_load = DECL_ATTRIBUTES (decl); - attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl)); - /* Use whatever attributes a normal TM store has. */ - decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1); - attrs_store = DECL_ATTRIBUTES (decl); - attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl)); - /* Use whatever attributes a normal TM log has. */ - decl = builtin_decl_explicit (BUILT_IN_TM_LOG); - attrs_log = DECL_ATTRIBUTES (decl); - attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl)); - - for (i = 0, d = bdesc_tm; - i < ARRAY_SIZE (bdesc_tm); - i++, d++) - { - if ((d->mask & ix86_isa_flags) != 0 - || (lang_hooks.builtin_function - == lang_hooks.builtin_function_ext_scope)) - { - tree type, attrs, attrs_type; - enum built_in_function code = (enum built_in_function) d->code; - - ftype = (enum ix86_builtin_func_type) d->flag; - type = ix86_get_builtin_func_type (ftype); - - if (BUILTIN_TM_LOAD_P (code)) - { - attrs = attrs_load; - attrs_type = attrs_type_load; - } - else if (BUILTIN_TM_STORE_P (code)) - { - attrs = attrs_store; - attrs_type = attrs_type_store; - } - else - { - attrs = attrs_log; - attrs_type = attrs_type_log; - } - decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL, - /* The builtin without the prefix for - calling it directly. */ - d->name + strlen ("__builtin_"), - attrs); - /* add_builtin_function() will set the DECL_ATTRIBUTES, now - set the TYPE_ATTRIBUTES. */ - decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN); - - set_builtin_decl (code, decl, false); - } - } -} - -/* Macros for verification of enum ix86_builtins order. 
*/ -#define BDESC_VERIFY(x, y, z) \ - gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z))) -#define BDESC_VERIFYS(x, y, z) \ - STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z))) - -BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST, - IX86_BUILTIN__BDESC_COMI_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST, - IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, - IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST, - IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, - IX86_BUILTIN__BDESC_ARGS_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, - IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST, - IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, - IX86_BUILTIN__BDESC_CET_LAST, 1); -BDESC_VERIFYS (IX86_BUILTIN_MAX, - IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1); - -/* Set up all the MMX/SSE builtins, even builtins for instructions that are not - in the current target ISA to allow the user to compile particular modules - with different target specific options that differ from the command line - options. */ -static void -ix86_init_mmx_sse_builtins (void) -{ - const struct builtin_description * d; - enum ix86_builtin_func_type ftype; - size_t i; - - /* Add all special builtins with variable number of operands. */ - for (i = 0, d = bdesc_special_args; - i < ARRAY_SIZE (bdesc_special_args); - i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i); - if (d->name == 0) - continue; - - ftype = (enum ix86_builtin_func_type) d->flag; - def_builtin (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, - ARRAY_SIZE (bdesc_special_args) - 1); - - /* Add all builtins with variable number of operands. */ - for (i = 0, d = bdesc_args; - i < ARRAY_SIZE (bdesc_args); - i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i); - if (d->name == 0) - continue; - - ftype = (enum ix86_builtin_func_type) d->flag; - def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST, - IX86_BUILTIN__BDESC_ARGS_FIRST, - ARRAY_SIZE (bdesc_args) - 1); - - /* Add all builtins with rounding. */ - for (i = 0, d = bdesc_round_args; - i < ARRAY_SIZE (bdesc_round_args); - i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i); - if (d->name == 0) - continue; - - ftype = (enum ix86_builtin_func_type) d->flag; - def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, - ARRAY_SIZE (bdesc_round_args) - 1); - - /* pcmpestr[im] insns. */ - for (i = 0, d = bdesc_pcmpestr; - i < ARRAY_SIZE (bdesc_pcmpestr); - i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i); - if (d->code == IX86_BUILTIN_PCMPESTRM128) - ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT; - else - ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT; - def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST, - IX86_BUILTIN__BDESC_PCMPESTR_FIRST, - ARRAY_SIZE (bdesc_pcmpestr) - 1); - - /* pcmpistr[im] insns. 
*/ - for (i = 0, d = bdesc_pcmpistr; - i < ARRAY_SIZE (bdesc_pcmpistr); - i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i); - if (d->code == IX86_BUILTIN_PCMPISTRM128) - ftype = V16QI_FTYPE_V16QI_V16QI_INT; - else - ftype = INT_FTYPE_V16QI_V16QI_INT; - def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST, - IX86_BUILTIN__BDESC_PCMPISTR_FIRST, - ARRAY_SIZE (bdesc_pcmpistr) - 1); - - /* comi/ucomi insns. */ - for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i); - if (d->mask == OPTION_MASK_ISA_SSE2) - ftype = INT_FTYPE_V2DF_V2DF; - else - ftype = INT_FTYPE_V4SF_V4SF; - def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST, - IX86_BUILTIN__BDESC_COMI_FIRST, - ARRAY_SIZE (bdesc_comi) - 1); - - /* SSE */ - def_builtin (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_ldmxcsr", - VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR); - def_builtin_pure (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_stmxcsr", - UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR); - - /* SSE or 3DNow!A */ - def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A - /* As it uses V4HImode, we have to require -mmmx too. */ - | OPTION_MASK_ISA_MMX, 0, - "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR, - IX86_BUILTIN_MASKMOVQ); - - /* SSE2 */ - def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_maskmovdqu", - VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU); - - def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_clflush", - VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH); - x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_mfence", - VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE); - - /* SSE3. 
*/ - def_builtin (OPTION_MASK_ISA_SSE3, 0, "__builtin_ia32_monitor", - VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR); - def_builtin (OPTION_MASK_ISA_SSE3, 0, "__builtin_ia32_mwait", - VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT); - - /* AES */ - def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, - "__builtin_ia32_aesenc128", - V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128); - def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, - "__builtin_ia32_aesenclast128", - V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128); - def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, - "__builtin_ia32_aesdec128", - V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128); - def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, - "__builtin_ia32_aesdeclast128", - V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128); - def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, - "__builtin_ia32_aesimc128", - V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128); - def_builtin_const (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2, 0, - "__builtin_ia32_aeskeygenassist128", - V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128); - - /* PCLMUL */ - def_builtin_const (OPTION_MASK_ISA_PCLMUL | OPTION_MASK_ISA_SSE2, 0, - "__builtin_ia32_pclmulqdq128", - V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128); - - /* RDRND */ - def_builtin (OPTION_MASK_ISA_RDRND, 0, "__builtin_ia32_rdrand16_step", - INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP); - def_builtin (OPTION_MASK_ISA_RDRND, 0, "__builtin_ia32_rdrand32_step", - INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP); - def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT, 0, - "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG, - IX86_BUILTIN_RDRAND64_STEP); - - /* AVX2 */ - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv2df", - V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT, - IX86_BUILTIN_GATHERSIV2DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4df", - V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT, - IX86_BUILTIN_GATHERSIV4DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv2df", - V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT, - IX86_BUILTIN_GATHERDIV2DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4df", - V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT, - IX86_BUILTIN_GATHERDIV4DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4sf", - V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT, - IX86_BUILTIN_GATHERSIV4SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv8sf", - V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT, - IX86_BUILTIN_GATHERSIV8SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4sf", - V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT, - IX86_BUILTIN_GATHERDIV4SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4sf256", - V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT, - IX86_BUILTIN_GATHERDIV8SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv2di", - V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT, - IX86_BUILTIN_GATHERSIV2DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4di", - V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT, - IX86_BUILTIN_GATHERSIV4DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv2di", - V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT, - IX86_BUILTIN_GATHERDIV2DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4di", - V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT, - 
IX86_BUILTIN_GATHERDIV4DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv4si", - V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT, - IX86_BUILTIN_GATHERSIV4SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gathersiv8si", - V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT, - IX86_BUILTIN_GATHERSIV8SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4si", - V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT, - IX86_BUILTIN_GATHERDIV4SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatherdiv4si256", - V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT, - IX86_BUILTIN_GATHERDIV8SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltsiv4df ", - V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT, - IX86_BUILTIN_GATHERALTSIV4DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltdiv8sf ", - V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT, - IX86_BUILTIN_GATHERALTDIV8SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltsiv4di ", - V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT, - IX86_BUILTIN_GATHERALTSIV4DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX2, 0, "__builtin_ia32_gatheraltdiv8si ", - V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT, - IX86_BUILTIN_GATHERALTDIV8SI); - - /* AVX512F */ - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv16sf", - V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT, - IX86_BUILTIN_GATHER3SIV16SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv8df", - V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT, - IX86_BUILTIN_GATHER3SIV8DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv16sf", - V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT, - IX86_BUILTIN_GATHER3DIV16SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv8df", - V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT, - IX86_BUILTIN_GATHER3DIV8DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv16si", - V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT, - IX86_BUILTIN_GATHER3SIV16SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gathersiv8di", - V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT, - IX86_BUILTIN_GATHER3SIV8DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv16si", - V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT, - IX86_BUILTIN_GATHER3DIV16SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gatherdiv8di", - V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT, - IX86_BUILTIN_GATHER3DIV8DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altsiv8df ", - V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT, - IX86_BUILTIN_GATHER3ALTSIV8DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altdiv16sf ", - V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT, - IX86_BUILTIN_GATHER3ALTDIV16SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altsiv8di ", - V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT, - IX86_BUILTIN_GATHER3ALTSIV8DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_gather3altdiv16si ", - V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT, - IX86_BUILTIN_GATHER3ALTDIV16SI); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv16sf", - VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT, - IX86_BUILTIN_SCATTERSIV16SF); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv8df", - VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT, - IX86_BUILTIN_SCATTERSIV8DF); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv16sf", - VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT, - 
IX86_BUILTIN_SCATTERDIV16SF); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv8df", - VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT, - IX86_BUILTIN_SCATTERDIV8DF); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv16si", - VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT, - IX86_BUILTIN_SCATTERSIV16SI); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scattersiv8di", - VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT, - IX86_BUILTIN_SCATTERSIV8DI); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv16si", - VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT, - IX86_BUILTIN_SCATTERDIV16SI); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatterdiv8di", - VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT, - IX86_BUILTIN_SCATTERDIV8DI); - - /* AVX512VL */ - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv2df", - V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT, - IX86_BUILTIN_GATHER3SIV2DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4df", - V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT, - IX86_BUILTIN_GATHER3SIV4DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div2df", - V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT, - IX86_BUILTIN_GATHER3DIV2DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4df", - V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT, - IX86_BUILTIN_GATHER3DIV4DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4sf", - V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT, - IX86_BUILTIN_GATHER3SIV4SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv8sf", - V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT, - IX86_BUILTIN_GATHER3SIV8SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4sf", - V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT, - IX86_BUILTIN_GATHER3DIV4SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div8sf", - V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT, - IX86_BUILTIN_GATHER3DIV8SF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv2di", - V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT, - IX86_BUILTIN_GATHER3SIV2DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4di", - V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT, - IX86_BUILTIN_GATHER3SIV4DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div2di", - V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT, - IX86_BUILTIN_GATHER3DIV2DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4di", - V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT, - IX86_BUILTIN_GATHER3DIV4DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv4si", - V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT, - IX86_BUILTIN_GATHER3SIV4SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3siv8si", - V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT, - IX86_BUILTIN_GATHER3SIV8SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div4si", - V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT, - IX86_BUILTIN_GATHER3DIV4SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3div8si", - V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT, - IX86_BUILTIN_GATHER3DIV8SI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altsiv4df ", - V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT, - IX86_BUILTIN_GATHER3ALTSIV4DF); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altdiv8sf ", - V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT, - IX86_BUILTIN_GATHER3ALTDIV8SF); - - def_builtin_pure 
(OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altsiv4di ", - V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT, - IX86_BUILTIN_GATHER3ALTSIV4DI); - - def_builtin_pure (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_gather3altdiv8si ", - V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT, - IX86_BUILTIN_GATHER3ALTDIV8SI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv8sf", - VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT, - IX86_BUILTIN_SCATTERSIV8SF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4sf", - VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT, - IX86_BUILTIN_SCATTERSIV4SF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4df", - VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT, - IX86_BUILTIN_SCATTERSIV4DF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv2df", - VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT, - IX86_BUILTIN_SCATTERSIV2DF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv8sf", - VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT, - IX86_BUILTIN_SCATTERDIV8SF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4sf", - VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT, - IX86_BUILTIN_SCATTERDIV4SF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4df", - VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT, - IX86_BUILTIN_SCATTERDIV4DF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv2df", - VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT, - IX86_BUILTIN_SCATTERDIV2DF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv8si", - VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT, - IX86_BUILTIN_SCATTERSIV8SI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4si", - VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT, - IX86_BUILTIN_SCATTERSIV4SI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv4di", - VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT, - IX86_BUILTIN_SCATTERSIV4DI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scattersiv2di", - VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT, - IX86_BUILTIN_SCATTERSIV2DI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv8si", - VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT, - IX86_BUILTIN_SCATTERDIV8SI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4si", - VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT, - IX86_BUILTIN_SCATTERDIV4SI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv4di", - VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT, - IX86_BUILTIN_SCATTERDIV4DI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatterdiv2di", - VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT, - IX86_BUILTIN_SCATTERDIV2DI); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltsiv8df ", - VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT, - IX86_BUILTIN_SCATTERALTSIV8DF); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltdiv16sf ", - VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT, - IX86_BUILTIN_SCATTERALTDIV16SF); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltsiv8di ", - VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT, - IX86_BUILTIN_SCATTERALTSIV8DI); - - def_builtin (OPTION_MASK_ISA_AVX512F, 0, "__builtin_ia32_scatteraltdiv16si ", - VOID_FTYPE_PINT_HI_V8DI_V16SI_INT, - IX86_BUILTIN_SCATTERALTDIV16SI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv4df ", - VOID_FTYPE_PDOUBLE_QI_V8SI_V4DF_INT, - IX86_BUILTIN_SCATTERALTSIV4DF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv8sf ", - VOID_FTYPE_PFLOAT_QI_V4DI_V8SF_INT, - 
IX86_BUILTIN_SCATTERALTDIV8SF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv4di ", - VOID_FTYPE_PLONGLONG_QI_V8SI_V4DI_INT, - IX86_BUILTIN_SCATTERALTSIV4DI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv8si ", - VOID_FTYPE_PINT_QI_V4DI_V8SI_INT, - IX86_BUILTIN_SCATTERALTDIV8SI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv2df ", - VOID_FTYPE_PDOUBLE_QI_V4SI_V2DF_INT, - IX86_BUILTIN_SCATTERALTSIV2DF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv4sf ", - VOID_FTYPE_PFLOAT_QI_V2DI_V4SF_INT, - IX86_BUILTIN_SCATTERALTDIV4SF); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltsiv2di ", - VOID_FTYPE_PLONGLONG_QI_V4SI_V2DI_INT, - IX86_BUILTIN_SCATTERALTSIV2DI); - - def_builtin (OPTION_MASK_ISA_AVX512VL, 0, "__builtin_ia32_scatteraltdiv4si ", - VOID_FTYPE_PINT_QI_V2DI_V4SI_INT, - IX86_BUILTIN_SCATTERALTDIV4SI); - - /* AVX512PF */ - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfdpd", - VOID_FTYPE_QI_V8SI_PCVOID_INT_INT, - IX86_BUILTIN_GATHERPFDPD); - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfdps", - VOID_FTYPE_HI_V16SI_PCVOID_INT_INT, - IX86_BUILTIN_GATHERPFDPS); - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfqpd", - VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, - IX86_BUILTIN_GATHERPFQPD); - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_gatherpfqps", - VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, - IX86_BUILTIN_GATHERPFQPS); - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfdpd", - VOID_FTYPE_QI_V8SI_PCVOID_INT_INT, - IX86_BUILTIN_SCATTERPFDPD); - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfdps", - VOID_FTYPE_HI_V16SI_PCVOID_INT_INT, - IX86_BUILTIN_SCATTERPFDPS); - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfqpd", - VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, - IX86_BUILTIN_SCATTERPFQPD); - def_builtin (OPTION_MASK_ISA_AVX512PF, 0, "__builtin_ia32_scatterpfqps", - VOID_FTYPE_QI_V8DI_PCVOID_INT_INT, - IX86_BUILTIN_SCATTERPFQPS); - - /* SHA */ - def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1msg1", - V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1); - def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1msg2", - V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2); - def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1nexte", - V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE); - def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha1rnds4", - V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4); - def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256msg1", - V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1); - def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256msg2", - V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2); - def_builtin_const (OPTION_MASK_ISA_SHA, 0, "__builtin_ia32_sha256rnds2", - V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2); - - /* RTM. */ - def_builtin (OPTION_MASK_ISA_RTM, 0, "__builtin_ia32_xabort", - VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT); - - /* MMX access to the vec_init patterns. 
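The gather and scatter builtins registered above are the internal entry points behind the <immintrin.h> gather intrinsics; each takes a source/merge vector, a base pointer, an index vector, a mask and a scale. A minimal usage sketch, assuming GCC with -mavx2 and the standard _mm256_i32gather_pd wrapper (which is implemented on top of the gathersiv4df builtin registered above):

    #include <immintrin.h>

    /* Gather table[idx[0]], ..., table[idx[3]] into one __m256d.  */
    static __m256d
    gather_four_doubles (const double *table, const int *idx)
    {
      __m128i vindex = _mm_loadu_si128 ((const __m128i *) idx);
      return _mm256_i32gather_pd (table, vindex, 8 /* scale: sizeof (double) */);
    }
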
*/ - def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_init_v2si", - V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI); - - def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_init_v4hi", - V4HI_FTYPE_HI_HI_HI_HI, - IX86_BUILTIN_VEC_INIT_V4HI); - - def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_init_v8qi", - V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI, - IX86_BUILTIN_VEC_INIT_V8QI); - - /* Access to the vec_extract patterns. */ - def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v2df", - DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF); - def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v2di", - DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI); - def_builtin_const (OPTION_MASK_ISA_SSE, 0, "__builtin_ia32_vec_ext_v4sf", - FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF); - def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v4si", - SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI); - def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v8hi", - HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI); - - def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A - /* As it uses V4HImode, we have to require -mmmx too. */ - | OPTION_MASK_ISA_MMX, 0, - "__builtin_ia32_vec_ext_v4hi", - HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI); - - def_builtin_const (OPTION_MASK_ISA_MMX, 0, "__builtin_ia32_vec_ext_v2si", - SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI); - - def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_ext_v16qi", - QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI); - - /* Access to the vec_set patterns. */ - def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT, 0, - "__builtin_ia32_vec_set_v2di", - V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI); - - def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v4sf", - V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF); - - def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v4si", - V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI); - - def_builtin_const (OPTION_MASK_ISA_SSE2, 0, "__builtin_ia32_vec_set_v8hi", - V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI); - - def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A - /* As it uses V4HImode, we have to require -mmmx too. 
*/ - | OPTION_MASK_ISA_MMX, 0, - "__builtin_ia32_vec_set_v4hi", - V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI); - - def_builtin_const (OPTION_MASK_ISA_SSE4_1, 0, "__builtin_ia32_vec_set_v16qi", - V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI); - - /* RDSEED */ - def_builtin (OPTION_MASK_ISA_RDSEED, 0, "__builtin_ia32_rdseed_hi_step", - INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP); - def_builtin (OPTION_MASK_ISA_RDSEED, 0, "__builtin_ia32_rdseed_si_step", - INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP); - def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT, 0, - "__builtin_ia32_rdseed_di_step", - INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP); - - /* ADCX */ - def_builtin (0, 0, "__builtin_ia32_addcarryx_u32", - UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32); - def_builtin (OPTION_MASK_ISA_64BIT, 0, - "__builtin_ia32_addcarryx_u64", - UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG, - IX86_BUILTIN_ADDCARRYX64); - - /* SBB */ - def_builtin (0, 0, "__builtin_ia32_sbb_u32", - UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32); - def_builtin (OPTION_MASK_ISA_64BIT, 0, - "__builtin_ia32_sbb_u64", - UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG, - IX86_BUILTIN_SBB64); - - /* Read/write FLAGS. */ - if (TARGET_64BIT) - { - def_builtin (OPTION_MASK_ISA_64BIT, 0, "__builtin_ia32_readeflags_u64", - UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS); - def_builtin (OPTION_MASK_ISA_64BIT, 0, "__builtin_ia32_writeeflags_u64", - VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS); - } - else - { - def_builtin (0, 0, "__builtin_ia32_readeflags_u32", - UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS); - def_builtin (0, 0, "__builtin_ia32_writeeflags_u32", - VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS); - } - - /* CLFLUSHOPT. */ - def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, 0, "__builtin_ia32_clflushopt", - VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT); - - /* CLWB. */ - def_builtin (OPTION_MASK_ISA_CLWB, 0, "__builtin_ia32_clwb", - VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB); - - /* MONITORX and MWAITX. */ - def_builtin (0, OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx", - VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX); - def_builtin (0, OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx", - VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX); - - /* CLZERO. */ - def_builtin (0, OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero", - VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO); - - /* WAITPKG. */ - def_builtin (0, OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_umonitor", - VOID_FTYPE_PVOID, IX86_BUILTIN_UMONITOR); - def_builtin (0, OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_umwait", - UINT8_FTYPE_UNSIGNED_UINT64, IX86_BUILTIN_UMWAIT); - def_builtin (0, OPTION_MASK_ISA_WAITPKG, "__builtin_ia32_tpause", - UINT8_FTYPE_UNSIGNED_UINT64, IX86_BUILTIN_TPAUSE); - - /* CLDEMOTE. */ - def_builtin (0, OPTION_MASK_ISA_CLDEMOTE, "__builtin_ia32_cldemote", - VOID_FTYPE_PCVOID, IX86_BUILTIN_CLDEMOTE); - - /* Add FMA4 multi-arg argument instructions */ - for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i); - if (d->name == 0) - continue; - - ftype = (enum ix86_builtin_func_type) d->flag; - def_builtin_const (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST, - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, - ARRAY_SIZE (bdesc_multi_arg) - 1); - - /* Add CET inrinsics. 
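The RDRND and RDSEED builtins above follow the *_step convention: the random value is stored through the pointer argument and the return value reflects the carry flag, so callers are expected to retry on failure. A hedged sketch of such a retry loop, assuming -mrdrnd and the <immintrin.h> wrapper _rdrand32_step:

    #include <immintrin.h>

    /* Retry RDRAND a bounded number of times; a nonzero return means *OUT
       holds a hardware random value.  */
    static int
    get_hw_random (unsigned int *out)
    {
      for (int i = 0; i < 10; i++)
        if (_rdrand32_step (out))
          return 1;
      return 0;
    }
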
*/ - for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i); - if (d->name == 0) - continue; - - ftype = (enum ix86_builtin_func_type) d->flag; - def_builtin (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST, - IX86_BUILTIN__BDESC_CET_FIRST, - ARRAY_SIZE (bdesc_cet) - 1); - - for (i = 0, d = bdesc_cet_rdssp; - i < ARRAY_SIZE (bdesc_cet_rdssp); - i++, d++) - { - BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i); - if (d->name == 0) - continue; - - ftype = (enum ix86_builtin_func_type) d->flag; - def_builtin (d->mask, d->mask2, d->name, ftype, d->code); - } - BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST, - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, - ARRAY_SIZE (bdesc_cet_rdssp) - 1); -} - -#undef BDESC_VERIFY -#undef BDESC_VERIFYS - -/* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL - to return a pointer to VERSION_DECL if the outcome of the expression - formed by PREDICATE_CHAIN is true. This function will be called during - version dispatch to decide which function version to execute. It returns - the basic block at the end, to which more conditions can be added. */ - -static basic_block -add_condition_to_bb (tree function_decl, tree version_decl, - tree predicate_chain, basic_block new_bb) -{ - gimple *return_stmt; - tree convert_expr, result_var; - gimple *convert_stmt; - gimple *call_cond_stmt; - gimple *if_else_stmt; - - basic_block bb1, bb2, bb3; - edge e12, e23; - - tree cond_var, and_expr_var = NULL_TREE; - gimple_seq gseq; - - tree predicate_decl, predicate_arg; - - push_cfun (DECL_STRUCT_FUNCTION (function_decl)); - - gcc_assert (new_bb != NULL); - gseq = bb_seq (new_bb); - - - convert_expr = build1 (CONVERT_EXPR, ptr_type_node, - build_fold_addr_expr (version_decl)); - result_var = create_tmp_var (ptr_type_node); - convert_stmt = gimple_build_assign (result_var, convert_expr); - return_stmt = gimple_build_return (result_var); - - if (predicate_chain == NULL_TREE) - { - gimple_seq_add_stmt (&gseq, convert_stmt); - gimple_seq_add_stmt (&gseq, return_stmt); - set_bb_seq (new_bb, gseq); - gimple_set_bb (convert_stmt, new_bb); - gimple_set_bb (return_stmt, new_bb); - pop_cfun (); - return new_bb; - } - - while (predicate_chain != NULL) - { - cond_var = create_tmp_var (integer_type_node); - predicate_decl = TREE_PURPOSE (predicate_chain); - predicate_arg = TREE_VALUE (predicate_chain); - call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg); - gimple_call_set_lhs (call_cond_stmt, cond_var); - - gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl)); - gimple_set_bb (call_cond_stmt, new_bb); - gimple_seq_add_stmt (&gseq, call_cond_stmt); - - predicate_chain = TREE_CHAIN (predicate_chain); - - if (and_expr_var == NULL) - and_expr_var = cond_var; - else - { - gimple *assign_stmt; - /* Use MIN_EXPR to check if any integer is zero?. 
- and_expr_var = min_expr */ - assign_stmt = gimple_build_assign (and_expr_var, - build2 (MIN_EXPR, integer_type_node, - cond_var, and_expr_var)); - - gimple_set_block (assign_stmt, DECL_INITIAL (function_decl)); - gimple_set_bb (assign_stmt, new_bb); - gimple_seq_add_stmt (&gseq, assign_stmt); - } - } - - if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var, - integer_zero_node, - NULL_TREE, NULL_TREE); - gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl)); - gimple_set_bb (if_else_stmt, new_bb); - gimple_seq_add_stmt (&gseq, if_else_stmt); - - gimple_seq_add_stmt (&gseq, convert_stmt); - gimple_seq_add_stmt (&gseq, return_stmt); - set_bb_seq (new_bb, gseq); - - bb1 = new_bb; - e12 = split_block (bb1, if_else_stmt); - bb2 = e12->dest; - e12->flags &= ~EDGE_FALLTHRU; - e12->flags |= EDGE_TRUE_VALUE; - - e23 = split_block (bb2, return_stmt); - - gimple_set_bb (convert_stmt, bb2); - gimple_set_bb (return_stmt, bb2); - - bb3 = e23->dest; - make_edge (bb1, bb3, EDGE_FALSE_VALUE); - - remove_edge (e23); - make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0); - - pop_cfun (); - - return bb3; -} - -/* Priority of i386 features, greater value is higher priority. This is - used to decide the order in which function dispatch must happen. For - instance, a version specialized for SSE4.2 should be checked for dispatch - before a version for SSE3, as SSE4.2 implies SSE3. */ -enum feature_priority -{ - P_ZERO = 0, - P_MMX, - P_SSE, - P_SSE2, - P_SSE3, - P_SSSE3, - P_PROC_SSSE3, - P_SSE4_A, - P_PROC_SSE4_A, - P_SSE4_1, - P_SSE4_2, - P_PROC_SSE4_2, - P_POPCNT, - P_AES, - P_PCLMUL, - P_AVX, - P_PROC_AVX, - P_BMI, - P_PROC_BMI, - P_FMA4, - P_XOP, - P_PROC_XOP, - P_FMA, - P_PROC_FMA, - P_BMI2, - P_AVX2, - P_PROC_AVX2, - P_AVX512F, - P_PROC_AVX512F -}; - -/* This is the order of bit-fields in __processor_features in cpuinfo.c */ -enum processor_features -{ - F_CMOV = 0, - F_MMX, - F_POPCNT, - F_SSE, - F_SSE2, - F_SSE3, - F_SSSE3, - F_SSE4_1, - F_SSE4_2, - F_AVX, - F_AVX2, - F_SSE4_A, - F_FMA4, - F_XOP, - F_FMA, - F_AVX512F, - F_BMI, - F_BMI2, - F_AES, - F_PCLMUL, - F_AVX512VL, - F_AVX512BW, - F_AVX512DQ, - F_AVX512CD, - F_AVX512ER, - F_AVX512PF, - F_AVX512VBMI, - F_AVX512IFMA, - F_AVX5124VNNIW, - F_AVX5124FMAPS, - F_AVX512VPOPCNTDQ, - F_AVX512VBMI2, - F_GFNI, - F_VPCLMULQDQ, - F_AVX512VNNI, - F_AVX512BITALG, - F_MAX -}; - -/* These are the values for vendor types and cpu types and subtypes - in cpuinfo.c. Cpu types and subtypes should be subtracted by - the corresponding start value. 
*/ -enum processor_model -{ - M_INTEL = 1, - M_AMD, - M_CPU_TYPE_START, - M_INTEL_BONNELL, - M_INTEL_CORE2, - M_INTEL_COREI7, - M_AMDFAM10H, - M_AMDFAM15H, - M_INTEL_SILVERMONT, - M_INTEL_KNL, - M_AMD_BTVER1, - M_AMD_BTVER2, - M_AMDFAM17H, - M_INTEL_KNM, - M_INTEL_GOLDMONT, - M_INTEL_GOLDMONT_PLUS, - M_INTEL_TREMONT, - M_CPU_SUBTYPE_START, - M_INTEL_COREI7_NEHALEM, - M_INTEL_COREI7_WESTMERE, - M_INTEL_COREI7_SANDYBRIDGE, - M_AMDFAM10H_BARCELONA, - M_AMDFAM10H_SHANGHAI, - M_AMDFAM10H_ISTANBUL, - M_AMDFAM15H_BDVER1, - M_AMDFAM15H_BDVER2, - M_AMDFAM15H_BDVER3, - M_AMDFAM15H_BDVER4, - M_AMDFAM17H_ZNVER1, - M_INTEL_COREI7_IVYBRIDGE, - M_INTEL_COREI7_HASWELL, - M_INTEL_COREI7_BROADWELL, - M_INTEL_COREI7_SKYLAKE, - M_INTEL_COREI7_SKYLAKE_AVX512, - M_INTEL_COREI7_CANNONLAKE, - M_INTEL_COREI7_ICELAKE_CLIENT, - M_INTEL_COREI7_ICELAKE_SERVER, - M_AMDFAM17H_ZNVER2, - M_INTEL_COREI7_CASCADELAKE -}; - -struct _arch_names_table -{ - const char *const name; - const enum processor_model model; -}; - -static const _arch_names_table arch_names_table[] = -{ - {"amd", M_AMD}, - {"intel", M_INTEL}, - {"atom", M_INTEL_BONNELL}, - {"slm", M_INTEL_SILVERMONT}, - {"core2", M_INTEL_CORE2}, - {"corei7", M_INTEL_COREI7}, - {"nehalem", M_INTEL_COREI7_NEHALEM}, - {"westmere", M_INTEL_COREI7_WESTMERE}, - {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE}, - {"ivybridge", M_INTEL_COREI7_IVYBRIDGE}, - {"haswell", M_INTEL_COREI7_HASWELL}, - {"broadwell", M_INTEL_COREI7_BROADWELL}, - {"skylake", M_INTEL_COREI7_SKYLAKE}, - {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512}, - {"cannonlake", M_INTEL_COREI7_CANNONLAKE}, - {"icelake-client", M_INTEL_COREI7_ICELAKE_CLIENT}, - {"icelake-server", M_INTEL_COREI7_ICELAKE_SERVER}, - {"cascadelake", M_INTEL_COREI7_CASCADELAKE}, - {"bonnell", M_INTEL_BONNELL}, - {"silvermont", M_INTEL_SILVERMONT}, - {"goldmont", M_INTEL_GOLDMONT}, - {"goldmont-plus", M_INTEL_GOLDMONT_PLUS}, - {"tremont", M_INTEL_TREMONT}, - {"knl", M_INTEL_KNL}, - {"knm", M_INTEL_KNM}, - {"amdfam10h", M_AMDFAM10H}, - {"barcelona", M_AMDFAM10H_BARCELONA}, - {"shanghai", M_AMDFAM10H_SHANGHAI}, - {"istanbul", M_AMDFAM10H_ISTANBUL}, - {"btver1", M_AMD_BTVER1}, - {"amdfam15h", M_AMDFAM15H}, - {"bdver1", M_AMDFAM15H_BDVER1}, - {"bdver2", M_AMDFAM15H_BDVER2}, - {"bdver3", M_AMDFAM15H_BDVER3}, - {"bdver4", M_AMDFAM15H_BDVER4}, - {"btver2", M_AMD_BTVER2}, - {"amdfam17h", M_AMDFAM17H}, - {"znver1", M_AMDFAM17H_ZNVER1}, - {"znver2", M_AMDFAM17H_ZNVER2}, -}; - -/* These are the target attribute strings for which a dispatcher is - available, from fold_builtin_cpu. 
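The names in arch_names_table and isa_names_table are the strings accepted by __builtin_cpu_is and __builtin_cpu_supports; fold_builtin_cpu below rewrites such calls into reads of __cpu_model and __cpu_features2. A minimal caller-side sketch:

    #include <stdio.h>

    int
    main (void)
    {
      __builtin_cpu_init ();  /* run the CPU detection routine explicitly */
      if (__builtin_cpu_is ("intel"))
        puts ("Intel CPU");
      if (__builtin_cpu_supports ("avx2"))
        puts ("AVX2 is available");
      return 0;
    }
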
*/ -struct _isa_names_table -{ - const char *const name; - const enum processor_features feature; - const enum feature_priority priority; -}; - -static const _isa_names_table isa_names_table[] = -{ - {"cmov", F_CMOV, P_ZERO}, - {"mmx", F_MMX, P_MMX}, - {"popcnt", F_POPCNT, P_POPCNT}, - {"sse", F_SSE, P_SSE}, - {"sse2", F_SSE2, P_SSE2}, - {"sse3", F_SSE3, P_SSE3}, - {"ssse3", F_SSSE3, P_SSSE3}, - {"sse4a", F_SSE4_A, P_SSE4_A}, - {"sse4.1", F_SSE4_1, P_SSE4_1}, - {"sse4.2", F_SSE4_2, P_SSE4_2}, - {"avx", F_AVX, P_AVX}, - {"fma4", F_FMA4, P_FMA4}, - {"xop", F_XOP, P_XOP}, - {"fma", F_FMA, P_FMA}, - {"avx2", F_AVX2, P_AVX2}, - {"avx512f", F_AVX512F, P_AVX512F}, - {"bmi", F_BMI, P_BMI}, - {"bmi2", F_BMI2, P_BMI2}, - {"aes", F_AES, P_AES}, - {"pclmul", F_PCLMUL, P_PCLMUL}, - {"avx512vl",F_AVX512VL, P_ZERO}, - {"avx512bw",F_AVX512BW, P_ZERO}, - {"avx512dq",F_AVX512DQ, P_ZERO}, - {"avx512cd",F_AVX512CD, P_ZERO}, - {"avx512er",F_AVX512ER, P_ZERO}, - {"avx512pf",F_AVX512PF, P_ZERO}, - {"avx512vbmi",F_AVX512VBMI, P_ZERO}, - {"avx512ifma",F_AVX512IFMA, P_ZERO}, - {"avx5124vnniw",F_AVX5124VNNIW, P_ZERO}, - {"avx5124fmaps",F_AVX5124FMAPS, P_ZERO}, - {"avx512vpopcntdq",F_AVX512VPOPCNTDQ, P_ZERO}, - {"avx512vbmi2", F_AVX512VBMI2, P_ZERO}, - {"gfni", F_GFNI, P_ZERO}, - {"vpclmulqdq", F_VPCLMULQDQ, P_ZERO}, - {"avx512vnni", F_AVX512VNNI, P_ZERO}, - {"avx512bitalg", F_AVX512BITALG, P_ZERO} -}; - -/* This parses the attribute arguments to target in DECL and determines - the right builtin to use to match the platform specification. - It returns the priority value for this version decl. If PREDICATE_LIST - is not NULL, it stores the list of cpu features that need to be checked - before dispatching this function. */ - -static unsigned int -get_builtin_code_for_version (tree decl, tree *predicate_list) -{ - tree attrs; - struct cl_target_option cur_target; - tree target_node; - struct cl_target_option *new_target; - const char *arg_str = NULL; - const char *attrs_str = NULL; - char *tok_str = NULL; - char *token; - - enum feature_priority priority = P_ZERO; - - static unsigned int NUM_FEATURES - = sizeof (isa_names_table) / sizeof (_isa_names_table); - - unsigned int i; - - tree predicate_chain = NULL_TREE; - tree predicate_decl, predicate_arg; - - attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl)); - gcc_assert (attrs != NULL); - - attrs = TREE_VALUE (TREE_VALUE (attrs)); - - gcc_assert (TREE_CODE (attrs) == STRING_CST); - attrs_str = TREE_STRING_POINTER (attrs); - - /* Return priority zero for default function. */ - if (strcmp (attrs_str, "default") == 0) - return 0; - - /* Handle arch= if specified. For priority, set it to be 1 more than - the best instruction set the processor can handle. For instance, if - there is a version for atom and a version for ssse3 (the highest ISA - priority for atom), the atom version must be checked for dispatch - before the ssse3 version. 
*/ - if (strstr (attrs_str, "arch=") != NULL) - { - cl_target_option_save (&cur_target, &global_options); - target_node - = ix86_valid_target_attribute_tree (decl, attrs, &global_options, - &global_options_set, 0); - - gcc_assert (target_node); - if (target_node == error_mark_node) - return 0; - new_target = TREE_TARGET_OPTION (target_node); - gcc_assert (new_target); - - if (new_target->arch_specified && new_target->arch > 0) - { - switch (new_target->arch) - { - case PROCESSOR_CORE2: - arg_str = "core2"; - priority = P_PROC_SSSE3; - break; - case PROCESSOR_NEHALEM: - if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_PCLMUL) - { - arg_str = "westmere"; - priority = P_PCLMUL; - } - else - { - /* We translate "arch=corei7" and "arch=nehalem" to - "corei7" so that it will be mapped to M_INTEL_COREI7 - as cpu type to cover all M_INTEL_COREI7_XXXs. */ - arg_str = "corei7"; - priority = P_PROC_SSE4_2; - } - break; - case PROCESSOR_SANDYBRIDGE: - if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C) - arg_str = "ivybridge"; - else - arg_str = "sandybridge"; - priority = P_PROC_AVX; - break; - case PROCESSOR_HASWELL: - if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX) - arg_str = "broadwell"; - else - arg_str = "haswell"; - priority = P_PROC_AVX2; - break; - case PROCESSOR_SKYLAKE: - arg_str = "skylake"; - priority = P_PROC_AVX2; - break; - case PROCESSOR_SKYLAKE_AVX512: - arg_str = "skylake-avx512"; - priority = P_PROC_AVX512F; - break; - case PROCESSOR_CANNONLAKE: - arg_str = "cannonlake"; - priority = P_PROC_AVX512F; - break; - case PROCESSOR_ICELAKE_CLIENT: - arg_str = "icelake-client"; - priority = P_PROC_AVX512F; - break; - case PROCESSOR_ICELAKE_SERVER: - arg_str = "icelake-server"; - priority = P_PROC_AVX512F; - break; - case PROCESSOR_CASCADELAKE: - arg_str = "cascadelake"; - priority = P_PROC_AVX512F; - break; - case PROCESSOR_BONNELL: - arg_str = "bonnell"; - priority = P_PROC_SSSE3; - break; - case PROCESSOR_KNL: - arg_str = "knl"; - priority = P_PROC_AVX512F; - break; - case PROCESSOR_KNM: - arg_str = "knm"; - priority = P_PROC_AVX512F; - break; - case PROCESSOR_SILVERMONT: - arg_str = "silvermont"; - priority = P_PROC_SSE4_2; - break; - case PROCESSOR_GOLDMONT: - arg_str = "goldmont"; - priority = P_PROC_SSE4_2; - break; - case PROCESSOR_GOLDMONT_PLUS: - arg_str = "goldmont-plus"; - priority = P_PROC_SSE4_2; - break; - case PROCESSOR_TREMONT: - arg_str = "tremont"; - priority = P_PROC_SSE4_2; - break; - case PROCESSOR_AMDFAM10: - arg_str = "amdfam10h"; - priority = P_PROC_SSE4_A; - break; - case PROCESSOR_BTVER1: - arg_str = "btver1"; - priority = P_PROC_SSE4_A; - break; - case PROCESSOR_BTVER2: - arg_str = "btver2"; - priority = P_PROC_BMI; - break; - case PROCESSOR_BDVER1: - arg_str = "bdver1"; - priority = P_PROC_XOP; - break; - case PROCESSOR_BDVER2: - arg_str = "bdver2"; - priority = P_PROC_FMA; - break; - case PROCESSOR_BDVER3: - arg_str = "bdver3"; - priority = P_PROC_FMA; - break; - case PROCESSOR_BDVER4: - arg_str = "bdver4"; - priority = P_PROC_AVX2; - break; - case PROCESSOR_ZNVER1: - arg_str = "znver1"; - priority = P_PROC_AVX2; - break; - case PROCESSOR_ZNVER2: - arg_str = "znver2"; - priority = P_PROC_AVX2; - break; - } - } - - cl_target_option_restore (&global_options, &cur_target); - - if (predicate_list && arg_str == NULL) - { - error_at (DECL_SOURCE_LOCATION (decl), - "no dispatcher found for the versioning attributes"); - return 0; - } - - if (predicate_list) - { - predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS]; - /* For a C string literal 
the length includes the trailing NULL. */ - predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str); - predicate_chain = tree_cons (predicate_decl, predicate_arg, - predicate_chain); - } - } - - /* Process feature name. */ - tok_str = (char *) xmalloc (strlen (attrs_str) + 1); - strcpy (tok_str, attrs_str); - token = strtok (tok_str, ","); - predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS]; - - while (token != NULL) - { - /* Do not process "arch=" */ - if (strncmp (token, "arch=", 5) == 0) - { - token = strtok (NULL, ","); - continue; - } - for (i = 0; i < NUM_FEATURES; ++i) - { - if (strcmp (token, isa_names_table[i].name) == 0) - { - if (predicate_list) - { - predicate_arg = build_string_literal ( - strlen (isa_names_table[i].name) + 1, - isa_names_table[i].name); - predicate_chain = tree_cons (predicate_decl, predicate_arg, - predicate_chain); - } - /* Find the maximum priority feature. */ - if (isa_names_table[i].priority > priority) - priority = isa_names_table[i].priority; - - break; - } - } - if (predicate_list && priority == P_ZERO) - { - error_at (DECL_SOURCE_LOCATION (decl), - "ISA %qs is not supported in % attribute, " - "use % syntax", token); - return 0; - } - token = strtok (NULL, ","); - } - free (tok_str); - - if (predicate_list && predicate_chain == NULL_TREE) - { - error_at (DECL_SOURCE_LOCATION (decl), - "no dispatcher found for the versioning attributes: %s", - attrs_str); - return 0; - } - else if (predicate_list) - { - predicate_chain = nreverse (predicate_chain); - *predicate_list = predicate_chain; - } - - return priority; -} - -/* This compares the priority of target features in function DECL1 - and DECL2. It returns positive value if DECL1 is higher priority, - negative value if DECL2 is higher priority and 0 if they are the - same. */ - -static int -ix86_compare_version_priority (tree decl1, tree decl2) -{ - unsigned int priority1 = get_builtin_code_for_version (decl1, NULL); - unsigned int priority2 = get_builtin_code_for_version (decl2, NULL); - - return (int)priority1 - (int)priority2; -} - -/* V1 and V2 point to function versions with different priorities - based on the target ISA. This function compares their priorities. */ - -static int -feature_compare (const void *v1, const void *v2) -{ - typedef struct _function_version_info - { - tree version_decl; - tree predicate_chain; - unsigned int dispatch_priority; - } function_version_info; - - const function_version_info c1 = *(const function_version_info *)v1; - const function_version_info c2 = *(const function_version_info *)v2; - return (c2.dispatch_priority - c1.dispatch_priority); -} - -/* This function generates the dispatch function for - multi-versioned functions. DISPATCH_DECL is the function which will - contain the dispatch logic. FNDECLS are the function choices for - dispatch, and is a tree chain. EMPTY_BB is the basic block pointer - in DISPATCH_DECL in which the dispatch code is generated. */ - -static int -dispatch_function_versions (tree dispatch_decl, - void *fndecls_p, - basic_block *empty_bb) -{ - tree default_decl; - gimple *ifunc_cpu_init_stmt; - gimple_seq gseq; - int ix; - tree ele; - vec *fndecls; - unsigned int num_versions = 0; - unsigned int actual_versions = 0; - unsigned int i; - - struct _function_version_info - { - tree version_decl; - tree predicate_chain; - unsigned int dispatch_priority; - }*function_version_info; - - gcc_assert (dispatch_decl != NULL - && fndecls_p != NULL - && empty_bb != NULL); - - /*fndecls_p is actually a vector. 
*/ - fndecls = static_cast *> (fndecls_p); - - /* At least one more version other than the default. */ - num_versions = fndecls->length (); - gcc_assert (num_versions >= 2); - - function_version_info = (struct _function_version_info *) - XNEWVEC (struct _function_version_info, (num_versions - 1)); - - /* The first version in the vector is the default decl. */ - default_decl = (*fndecls)[0]; - - push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl)); - - gseq = bb_seq (*empty_bb); - /* Function version dispatch is via IFUNC. IFUNC resolvers fire before - constructors, so explicity call __builtin_cpu_init here. */ - ifunc_cpu_init_stmt = gimple_build_call_vec ( - ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL); - gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt); - gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb); - set_bb_seq (*empty_bb, gseq); - - pop_cfun (); - - - for (ix = 1; fndecls->iterate (ix, &ele); ++ix) - { - tree version_decl = ele; - tree predicate_chain = NULL_TREE; - unsigned int priority; - /* Get attribute string, parse it and find the right predicate decl. - The predicate function could be a lengthy combination of many - features, like arch-type and various isa-variants. */ - priority = get_builtin_code_for_version (version_decl, - &predicate_chain); - - if (predicate_chain == NULL_TREE) - continue; - - function_version_info [actual_versions].version_decl = version_decl; - function_version_info [actual_versions].predicate_chain - = predicate_chain; - function_version_info [actual_versions].dispatch_priority = priority; - actual_versions++; - } - - /* Sort the versions according to descending order of dispatch priority. The - priority is based on the ISA. This is not a perfect solution. There - could still be ambiguity. If more than one function version is suitable - to execute, which one should be dispatched? In future, allow the user - to specify a dispatch priority next to the version. */ - qsort (function_version_info, actual_versions, - sizeof (struct _function_version_info), feature_compare); - - for (i = 0; i < actual_versions; ++i) - *empty_bb = add_condition_to_bb (dispatch_decl, - function_version_info[i].version_decl, - function_version_info[i].predicate_chain, - *empty_bb); - - /* dispatch default version at the end. */ - *empty_bb = add_condition_to_bb (dispatch_decl, default_decl, - NULL, *empty_bb); - - free (function_version_info); - return 0; -} - -/* This function changes the assembler name for functions that are - versions. If DECL is a function version and has a "target" - attribute, it appends the attribute string to its assembler name. */ - -static tree -ix86_mangle_function_version_assembler_name (tree decl, tree id) -{ - tree version_attr; - const char *orig_name, *version_string; - char *attr_str, *assembler_name; - - if (DECL_DECLARED_INLINE_P (decl) - && lookup_attribute ("gnu_inline", - DECL_ATTRIBUTES (decl))) - error_at (DECL_SOURCE_LOCATION (decl), - "function versions cannot be marked as gnu_inline," - " bodies have to be generated"); - - if (DECL_VIRTUAL_P (decl) - || DECL_VINDEX (decl)) - sorry ("virtual function multiversioning not supported"); - - version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl)); - - /* target attribute string cannot be NULL. 
*/ - gcc_assert (version_attr != NULL_TREE); - - orig_name = IDENTIFIER_POINTER (id); - version_string - = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr))); - - if (strcmp (version_string, "default") == 0) - return id; - - attr_str = sorted_attr_string (TREE_VALUE (version_attr)); - assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2); - - sprintf (assembler_name, "%s.%s", orig_name, attr_str); - - /* Allow assembler name to be modified if already set. */ - if (DECL_ASSEMBLER_NAME_SET_P (decl)) - SET_DECL_RTL (decl, NULL); - - tree ret = get_identifier (assembler_name); - XDELETEVEC (attr_str); - XDELETEVEC (assembler_name); - return ret; -} - - -static tree -ix86_mangle_decl_assembler_name (tree decl, tree id) -{ - /* For function version, add the target suffix to the assembler name. */ - if (TREE_CODE (decl) == FUNCTION_DECL - && DECL_FUNCTION_VERSIONED (decl)) - id = ix86_mangle_function_version_assembler_name (decl, id); -#ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME - id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id); -#endif - - return id; -} - -/* Make a dispatcher declaration for the multi-versioned function DECL. - Calls to DECL function will be replaced with calls to the dispatcher - by the front-end. Returns the decl of the dispatcher function. */ - -static tree -ix86_get_function_versions_dispatcher (void *decl) -{ - tree fn = (tree) decl; - struct cgraph_node *node = NULL; - struct cgraph_node *default_node = NULL; - struct cgraph_function_version_info *node_v = NULL; - struct cgraph_function_version_info *first_v = NULL; - - tree dispatch_decl = NULL; - - struct cgraph_function_version_info *default_version_info = NULL; - - gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn)); - - node = cgraph_node::get (fn); - gcc_assert (node != NULL); - - node_v = node->function_version (); - gcc_assert (node_v != NULL); - - if (node_v->dispatcher_resolver != NULL) - return node_v->dispatcher_resolver; - - /* Find the default version and make it the first node. */ - first_v = node_v; - /* Go to the beginning of the chain. */ - while (first_v->prev != NULL) - first_v = first_v->prev; - default_version_info = first_v; - while (default_version_info != NULL) - { - if (is_function_default_version - (default_version_info->this_node->decl)) - break; - default_version_info = default_version_info->next; - } - - /* If there is no default node, just return NULL. */ - if (default_version_info == NULL) - return NULL; - - /* Make default info the first node. */ - if (first_v != default_version_info) - { - default_version_info->prev->next = default_version_info->next; - if (default_version_info->next) - default_version_info->next->prev = default_version_info->prev; - first_v->prev = default_version_info; - default_version_info->next = first_v; - default_version_info->prev = NULL; - } - - default_node = default_version_info->this_node; - -#if defined (ASM_OUTPUT_TYPE_DIRECTIVE) - if (targetm.has_ifunc_p ()) - { - struct cgraph_function_version_info *it_v = NULL; - struct cgraph_node *dispatcher_node = NULL; - struct cgraph_function_version_info *dispatcher_version_info = NULL; - - /* Right now, the dispatching is done via ifunc. 
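The resolver and dispatcher code below backs x86 function multiversioning: each version is guarded by __builtin_cpu_is/__builtin_cpu_supports predicates, ordered by the priorities above, and the chosen version is bound through an IFUNC at load time. A hedged user-level sketch using the target_clones attribute, assuming an ifunc-capable x86_64 GNU/Linux target:

    /* GCC emits one clone of scale() per listed target plus a resolver;
       the resolver body comes from the dispatcher hooks in this file.  */
    __attribute__ ((target_clones ("avx2", "sse4.2", "default")))
    int
    scale (int x)
    {
      return 2 * x;
    }

    int
    use_scale (int x)
    {
      return scale (x);  /* resolved once, at load time, through the IFUNC */
    }
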
*/ - dispatch_decl = make_dispatcher_decl (default_node->decl); - - dispatcher_node = cgraph_node::get_create (dispatch_decl); - gcc_assert (dispatcher_node != NULL); - dispatcher_node->dispatcher_function = 1; - dispatcher_version_info - = dispatcher_node->insert_new_function_version (); - dispatcher_version_info->next = default_version_info; - dispatcher_node->definition = 1; - - /* Set the dispatcher for all the versions. */ - it_v = default_version_info; - while (it_v != NULL) - { - it_v->dispatcher_resolver = dispatch_decl; - it_v = it_v->next; - } - } - else -#endif - { - error_at (DECL_SOURCE_LOCATION (default_node->decl), - "multiversioning needs ifunc which is not supported " - "on this target"); - } - - return dispatch_decl; -} - -/* Make the resolver function decl to dispatch the versions of - a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is - ifunc alias that will point to the created resolver. Create an - empty basic block in the resolver and store the pointer in - EMPTY_BB. Return the decl of the resolver function. */ - -static tree -make_resolver_func (const tree default_decl, - const tree ifunc_alias_decl, - basic_block *empty_bb) -{ - char *resolver_name; - tree decl, type, decl_name, t; - - /* IFUNC's have to be globally visible. So, if the default_decl is - not, then the name of the IFUNC should be made unique. */ - if (TREE_PUBLIC (default_decl) == 0) - { - char *ifunc_name = make_unique_name (default_decl, "ifunc", true); - symtab->change_decl_assembler_name (ifunc_alias_decl, - get_identifier (ifunc_name)); - XDELETEVEC (ifunc_name); - } - - resolver_name = make_unique_name (default_decl, "resolver", false); - - /* The resolver function should return a (void *). */ - type = build_function_type_list (ptr_type_node, NULL_TREE); - - decl = build_fn_decl (resolver_name, type); - decl_name = get_identifier (resolver_name); - SET_DECL_ASSEMBLER_NAME (decl, decl_name); - - DECL_NAME (decl) = decl_name; - TREE_USED (decl) = 1; - DECL_ARTIFICIAL (decl) = 1; - DECL_IGNORED_P (decl) = 1; - TREE_PUBLIC (decl) = 0; - DECL_UNINLINABLE (decl) = 1; - - /* Resolver is not external, body is generated. */ - DECL_EXTERNAL (decl) = 0; - DECL_EXTERNAL (ifunc_alias_decl) = 0; - - DECL_CONTEXT (decl) = NULL_TREE; - DECL_INITIAL (decl) = make_node (BLOCK); - DECL_STATIC_CONSTRUCTOR (decl) = 0; - - if (DECL_COMDAT_GROUP (default_decl) - || TREE_PUBLIC (default_decl)) - { - /* In this case, each translation unit with a call to this - versioned function will put out a resolver. Ensure it - is comdat to keep just one copy. */ - DECL_COMDAT (decl) = 1; - make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl)); - } - /* Build result decl and add to function_decl. */ - t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node); - DECL_CONTEXT (t) = decl; - DECL_ARTIFICIAL (t) = 1; - DECL_IGNORED_P (t) = 1; - DECL_RESULT (decl) = t; - - gimplify_function_tree (decl); - push_cfun (DECL_STRUCT_FUNCTION (decl)); - *empty_bb = init_lowered_empty_function (decl, false, - profile_count::uninitialized ()); - - cgraph_node::add_new_function (decl, true); - symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl)); - - pop_cfun (); - - gcc_assert (ifunc_alias_decl != NULL); - /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */ - DECL_ATTRIBUTES (ifunc_alias_decl) - = make_attribute ("ifunc", resolver_name, - DECL_ATTRIBUTES (ifunc_alias_decl)); - - /* Create the alias for dispatch to resolver here. 
*/ - cgraph_node::create_same_body_alias (ifunc_alias_decl, decl); - XDELETEVEC (resolver_name); - return decl; -} - -/* Generate the dispatching code body to dispatch multi-versioned function - DECL. The target hook is called to process the "target" attributes and - provide the code to dispatch the right function at run-time. NODE points - to the dispatcher decl whose body will be created. */ - -static tree -ix86_generate_version_dispatcher_body (void *node_p) -{ - tree resolver_decl; - basic_block empty_bb; - tree default_ver_decl; - struct cgraph_node *versn; - struct cgraph_node *node; - - struct cgraph_function_version_info *node_version_info = NULL; - struct cgraph_function_version_info *versn_info = NULL; - - node = (cgraph_node *)node_p; - - node_version_info = node->function_version (); - gcc_assert (node->dispatcher_function - && node_version_info != NULL); - - if (node_version_info->dispatcher_resolver) - return node_version_info->dispatcher_resolver; - - /* The first version in the chain corresponds to the default version. */ - default_ver_decl = node_version_info->next->this_node->decl; - - /* node is going to be an alias, so remove the finalized bit. */ - node->definition = false; - - resolver_decl = make_resolver_func (default_ver_decl, - node->decl, &empty_bb); - - node_version_info->dispatcher_resolver = resolver_decl; - - push_cfun (DECL_STRUCT_FUNCTION (resolver_decl)); - - auto_vec fn_ver_vec; - - for (versn_info = node_version_info->next; versn_info; - versn_info = versn_info->next) - { - versn = versn_info->this_node; - /* Check for virtual functions here again, as by this time it should - have been determined if this function needs a vtable index or - not. This happens for methods in derived classes that override - virtual methods in base classes but are not explicitly marked as - virtual. */ - if (DECL_VINDEX (versn->decl)) - sorry ("virtual function multiversioning not supported"); - - fn_ver_vec.safe_push (versn->decl); - } - - dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb); - cgraph_edge::rebuild_edges (); - pop_cfun (); - return resolver_decl; -} -/* This builds the processor_model struct type defined in - libgcc/config/i386/cpuinfo.c */ - -static tree -build_processor_model_struct (void) -{ - const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype", - "__cpu_features"}; - tree field = NULL_TREE, field_chain = NULL_TREE; - int i; - tree type = make_node (RECORD_TYPE); - - /* The first 3 fields are unsigned int. */ - for (i = 0; i < 3; ++i) - { - field = build_decl (UNKNOWN_LOCATION, FIELD_DECL, - get_identifier (field_name[i]), unsigned_type_node); - if (field_chain != NULL_TREE) - DECL_CHAIN (field) = field_chain; - field_chain = field; - } - - /* The last field is an array of unsigned integers of size one. */ - field = build_decl (UNKNOWN_LOCATION, FIELD_DECL, - get_identifier (field_name[3]), - build_array_type (unsigned_type_node, - build_index_type (size_one_node))); - if (field_chain != NULL_TREE) - DECL_CHAIN (field) = field_chain; - field_chain = field; - - finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE); - return type; -} - -/* Returns a extern, comdat VAR_DECL of type TYPE and name NAME. 
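build_processor_model_struct above rebuilds, field by field, the __processor_model record that libgcc fills in at startup and exports as __cpu_model; fold_builtin_cpu then compares its fields against the M_* and F_* values. As a reference sketch, the layout being reconstructed is equivalent to:

    struct __processor_model
    {
      unsigned int __cpu_vendor;      /* compared against M_INTEL / M_AMD */
      unsigned int __cpu_type;        /* M_* value minus M_CPU_TYPE_START */
      unsigned int __cpu_subtype;     /* M_* value minus M_CPU_SUBTYPE_START */
      unsigned int __cpu_features[1]; /* one bit per enum processor_features */
    };
    extern struct __processor_model __cpu_model;
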
*/ - -static tree -make_var_decl (tree type, const char *name) -{ - tree new_decl; - - new_decl = build_decl (UNKNOWN_LOCATION, - VAR_DECL, - get_identifier(name), - type); - - DECL_EXTERNAL (new_decl) = 1; - TREE_STATIC (new_decl) = 1; - TREE_PUBLIC (new_decl) = 1; - DECL_INITIAL (new_decl) = 0; - DECL_ARTIFICIAL (new_decl) = 0; - DECL_PRESERVE_P (new_decl) = 1; - - make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl)); - assemble_variable (new_decl, 0, 0, 0); - - return new_decl; -} - -/* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded - into an integer defined in libgcc/config/i386/cpuinfo.c */ - -static tree -fold_builtin_cpu (tree fndecl, tree *args) -{ - unsigned int i; - enum ix86_builtins fn_code = (enum ix86_builtins) - DECL_FUNCTION_CODE (fndecl); - tree param_string_cst = NULL; - - tree __processor_model_type = build_processor_model_struct (); - tree __cpu_model_var = make_var_decl (__processor_model_type, - "__cpu_model"); - - - varpool_node::add (__cpu_model_var); - - gcc_assert ((args != NULL) && (*args != NULL)); - - param_string_cst = *args; - while (param_string_cst - && TREE_CODE (param_string_cst) != STRING_CST) - { - /* *args must be a expr that can contain other EXPRS leading to a - STRING_CST. */ - if (!EXPR_P (param_string_cst)) - { - error ("parameter to builtin must be a string constant or literal"); - return integer_zero_node; - } - param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0); - } - - gcc_assert (param_string_cst); - - if (fn_code == IX86_BUILTIN_CPU_IS) - { - tree ref; - tree field; - tree final; - - unsigned int field_val = 0; - unsigned int NUM_ARCH_NAMES - = sizeof (arch_names_table) / sizeof (struct _arch_names_table); - - for (i = 0; i < NUM_ARCH_NAMES; i++) - if (strcmp (arch_names_table[i].name, - TREE_STRING_POINTER (param_string_cst)) == 0) - break; - - if (i == NUM_ARCH_NAMES) - { - error ("parameter to builtin not valid: %s", - TREE_STRING_POINTER (param_string_cst)); - return integer_zero_node; - } - - field = TYPE_FIELDS (__processor_model_type); - field_val = arch_names_table[i].model; - - /* CPU types are stored in the next field. */ - if (field_val > M_CPU_TYPE_START - && field_val < M_CPU_SUBTYPE_START) - { - field = DECL_CHAIN (field); - field_val -= M_CPU_TYPE_START; - } - - /* CPU subtypes are stored in the next field. */ - if (field_val > M_CPU_SUBTYPE_START) - { - field = DECL_CHAIN ( DECL_CHAIN (field)); - field_val -= M_CPU_SUBTYPE_START; - } - - /* Get the appropriate field in __cpu_model. */ - ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var, - field, NULL_TREE); - - /* Check the value. 
*/ - final = build2 (EQ_EXPR, unsigned_type_node, ref, - build_int_cstu (unsigned_type_node, field_val)); - return build1 (CONVERT_EXPR, integer_type_node, final); - } - else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS) - { - tree ref; - tree array_elt; - tree field; - tree final; - - unsigned int field_val = 0; - unsigned int NUM_ISA_NAMES - = sizeof (isa_names_table) / sizeof (struct _isa_names_table); - - for (i = 0; i < NUM_ISA_NAMES; i++) - if (strcmp (isa_names_table[i].name, - TREE_STRING_POINTER (param_string_cst)) == 0) - break; - - if (i == NUM_ISA_NAMES) - { - error ("parameter to builtin not valid: %s", - TREE_STRING_POINTER (param_string_cst)); - return integer_zero_node; - } - - if (isa_names_table[i].feature >= 32) - { - tree __cpu_features2_var = make_var_decl (unsigned_type_node, - "__cpu_features2"); - - varpool_node::add (__cpu_features2_var); - field_val = (1U << (isa_names_table[i].feature - 32)); - /* Return __cpu_features2 & field_val */ - final = build2 (BIT_AND_EXPR, unsigned_type_node, - __cpu_features2_var, - build_int_cstu (unsigned_type_node, field_val)); - return build1 (CONVERT_EXPR, integer_type_node, final); - } - - field = TYPE_FIELDS (__processor_model_type); - /* Get the last field, which is __cpu_features. */ - while (DECL_CHAIN (field)) - field = DECL_CHAIN (field); - - /* Get the appropriate field: __cpu_model.__cpu_features */ - ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var, - field, NULL_TREE); - - /* Access the 0th element of __cpu_features array. */ - array_elt = build4 (ARRAY_REF, unsigned_type_node, ref, - integer_zero_node, NULL_TREE, NULL_TREE); - - field_val = (1U << isa_names_table[i].feature); - /* Return __cpu_model.__cpu_features[0] & field_val */ - final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt, - build_int_cstu (unsigned_type_node, field_val)); - return build1 (CONVERT_EXPR, integer_type_node, final); - } - gcc_unreachable (); -} - -/* Return the shift count of a vector by scalar shift builtin second argument - ARG1. */ -static tree -ix86_vector_shift_count (tree arg1) -{ - if (tree_fits_uhwi_p (arg1)) - return arg1; - else if (TREE_CODE (arg1) == VECTOR_CST && CHAR_BIT == 8) - { - /* The count argument is weird, passed in as various 128-bit - (or 64-bit) vectors, the low 64 bits from it are the count. 
*/ - unsigned char buf[16]; - int len = native_encode_expr (arg1, buf, 16); - if (len == 0) - return NULL_TREE; - tree t = native_interpret_expr (uint64_type_node, buf, len); - if (t && tree_fits_uhwi_p (t)) - return t; - } - return NULL_TREE; -} - -static tree -ix86_fold_builtin (tree fndecl, int n_args, - tree *args, bool ignore ATTRIBUTE_UNUSED) -{ - if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD) - { - enum ix86_builtins fn_code = (enum ix86_builtins) - DECL_FUNCTION_CODE (fndecl); - enum rtx_code rcode; - bool is_vshift; - unsigned HOST_WIDE_INT mask; - - switch (fn_code) - { - case IX86_BUILTIN_CPU_IS: - case IX86_BUILTIN_CPU_SUPPORTS: - gcc_assert (n_args == 1); - return fold_builtin_cpu (fndecl, args); - - case IX86_BUILTIN_NANQ: - case IX86_BUILTIN_NANSQ: - { - tree type = TREE_TYPE (TREE_TYPE (fndecl)); - const char *str = c_getstr (*args); - int quiet = fn_code == IX86_BUILTIN_NANQ; - REAL_VALUE_TYPE real; - - if (str && real_nan (&real, str, quiet, TYPE_MODE (type))) - return build_real (type, real); - return NULL_TREE; - } - - case IX86_BUILTIN_INFQ: - case IX86_BUILTIN_HUGE_VALQ: - { - tree type = TREE_TYPE (TREE_TYPE (fndecl)); - REAL_VALUE_TYPE inf; - real_inf (&inf); - return build_real (type, inf); - } - - case IX86_BUILTIN_TZCNT16: - case IX86_BUILTIN_CTZS: - case IX86_BUILTIN_TZCNT32: - case IX86_BUILTIN_TZCNT64: - gcc_assert (n_args == 1); - if (TREE_CODE (args[0]) == INTEGER_CST) - { - tree type = TREE_TYPE (TREE_TYPE (fndecl)); - tree arg = args[0]; - if (fn_code == IX86_BUILTIN_TZCNT16 - || fn_code == IX86_BUILTIN_CTZS) - arg = fold_convert (short_unsigned_type_node, arg); - if (integer_zerop (arg)) - return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg))); - else - return fold_const_call (CFN_CTZ, type, arg); - } - break; - - case IX86_BUILTIN_LZCNT16: - case IX86_BUILTIN_CLZS: - case IX86_BUILTIN_LZCNT32: - case IX86_BUILTIN_LZCNT64: - gcc_assert (n_args == 1); - if (TREE_CODE (args[0]) == INTEGER_CST) - { - tree type = TREE_TYPE (TREE_TYPE (fndecl)); - tree arg = args[0]; - if (fn_code == IX86_BUILTIN_LZCNT16 - || fn_code == IX86_BUILTIN_CLZS) - arg = fold_convert (short_unsigned_type_node, arg); - if (integer_zerop (arg)) - return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg))); - else - return fold_const_call (CFN_CLZ, type, arg); - } - break; - - case IX86_BUILTIN_BEXTR32: - case IX86_BUILTIN_BEXTR64: - case IX86_BUILTIN_BEXTRI32: - case IX86_BUILTIN_BEXTRI64: - gcc_assert (n_args == 2); - if (tree_fits_uhwi_p (args[1])) - { - unsigned HOST_WIDE_INT res = 0; - unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0])); - unsigned int start = tree_to_uhwi (args[1]); - unsigned int len = (start & 0xff00) >> 8; - start &= 0xff; - if (start >= prec || len == 0) - res = 0; - else if (!tree_fits_uhwi_p (args[0])) - break; - else - res = tree_to_uhwi (args[0]) >> start; - if (len > prec) - len = prec; - if (len < HOST_BITS_PER_WIDE_INT) - res &= (HOST_WIDE_INT_1U << len) - 1; - return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res); - } - break; - - case IX86_BUILTIN_BZHI32: - case IX86_BUILTIN_BZHI64: - gcc_assert (n_args == 2); - if (tree_fits_uhwi_p (args[1])) - { - unsigned int idx = tree_to_uhwi (args[1]) & 0xff; - if (idx >= TYPE_PRECISION (TREE_TYPE (args[0]))) - return args[0]; - if (idx == 0) - return build_int_cst (TREE_TYPE (TREE_TYPE (fndecl)), 0); - if (!tree_fits_uhwi_p (args[0])) - break; - unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]); - res &= ~(HOST_WIDE_INT_M1U << idx); - return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), 
res); - } - break; - - case IX86_BUILTIN_PDEP32: - case IX86_BUILTIN_PDEP64: - gcc_assert (n_args == 2); - if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1])) - { - unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]); - unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]); - unsigned HOST_WIDE_INT res = 0; - unsigned HOST_WIDE_INT m, k = 1; - for (m = 1; m; m <<= 1) - if ((mask & m) != 0) - { - if ((src & k) != 0) - res |= m; - k <<= 1; - } - return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res); - } - break; - - case IX86_BUILTIN_PEXT32: - case IX86_BUILTIN_PEXT64: - gcc_assert (n_args == 2); - if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1])) - { - unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]); - unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]); - unsigned HOST_WIDE_INT res = 0; - unsigned HOST_WIDE_INT m, k = 1; - for (m = 1; m; m <<= 1) - if ((mask & m) != 0) - { - if ((src & m) != 0) - res |= k; - k <<= 1; - } - return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res); - } - break; - - case IX86_BUILTIN_MOVMSKPS: - case IX86_BUILTIN_PMOVMSKB: - case IX86_BUILTIN_MOVMSKPD: - case IX86_BUILTIN_PMOVMSKB128: - case IX86_BUILTIN_MOVMSKPD256: - case IX86_BUILTIN_MOVMSKPS256: - case IX86_BUILTIN_PMOVMSKB256: - gcc_assert (n_args == 1); - if (TREE_CODE (args[0]) == VECTOR_CST) - { - HOST_WIDE_INT res = 0; - for (unsigned i = 0; i < VECTOR_CST_NELTS (args[0]); ++i) - { - tree e = VECTOR_CST_ELT (args[0], i); - if (TREE_CODE (e) == INTEGER_CST && !TREE_OVERFLOW (e)) - { - if (wi::neg_p (wi::to_wide (e))) - res |= HOST_WIDE_INT_1 << i; - } - else if (TREE_CODE (e) == REAL_CST && !TREE_OVERFLOW (e)) - { - if (TREE_REAL_CST (e).sign) - res |= HOST_WIDE_INT_1 << i; - } - else - return NULL_TREE; - } - return build_int_cst (TREE_TYPE (TREE_TYPE (fndecl)), res); - } - break; - - case IX86_BUILTIN_PSLLD: - case IX86_BUILTIN_PSLLD128: - case IX86_BUILTIN_PSLLD128_MASK: - case IX86_BUILTIN_PSLLD256: - case IX86_BUILTIN_PSLLD256_MASK: - case IX86_BUILTIN_PSLLD512: - case IX86_BUILTIN_PSLLDI: - case IX86_BUILTIN_PSLLDI128: - case IX86_BUILTIN_PSLLDI128_MASK: - case IX86_BUILTIN_PSLLDI256: - case IX86_BUILTIN_PSLLDI256_MASK: - case IX86_BUILTIN_PSLLDI512: - case IX86_BUILTIN_PSLLQ: - case IX86_BUILTIN_PSLLQ128: - case IX86_BUILTIN_PSLLQ128_MASK: - case IX86_BUILTIN_PSLLQ256: - case IX86_BUILTIN_PSLLQ256_MASK: - case IX86_BUILTIN_PSLLQ512: - case IX86_BUILTIN_PSLLQI: - case IX86_BUILTIN_PSLLQI128: - case IX86_BUILTIN_PSLLQI128_MASK: - case IX86_BUILTIN_PSLLQI256: - case IX86_BUILTIN_PSLLQI256_MASK: - case IX86_BUILTIN_PSLLQI512: - case IX86_BUILTIN_PSLLW: - case IX86_BUILTIN_PSLLW128: - case IX86_BUILTIN_PSLLW128_MASK: - case IX86_BUILTIN_PSLLW256: - case IX86_BUILTIN_PSLLW256_MASK: - case IX86_BUILTIN_PSLLW512_MASK: - case IX86_BUILTIN_PSLLWI: - case IX86_BUILTIN_PSLLWI128: - case IX86_BUILTIN_PSLLWI128_MASK: - case IX86_BUILTIN_PSLLWI256: - case IX86_BUILTIN_PSLLWI256_MASK: - case IX86_BUILTIN_PSLLWI512_MASK: - rcode = ASHIFT; - is_vshift = false; - goto do_shift; - case IX86_BUILTIN_PSRAD: - case IX86_BUILTIN_PSRAD128: - case IX86_BUILTIN_PSRAD128_MASK: - case IX86_BUILTIN_PSRAD256: - case IX86_BUILTIN_PSRAD256_MASK: - case IX86_BUILTIN_PSRAD512: - case IX86_BUILTIN_PSRADI: - case IX86_BUILTIN_PSRADI128: - case IX86_BUILTIN_PSRADI128_MASK: - case IX86_BUILTIN_PSRADI256: - case IX86_BUILTIN_PSRADI256_MASK: - case IX86_BUILTIN_PSRADI512: - case IX86_BUILTIN_PSRAQ128_MASK: - case IX86_BUILTIN_PSRAQ256_MASK: - case IX86_BUILTIN_PSRAQ512: - case 
IX86_BUILTIN_PSRAQI128_MASK: - case IX86_BUILTIN_PSRAQI256_MASK: - case IX86_BUILTIN_PSRAQI512: - case IX86_BUILTIN_PSRAW: - case IX86_BUILTIN_PSRAW128: - case IX86_BUILTIN_PSRAW128_MASK: - case IX86_BUILTIN_PSRAW256: - case IX86_BUILTIN_PSRAW256_MASK: - case IX86_BUILTIN_PSRAW512: - case IX86_BUILTIN_PSRAWI: - case IX86_BUILTIN_PSRAWI128: - case IX86_BUILTIN_PSRAWI128_MASK: - case IX86_BUILTIN_PSRAWI256: - case IX86_BUILTIN_PSRAWI256_MASK: - case IX86_BUILTIN_PSRAWI512: - rcode = ASHIFTRT; - is_vshift = false; - goto do_shift; - case IX86_BUILTIN_PSRLD: - case IX86_BUILTIN_PSRLD128: - case IX86_BUILTIN_PSRLD128_MASK: - case IX86_BUILTIN_PSRLD256: - case IX86_BUILTIN_PSRLD256_MASK: - case IX86_BUILTIN_PSRLD512: - case IX86_BUILTIN_PSRLDI: - case IX86_BUILTIN_PSRLDI128: - case IX86_BUILTIN_PSRLDI128_MASK: - case IX86_BUILTIN_PSRLDI256: - case IX86_BUILTIN_PSRLDI256_MASK: - case IX86_BUILTIN_PSRLDI512: - case IX86_BUILTIN_PSRLQ: - case IX86_BUILTIN_PSRLQ128: - case IX86_BUILTIN_PSRLQ128_MASK: - case IX86_BUILTIN_PSRLQ256: - case IX86_BUILTIN_PSRLQ256_MASK: - case IX86_BUILTIN_PSRLQ512: - case IX86_BUILTIN_PSRLQI: - case IX86_BUILTIN_PSRLQI128: - case IX86_BUILTIN_PSRLQI128_MASK: - case IX86_BUILTIN_PSRLQI256: - case IX86_BUILTIN_PSRLQI256_MASK: - case IX86_BUILTIN_PSRLQI512: - case IX86_BUILTIN_PSRLW: - case IX86_BUILTIN_PSRLW128: - case IX86_BUILTIN_PSRLW128_MASK: - case IX86_BUILTIN_PSRLW256: - case IX86_BUILTIN_PSRLW256_MASK: - case IX86_BUILTIN_PSRLW512: - case IX86_BUILTIN_PSRLWI: - case IX86_BUILTIN_PSRLWI128: - case IX86_BUILTIN_PSRLWI128_MASK: - case IX86_BUILTIN_PSRLWI256: - case IX86_BUILTIN_PSRLWI256_MASK: - case IX86_BUILTIN_PSRLWI512: - rcode = LSHIFTRT; - is_vshift = false; - goto do_shift; - case IX86_BUILTIN_PSLLVV16HI: - case IX86_BUILTIN_PSLLVV16SI: - case IX86_BUILTIN_PSLLVV2DI: - case IX86_BUILTIN_PSLLVV2DI_MASK: - case IX86_BUILTIN_PSLLVV32HI: - case IX86_BUILTIN_PSLLVV4DI: - case IX86_BUILTIN_PSLLVV4DI_MASK: - case IX86_BUILTIN_PSLLVV4SI: - case IX86_BUILTIN_PSLLVV4SI_MASK: - case IX86_BUILTIN_PSLLVV8DI: - case IX86_BUILTIN_PSLLVV8HI: - case IX86_BUILTIN_PSLLVV8SI: - case IX86_BUILTIN_PSLLVV8SI_MASK: - rcode = ASHIFT; - is_vshift = true; - goto do_shift; - case IX86_BUILTIN_PSRAVQ128: - case IX86_BUILTIN_PSRAVQ256: - case IX86_BUILTIN_PSRAVV16HI: - case IX86_BUILTIN_PSRAVV16SI: - case IX86_BUILTIN_PSRAVV32HI: - case IX86_BUILTIN_PSRAVV4SI: - case IX86_BUILTIN_PSRAVV4SI_MASK: - case IX86_BUILTIN_PSRAVV8DI: - case IX86_BUILTIN_PSRAVV8HI: - case IX86_BUILTIN_PSRAVV8SI: - case IX86_BUILTIN_PSRAVV8SI_MASK: - rcode = ASHIFTRT; - is_vshift = true; - goto do_shift; - case IX86_BUILTIN_PSRLVV16HI: - case IX86_BUILTIN_PSRLVV16SI: - case IX86_BUILTIN_PSRLVV2DI: - case IX86_BUILTIN_PSRLVV2DI_MASK: - case IX86_BUILTIN_PSRLVV32HI: - case IX86_BUILTIN_PSRLVV4DI: - case IX86_BUILTIN_PSRLVV4DI_MASK: - case IX86_BUILTIN_PSRLVV4SI: - case IX86_BUILTIN_PSRLVV4SI_MASK: - case IX86_BUILTIN_PSRLVV8DI: - case IX86_BUILTIN_PSRLVV8HI: - case IX86_BUILTIN_PSRLVV8SI: - case IX86_BUILTIN_PSRLVV8SI_MASK: - rcode = LSHIFTRT; - is_vshift = true; - goto do_shift; - - do_shift: - gcc_assert (n_args >= 2); - if (TREE_CODE (args[0]) != VECTOR_CST) - break; - mask = HOST_WIDE_INT_M1U; - if (n_args > 2) - { - /* This is masked shift. 
*/ - if (!tree_fits_uhwi_p (args[n_args - 1]) - || TREE_SIDE_EFFECTS (args[n_args - 2])) - break; - mask = tree_to_uhwi (args[n_args - 1]); - unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (args[0])); - mask |= HOST_WIDE_INT_M1U << elems; - if (mask != HOST_WIDE_INT_M1U - && TREE_CODE (args[n_args - 2]) != VECTOR_CST) - break; - if (mask == (HOST_WIDE_INT_M1U << elems)) - return args[n_args - 2]; - } - if (is_vshift && TREE_CODE (args[1]) != VECTOR_CST) - break; - if (tree tem = (is_vshift ? integer_one_node - : ix86_vector_shift_count (args[1]))) - { - unsigned HOST_WIDE_INT count = tree_to_uhwi (tem); - unsigned HOST_WIDE_INT prec - = TYPE_PRECISION (TREE_TYPE (TREE_TYPE (args[0]))); - if (count == 0 && mask == HOST_WIDE_INT_M1U) - return args[0]; - if (count >= prec) - { - if (rcode == ASHIFTRT) - count = prec - 1; - else if (mask == HOST_WIDE_INT_M1U) - return build_zero_cst (TREE_TYPE (args[0])); - } - tree countt = NULL_TREE; - if (!is_vshift) - { - if (count >= prec) - countt = integer_zero_node; - else - countt = build_int_cst (integer_type_node, count); - } - tree_vector_builder builder; - builder.new_unary_operation (TREE_TYPE (args[0]), args[0], - false); - unsigned int cnt = builder.encoded_nelts (); - for (unsigned int i = 0; i < cnt; ++i) - { - tree elt = VECTOR_CST_ELT (args[0], i); - if (TREE_CODE (elt) != INTEGER_CST || TREE_OVERFLOW (elt)) - return NULL_TREE; - tree type = TREE_TYPE (elt); - if (rcode == LSHIFTRT) - elt = fold_convert (unsigned_type_for (type), elt); - if (is_vshift) - { - countt = VECTOR_CST_ELT (args[1], i); - if (TREE_CODE (countt) != INTEGER_CST - || TREE_OVERFLOW (countt)) - return NULL_TREE; - if (wi::neg_p (wi::to_wide (countt)) - || wi::to_widest (countt) >= prec) - { - if (rcode == ASHIFTRT) - countt = build_int_cst (TREE_TYPE (countt), - prec - 1); - else - { - elt = build_zero_cst (TREE_TYPE (elt)); - countt = build_zero_cst (TREE_TYPE (countt)); - } - } - } - else if (count >= prec) - elt = build_zero_cst (TREE_TYPE (elt)); - elt = const_binop (rcode == ASHIFT - ? LSHIFT_EXPR : RSHIFT_EXPR, - TREE_TYPE (elt), elt, countt); - if (!elt || TREE_CODE (elt) != INTEGER_CST) - return NULL_TREE; - if (rcode == LSHIFTRT) - elt = fold_convert (type, elt); - if ((mask & (HOST_WIDE_INT_1U << i)) == 0) - { - elt = VECTOR_CST_ELT (args[n_args - 2], i); - if (TREE_CODE (elt) != INTEGER_CST - || TREE_OVERFLOW (elt)) - return NULL_TREE; - } - builder.quick_push (elt); - } - return builder.build (); - } - break; - - default: - break; - } - } - -#ifdef SUBTARGET_FOLD_BUILTIN - return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore); -#endif - - return NULL_TREE; -} - -/* Fold a MD builtin (use ix86_fold_builtin for folding into - constant) in GIMPLE. 
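The per-element folding above treats out-of-range shift counts the way the instructions do: logical shifts by a count at or beyond the element precision yield zero, while arithmetic right shifts behave as a shift by precision - 1 and so replicate the sign bit. A minimal sketch for 32-bit elements (function names are illustrative only; >> on a negative int is implementation-defined in ISO C but arithmetic in GCC, matching PSRAD):

  static int
  psrad_element (int elt, unsigned int count)
  {
    if (count >= 32)            /* count saturates at precision - 1 */
      count = 31;
    return elt >> count;
  }

  static unsigned int
  psrld_element (unsigned int elt, unsigned int count)
  {
    return count >= 32 ? 0 : elt >> count;   /* logical shifts go to zero */
  }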
*/ - -bool -ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) -{ - gimple *stmt = gsi_stmt (*gsi); - tree fndecl = gimple_call_fndecl (stmt); - gcc_checking_assert (fndecl && fndecl_built_in_p (fndecl, BUILT_IN_MD)); - int n_args = gimple_call_num_args (stmt); - enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl); - tree decl = NULL_TREE; - tree arg0, arg1; - enum rtx_code rcode; - unsigned HOST_WIDE_INT count; - bool is_vshift; - - switch (fn_code) - { - case IX86_BUILTIN_TZCNT32: - decl = builtin_decl_implicit (BUILT_IN_CTZ); - goto fold_tzcnt_lzcnt; - - case IX86_BUILTIN_TZCNT64: - decl = builtin_decl_implicit (BUILT_IN_CTZLL); - goto fold_tzcnt_lzcnt; - - case IX86_BUILTIN_LZCNT32: - decl = builtin_decl_implicit (BUILT_IN_CLZ); - goto fold_tzcnt_lzcnt; - - case IX86_BUILTIN_LZCNT64: - decl = builtin_decl_implicit (BUILT_IN_CLZLL); - goto fold_tzcnt_lzcnt; - - fold_tzcnt_lzcnt: - gcc_assert (n_args == 1); - arg0 = gimple_call_arg (stmt, 0); - if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt)) - { - int prec = TYPE_PRECISION (TREE_TYPE (arg0)); - /* If arg0 is provably non-zero, optimize into generic - __builtin_c[tl]z{,ll} function the middle-end handles - better. */ - if (!expr_not_equal_to (arg0, wi::zero (prec))) - return false; - - location_t loc = gimple_location (stmt); - gimple *g = gimple_build_call (decl, 1, arg0); - gimple_set_location (g, loc); - tree lhs = make_ssa_name (integer_type_node); - gimple_call_set_lhs (g, lhs); - gsi_insert_before (gsi, g, GSI_SAME_STMT); - g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs); - gimple_set_location (g, loc); - gsi_replace (gsi, g, false); - return true; - } - break; - - case IX86_BUILTIN_BZHI32: - case IX86_BUILTIN_BZHI64: - gcc_assert (n_args == 2); - arg1 = gimple_call_arg (stmt, 1); - if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt)) - { - unsigned int idx = tree_to_uhwi (arg1) & 0xff; - arg0 = gimple_call_arg (stmt, 0); - if (idx < TYPE_PRECISION (TREE_TYPE (arg0))) - break; - location_t loc = gimple_location (stmt); - gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0); - gimple_set_location (g, loc); - gsi_replace (gsi, g, false); - return true; - } - break; - - case IX86_BUILTIN_PDEP32: - case IX86_BUILTIN_PDEP64: - case IX86_BUILTIN_PEXT32: - case IX86_BUILTIN_PEXT64: - gcc_assert (n_args == 2); - arg1 = gimple_call_arg (stmt, 1); - if (integer_all_onesp (arg1) && gimple_call_lhs (stmt)) - { - location_t loc = gimple_location (stmt); - arg0 = gimple_call_arg (stmt, 0); - gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0); - gimple_set_location (g, loc); - gsi_replace (gsi, g, false); - return true; - } - break; - - case IX86_BUILTIN_PSLLD: - case IX86_BUILTIN_PSLLD128: - case IX86_BUILTIN_PSLLD128_MASK: - case IX86_BUILTIN_PSLLD256: - case IX86_BUILTIN_PSLLD256_MASK: - case IX86_BUILTIN_PSLLD512: - case IX86_BUILTIN_PSLLDI: - case IX86_BUILTIN_PSLLDI128: - case IX86_BUILTIN_PSLLDI128_MASK: - case IX86_BUILTIN_PSLLDI256: - case IX86_BUILTIN_PSLLDI256_MASK: - case IX86_BUILTIN_PSLLDI512: - case IX86_BUILTIN_PSLLQ: - case IX86_BUILTIN_PSLLQ128: - case IX86_BUILTIN_PSLLQ128_MASK: - case IX86_BUILTIN_PSLLQ256: - case IX86_BUILTIN_PSLLQ256_MASK: - case IX86_BUILTIN_PSLLQ512: - case IX86_BUILTIN_PSLLQI: - case IX86_BUILTIN_PSLLQI128: - case IX86_BUILTIN_PSLLQI128_MASK: - case IX86_BUILTIN_PSLLQI256: - case IX86_BUILTIN_PSLLQI256_MASK: - case IX86_BUILTIN_PSLLQI512: - case IX86_BUILTIN_PSLLW: - case IX86_BUILTIN_PSLLW128: - case 
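The GIMPLE-level folds above only fire under value-range guarantees: TZCNT/LZCNT are rewritten to the generic __builtin_ctz/__builtin_clz family only when the argument is provably non-zero, and PDEP/PEXT collapse to their first argument when the mask is all ones. A hedged sketch of code that benefits (the __builtin_ia32_tzcnt_u32 spelling of the target builtin and the -mbmi requirement are assumed here; the function name is illustrative):

  unsigned int
  first_set_bit (unsigned int x)
  {
    if (x == 0)
      return 32;
    /* In this branch x != 0 is known, so the call below may be folded to
       __builtin_ctz (x), which the middle-end optimizes better.  */
    return __builtin_ia32_tzcnt_u32 (x);
  }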
IX86_BUILTIN_PSLLW128_MASK: - case IX86_BUILTIN_PSLLW256: - case IX86_BUILTIN_PSLLW256_MASK: - case IX86_BUILTIN_PSLLW512_MASK: - case IX86_BUILTIN_PSLLWI: - case IX86_BUILTIN_PSLLWI128: - case IX86_BUILTIN_PSLLWI128_MASK: - case IX86_BUILTIN_PSLLWI256: - case IX86_BUILTIN_PSLLWI256_MASK: - case IX86_BUILTIN_PSLLWI512_MASK: - rcode = ASHIFT; - is_vshift = false; - goto do_shift; - case IX86_BUILTIN_PSRAD: - case IX86_BUILTIN_PSRAD128: - case IX86_BUILTIN_PSRAD128_MASK: - case IX86_BUILTIN_PSRAD256: - case IX86_BUILTIN_PSRAD256_MASK: - case IX86_BUILTIN_PSRAD512: - case IX86_BUILTIN_PSRADI: - case IX86_BUILTIN_PSRADI128: - case IX86_BUILTIN_PSRADI128_MASK: - case IX86_BUILTIN_PSRADI256: - case IX86_BUILTIN_PSRADI256_MASK: - case IX86_BUILTIN_PSRADI512: - case IX86_BUILTIN_PSRAQ128_MASK: - case IX86_BUILTIN_PSRAQ256_MASK: - case IX86_BUILTIN_PSRAQ512: - case IX86_BUILTIN_PSRAQI128_MASK: - case IX86_BUILTIN_PSRAQI256_MASK: - case IX86_BUILTIN_PSRAQI512: - case IX86_BUILTIN_PSRAW: - case IX86_BUILTIN_PSRAW128: - case IX86_BUILTIN_PSRAW128_MASK: - case IX86_BUILTIN_PSRAW256: - case IX86_BUILTIN_PSRAW256_MASK: - case IX86_BUILTIN_PSRAW512: - case IX86_BUILTIN_PSRAWI: - case IX86_BUILTIN_PSRAWI128: - case IX86_BUILTIN_PSRAWI128_MASK: - case IX86_BUILTIN_PSRAWI256: - case IX86_BUILTIN_PSRAWI256_MASK: - case IX86_BUILTIN_PSRAWI512: - rcode = ASHIFTRT; - is_vshift = false; - goto do_shift; - case IX86_BUILTIN_PSRLD: - case IX86_BUILTIN_PSRLD128: - case IX86_BUILTIN_PSRLD128_MASK: - case IX86_BUILTIN_PSRLD256: - case IX86_BUILTIN_PSRLD256_MASK: - case IX86_BUILTIN_PSRLD512: - case IX86_BUILTIN_PSRLDI: - case IX86_BUILTIN_PSRLDI128: - case IX86_BUILTIN_PSRLDI128_MASK: - case IX86_BUILTIN_PSRLDI256: - case IX86_BUILTIN_PSRLDI256_MASK: - case IX86_BUILTIN_PSRLDI512: - case IX86_BUILTIN_PSRLQ: - case IX86_BUILTIN_PSRLQ128: - case IX86_BUILTIN_PSRLQ128_MASK: - case IX86_BUILTIN_PSRLQ256: - case IX86_BUILTIN_PSRLQ256_MASK: - case IX86_BUILTIN_PSRLQ512: - case IX86_BUILTIN_PSRLQI: - case IX86_BUILTIN_PSRLQI128: - case IX86_BUILTIN_PSRLQI128_MASK: - case IX86_BUILTIN_PSRLQI256: - case IX86_BUILTIN_PSRLQI256_MASK: - case IX86_BUILTIN_PSRLQI512: - case IX86_BUILTIN_PSRLW: - case IX86_BUILTIN_PSRLW128: - case IX86_BUILTIN_PSRLW128_MASK: - case IX86_BUILTIN_PSRLW256: - case IX86_BUILTIN_PSRLW256_MASK: - case IX86_BUILTIN_PSRLW512: - case IX86_BUILTIN_PSRLWI: - case IX86_BUILTIN_PSRLWI128: - case IX86_BUILTIN_PSRLWI128_MASK: - case IX86_BUILTIN_PSRLWI256: - case IX86_BUILTIN_PSRLWI256_MASK: - case IX86_BUILTIN_PSRLWI512: - rcode = LSHIFTRT; - is_vshift = false; - goto do_shift; - case IX86_BUILTIN_PSLLVV16HI: - case IX86_BUILTIN_PSLLVV16SI: - case IX86_BUILTIN_PSLLVV2DI: - case IX86_BUILTIN_PSLLVV2DI_MASK: - case IX86_BUILTIN_PSLLVV32HI: - case IX86_BUILTIN_PSLLVV4DI: - case IX86_BUILTIN_PSLLVV4DI_MASK: - case IX86_BUILTIN_PSLLVV4SI: - case IX86_BUILTIN_PSLLVV4SI_MASK: - case IX86_BUILTIN_PSLLVV8DI: - case IX86_BUILTIN_PSLLVV8HI: - case IX86_BUILTIN_PSLLVV8SI: - case IX86_BUILTIN_PSLLVV8SI_MASK: - rcode = ASHIFT; - is_vshift = true; - goto do_shift; - case IX86_BUILTIN_PSRAVQ128: - case IX86_BUILTIN_PSRAVQ256: - case IX86_BUILTIN_PSRAVV16HI: - case IX86_BUILTIN_PSRAVV16SI: - case IX86_BUILTIN_PSRAVV32HI: - case IX86_BUILTIN_PSRAVV4SI: - case IX86_BUILTIN_PSRAVV4SI_MASK: - case IX86_BUILTIN_PSRAVV8DI: - case IX86_BUILTIN_PSRAVV8HI: - case IX86_BUILTIN_PSRAVV8SI: - case IX86_BUILTIN_PSRAVV8SI_MASK: - rcode = ASHIFTRT; - is_vshift = true; - goto do_shift; - case IX86_BUILTIN_PSRLVV16HI: - case 
IX86_BUILTIN_PSRLVV16SI: - case IX86_BUILTIN_PSRLVV2DI: - case IX86_BUILTIN_PSRLVV2DI_MASK: - case IX86_BUILTIN_PSRLVV32HI: - case IX86_BUILTIN_PSRLVV4DI: - case IX86_BUILTIN_PSRLVV4DI_MASK: - case IX86_BUILTIN_PSRLVV4SI: - case IX86_BUILTIN_PSRLVV4SI_MASK: - case IX86_BUILTIN_PSRLVV8DI: - case IX86_BUILTIN_PSRLVV8HI: - case IX86_BUILTIN_PSRLVV8SI: - case IX86_BUILTIN_PSRLVV8SI_MASK: - rcode = LSHIFTRT; - is_vshift = true; - goto do_shift; - - do_shift: - gcc_assert (n_args >= 2); - arg0 = gimple_call_arg (stmt, 0); - arg1 = gimple_call_arg (stmt, 1); - if (n_args > 2) - { - /* This is masked shift. Only optimize if the mask is all ones. */ - tree argl = gimple_call_arg (stmt, n_args - 1); - if (!tree_fits_uhwi_p (argl)) - break; - unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl); - unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); - if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U) - break; - } - if (is_vshift) - { - if (TREE_CODE (arg1) != VECTOR_CST) - break; - count = TYPE_PRECISION (TREE_TYPE (TREE_TYPE (arg0))); - if (integer_zerop (arg1)) - count = 0; - else if (rcode == ASHIFTRT) - break; - else - for (unsigned int i = 0; i < VECTOR_CST_NELTS (arg1); ++i) - { - tree elt = VECTOR_CST_ELT (arg1, i); - if (!wi::neg_p (wi::to_wide (elt)) - && wi::to_widest (elt) < count) - return false; - } - } - else - { - arg1 = ix86_vector_shift_count (arg1); - if (!arg1) - break; - count = tree_to_uhwi (arg1); - } - if (count == 0) - { - /* Just return the first argument for shift by 0. */ - location_t loc = gimple_location (stmt); - gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0); - gimple_set_location (g, loc); - gsi_replace (gsi, g, false); - return true; - } - if (rcode != ASHIFTRT - && count >= TYPE_PRECISION (TREE_TYPE (TREE_TYPE (arg0)))) - { - /* For shift counts equal or greater than precision, except for - arithmetic right shift the result is zero. */ - location_t loc = gimple_location (stmt); - gimple *g = gimple_build_assign (gimple_call_lhs (stmt), - build_zero_cst (TREE_TYPE (arg0))); - gimple_set_location (g, loc); - gsi_replace (gsi, g, false); - return true; - } - break; - - default: - break; - } - - return false; -} - -/* Make builtins to detect cpu type and features supported. NAME is - the builtin name, CODE is the builtin code, and FTYPE is the function - type of the builtin. */ - -static void -make_cpu_type_builtin (const char* name, int code, - enum ix86_builtin_func_type ftype, bool is_const) -{ - tree decl; - tree type; - - type = ix86_get_builtin_func_type (ftype); - decl = add_builtin_function (name, type, code, BUILT_IN_MD, - NULL, NULL_TREE); - gcc_assert (decl != NULL_TREE); - ix86_builtins[(int) code] = decl; - TREE_READONLY (decl) = is_const; -} - -/* Make builtins to get CPU type and features supported. The created - builtins are : - - __builtin_cpu_init (), to detect cpu type and features, - __builtin_cpu_is (""), to check if cpu is of type , - __builtin_cpu_supports (""), to check if cpu supports - */ - -static void -ix86_init_platform_type_builtins (void) -{ - make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT, - INT_FTYPE_VOID, false); - make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS, - INT_FTYPE_PCCHAR, true); - make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS, - INT_FTYPE_PCCHAR, true); -} - -/* Internal method for ix86_init_builtins. 
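The three platform builtins registered above are the public __builtin_cpu_init, __builtin_cpu_is and __builtin_cpu_supports interface. A small usage example (illustrative, not part of the patch):

  #include <stdio.h>

  int
  main (void)
  {
    __builtin_cpu_init ();
    if (__builtin_cpu_is ("intel"))
      printf ("running on an Intel CPU\n");
    if (__builtin_cpu_supports ("avx2"))
      printf ("AVX2 is available\n");
    return 0;
  }

Note that __builtin_cpu_is and __builtin_cpu_supports are registered read-only (TREE_READONLY), so repeated checks can be commoned, while __builtin_cpu_init is not.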
*/ - -static void -ix86_init_builtins_va_builtins_abi (void) -{ - tree ms_va_ref, sysv_va_ref; - tree fnvoid_va_end_ms, fnvoid_va_end_sysv; - tree fnvoid_va_start_ms, fnvoid_va_start_sysv; - tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv; - tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE; - - if (!TARGET_64BIT) - return; - fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE); - fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE); - ms_va_ref = build_reference_type (ms_va_list_type_node); - sysv_va_ref = build_pointer_type (TREE_TYPE (sysv_va_list_type_node)); - - fnvoid_va_end_ms = build_function_type_list (void_type_node, ms_va_ref, - NULL_TREE); - fnvoid_va_start_ms - = build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE); - fnvoid_va_end_sysv - = build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE); - fnvoid_va_start_sysv - = build_varargs_function_type_list (void_type_node, sysv_va_ref, - NULL_TREE); - fnvoid_va_copy_ms - = build_function_type_list (void_type_node, ms_va_ref, - ms_va_list_type_node, NULL_TREE); - fnvoid_va_copy_sysv - = build_function_type_list (void_type_node, sysv_va_ref, - sysv_va_ref, NULL_TREE); - - add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms, - BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms); - add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms, - BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms); - add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms, - BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms); - add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv, - BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv); - add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv, - BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv); - add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv, - BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv); -} - -static void -ix86_init_builtin_types (void) -{ - tree float80_type_node, const_string_type_node; - - /* The __float80 type. */ - float80_type_node = long_double_type_node; - if (TYPE_MODE (float80_type_node) != XFmode) - { - if (float64x_type_node != NULL_TREE - && TYPE_MODE (float64x_type_node) == XFmode) - float80_type_node = float64x_type_node; - else - { - /* The __float80 type. */ - float80_type_node = make_node (REAL_TYPE); - - TYPE_PRECISION (float80_type_node) = 80; - layout_type (float80_type_node); - } - } - lang_hooks.types.register_builtin_type (float80_type_node, "__float80"); - - /* The __float128 type. The node has already been created as - _Float128, so we only need to register the __float128 name for - it. */ - lang_hooks.types.register_builtin_type (float128_type_node, "__float128"); - - const_string_type_node - = build_pointer_type (build_qualified_type - (char_type_node, TYPE_QUAL_CONST)); - - /* This macro is built by i386-builtin-types.awk. */ - DEFINE_BUILTIN_PRIMITIVE_TYPES; -} - -static void -ix86_init_builtins (void) -{ - tree ftype, decl; - - ix86_init_builtin_types (); - - /* Builtins to get CPU type and features. */ - ix86_init_platform_type_builtins (); - - /* TFmode support builtins. 
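ix86_init_builtin_types above is what makes the __float80 and __float128 spellings usable from C on x86. A short example (illustrative, not part of the patch; the w and q literal suffixes are the documented GCC extensions for these types):

  #include <stdio.h>

  int
  main (void)
  {
    __float80  e = 1.0w / 3.0w;   /* 80-bit x87 extended precision */
    __float128 q = 1.0q / 3.0q;   /* 128-bit IEEE quad */

    printf ("%.20Lg\n", (long double) e);
    printf ("%.20g\n", (double) q);
    return 0;
  }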
*/ - def_builtin_const (0, 0, "__builtin_infq", - FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ); - def_builtin_const (0, 0, "__builtin_huge_valq", - FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ); - - ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING); - decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ, - BUILT_IN_MD, "nanq", NULL_TREE); - TREE_READONLY (decl) = 1; - ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl; - - decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ, - BUILT_IN_MD, "nansq", NULL_TREE); - TREE_READONLY (decl) = 1; - ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl; - - /* We will expand them to normal call if SSE isn't available since - they are used by libgcc. */ - ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128); - decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ, - BUILT_IN_MD, "__fabstf2", NULL_TREE); - TREE_READONLY (decl) = 1; - ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl; - - ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128); - decl = add_builtin_function ("__builtin_copysignq", ftype, - IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD, - "__copysigntf3", NULL_TREE); - TREE_READONLY (decl) = 1; - ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl; - - ix86_init_tm_builtins (); - ix86_init_mmx_sse_builtins (); - - if (TARGET_LP64) - ix86_init_builtins_va_builtins_abi (); - -#ifdef SUBTARGET_INIT_BUILTINS - SUBTARGET_INIT_BUILTINS; -#endif -} - -/* Return the ix86 builtin for CODE. */ - -static tree -ix86_builtin_decl (unsigned code, bool) -{ - if (code >= IX86_BUILTIN_MAX) - return error_mark_node; - - return ix86_builtins[code]; -} - -/* Errors in the source file can cause expand_expr to return const0_rtx - where we expect a vector. To avoid crashing, use one of the vector - clear instructions. */ -static rtx -safe_vector_operand (rtx x, machine_mode mode) -{ - if (x == const0_rtx) - x = CONST0_RTX (mode); - return x; -} - -/* Fixup modeless constants to fit required mode. */ -static rtx -fixup_modeless_constant (rtx x, machine_mode mode) -{ - if (GET_MODE (x) == VOIDmode) - x = convert_to_mode (mode, x, 1); - return x; -} - -/* Subroutine of ix86_expand_builtin to take care of binop insns. */ - -static rtx -ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - machine_mode tmode = insn_data[icode].operand[0].mode; - machine_mode mode0 = insn_data[icode].operand[1].mode; - machine_mode mode1 = insn_data[icode].operand[2].mode; - - if (VECTOR_MODE_P (mode0)) - op0 = safe_vector_operand (op0, mode0); - if (VECTOR_MODE_P (mode1)) - op1 = safe_vector_operand (op1, mode1); - - if (optimize || !target - || GET_MODE (target) != tmode - || !insn_data[icode].operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - - if (GET_MODE (op1) == SImode && mode1 == TImode) - { - rtx x = gen_reg_rtx (V4SImode); - emit_insn (gen_sse2_loadd (x, op1)); - op1 = gen_lowpart (TImode, x); - } - - if (!insn_data[icode].operand[1].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if (!insn_data[icode].operand[2].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - pat = GEN_FCN (icode) (target, op0, op1); - if (! pat) - return 0; - - emit_insn (pat); - - return target; -} - -/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. 
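The TFmode builtins registered above can be called directly from C; the patch notes that __builtin_fabsq and __builtin_copysignq in particular are used by libgcc. A small illustrative example (not part of the patch):

  #include <stdio.h>

  int
  main (void)
  {
    __float128 inf  = __builtin_infq ();
    __float128 qnan = __builtin_nanq ("");
    __float128 mag  = __builtin_fabsq (-2.5q);
    __float128 sgn  = __builtin_copysignq (3.0q, -1.0q);   /* -3.0 */

    printf ("%d %d %g %g\n",
            (double) inf > 1e300, qnan != qnan, (double) mag, (double) sgn);
    return 0;
  }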
*/ - -static rtx -ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target, - enum ix86_builtin_func_type m_type, - enum rtx_code sub_code) -{ - rtx pat; - int i; - int nargs; - bool comparison_p = false; - bool tf_p = false; - bool last_arg_constant = false; - int num_memory = 0; - struct { - rtx op; - machine_mode mode; - } args[4]; - - machine_mode tmode = insn_data[icode].operand[0].mode; - - switch (m_type) - { - case MULTI_ARG_4_DF2_DI_I: - case MULTI_ARG_4_DF2_DI_I1: - case MULTI_ARG_4_SF2_SI_I: - case MULTI_ARG_4_SF2_SI_I1: - nargs = 4; - last_arg_constant = true; - break; - - case MULTI_ARG_3_SF: - case MULTI_ARG_3_DF: - case MULTI_ARG_3_SF2: - case MULTI_ARG_3_DF2: - case MULTI_ARG_3_DI: - case MULTI_ARG_3_SI: - case MULTI_ARG_3_SI_DI: - case MULTI_ARG_3_HI: - case MULTI_ARG_3_HI_SI: - case MULTI_ARG_3_QI: - case MULTI_ARG_3_DI2: - case MULTI_ARG_3_SI2: - case MULTI_ARG_3_HI2: - case MULTI_ARG_3_QI2: - nargs = 3; - break; - - case MULTI_ARG_2_SF: - case MULTI_ARG_2_DF: - case MULTI_ARG_2_DI: - case MULTI_ARG_2_SI: - case MULTI_ARG_2_HI: - case MULTI_ARG_2_QI: - nargs = 2; - break; - - case MULTI_ARG_2_DI_IMM: - case MULTI_ARG_2_SI_IMM: - case MULTI_ARG_2_HI_IMM: - case MULTI_ARG_2_QI_IMM: - nargs = 2; - last_arg_constant = true; - break; - - case MULTI_ARG_1_SF: - case MULTI_ARG_1_DF: - case MULTI_ARG_1_SF2: - case MULTI_ARG_1_DF2: - case MULTI_ARG_1_DI: - case MULTI_ARG_1_SI: - case MULTI_ARG_1_HI: - case MULTI_ARG_1_QI: - case MULTI_ARG_1_SI_DI: - case MULTI_ARG_1_HI_DI: - case MULTI_ARG_1_HI_SI: - case MULTI_ARG_1_QI_DI: - case MULTI_ARG_1_QI_SI: - case MULTI_ARG_1_QI_HI: - nargs = 1; - break; - - case MULTI_ARG_2_DI_CMP: - case MULTI_ARG_2_SI_CMP: - case MULTI_ARG_2_HI_CMP: - case MULTI_ARG_2_QI_CMP: - nargs = 2; - comparison_p = true; - break; - - case MULTI_ARG_2_SF_TF: - case MULTI_ARG_2_DF_TF: - case MULTI_ARG_2_DI_TF: - case MULTI_ARG_2_SI_TF: - case MULTI_ARG_2_HI_TF: - case MULTI_ARG_2_QI_TF: - nargs = 2; - tf_p = true; - break; - - default: - gcc_unreachable (); - } - - if (optimize || !target - || GET_MODE (target) != tmode - || !insn_data[icode].operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - else if (memory_operand (target, tmode)) - num_memory++; - - gcc_assert (nargs <= 4); - - for (i = 0; i < nargs; i++) - { - tree arg = CALL_EXPR_ARG (exp, i); - rtx op = expand_normal (arg); - int adjust = (comparison_p) ? 
1 : 0; - machine_mode mode = insn_data[icode].operand[i+adjust+1].mode; - - if (last_arg_constant && i == nargs - 1) - { - if (!insn_data[icode].operand[i + 1].predicate (op, mode)) - { - enum insn_code new_icode = icode; - switch (icode) - { - case CODE_FOR_xop_vpermil2v2df3: - case CODE_FOR_xop_vpermil2v4sf3: - case CODE_FOR_xop_vpermil2v4df3: - case CODE_FOR_xop_vpermil2v8sf3: - error ("the last argument must be a 2-bit immediate"); - return gen_reg_rtx (tmode); - case CODE_FOR_xop_rotlv2di3: - new_icode = CODE_FOR_rotlv2di3; - goto xop_rotl; - case CODE_FOR_xop_rotlv4si3: - new_icode = CODE_FOR_rotlv4si3; - goto xop_rotl; - case CODE_FOR_xop_rotlv8hi3: - new_icode = CODE_FOR_rotlv8hi3; - goto xop_rotl; - case CODE_FOR_xop_rotlv16qi3: - new_icode = CODE_FOR_rotlv16qi3; - xop_rotl: - if (CONST_INT_P (op)) - { - int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1; - op = GEN_INT (INTVAL (op) & mask); - gcc_checking_assert - (insn_data[icode].operand[i + 1].predicate (op, mode)); - } - else - { - gcc_checking_assert - (nargs == 2 - && insn_data[new_icode].operand[0].mode == tmode - && insn_data[new_icode].operand[1].mode == tmode - && insn_data[new_icode].operand[2].mode == mode - && insn_data[new_icode].operand[0].predicate - == insn_data[icode].operand[0].predicate - && insn_data[new_icode].operand[1].predicate - == insn_data[icode].operand[1].predicate); - icode = new_icode; - goto non_constant; - } - break; - default: - gcc_unreachable (); - } - } - } - else - { - non_constant: - if (VECTOR_MODE_P (mode)) - op = safe_vector_operand (op, mode); - - /* If we aren't optimizing, only allow one memory operand to be - generated. */ - if (memory_operand (op, mode)) - num_memory++; - - gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode); - - if (optimize - || !insn_data[icode].operand[i+adjust+1].predicate (op, mode) - || num_memory > 1) - op = force_reg (mode, op); - } - - args[i].op = op; - args[i].mode = mode; - } - - switch (nargs) - { - case 1: - pat = GEN_FCN (icode) (target, args[0].op); - break; - - case 2: - if (tf_p) - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, - GEN_INT ((int)sub_code)); - else if (! comparison_p) - pat = GEN_FCN (icode) (target, args[0].op, args[1].op); - else - { - rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target), - args[0].op, - args[1].op); - - pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op); - } - break; - - case 3: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op); - break; - - case 4: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op); - break; - - default: - gcc_unreachable (); - } - - if (! pat) - return 0; - - emit_insn (pat); - return target; -} - -/* Subroutine of ix86_expand_args_builtin to take care of scalar unop - insns with vec_merge. 
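For the XOP rotate builtins handled above, an out-of-range constant count is not rejected but reduced to the equivalent in-range count: a rotate by the full element width is a no-op, so the count can be taken modulo the width. A one-line model (illustrative only; the function name is not from the patch):

  /* E.g. for 32-bit elements a count of 35 becomes 35 & 31 == 3.  */
  static int
  xop_rotl_count (int imm, unsigned int unit_bits)
  {
    return imm & (unit_bits - 1);
  }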
*/ - -static rtx -ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp, - rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - rtx op1, op0 = expand_normal (arg0); - machine_mode tmode = insn_data[icode].operand[0].mode; - machine_mode mode0 = insn_data[icode].operand[1].mode; - - if (optimize || !target - || GET_MODE (target) != tmode - || !insn_data[icode].operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - - if (VECTOR_MODE_P (mode0)) - op0 = safe_vector_operand (op0, mode0); - - if ((optimize && !register_operand (op0, mode0)) - || !insn_data[icode].operand[1].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - - op1 = op0; - if (!insn_data[icode].operand[2].predicate (op1, mode0)) - op1 = copy_to_mode_reg (mode0, op1); - - pat = GEN_FCN (icode) (target, op0, op1); - if (! pat) - return 0; - emit_insn (pat); - return target; -} - -/* Subroutine of ix86_expand_builtin to take care of comparison insns. */ - -static rtx -ix86_expand_sse_compare (const struct builtin_description *d, - tree exp, rtx target, bool swap) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - rtx op2; - machine_mode tmode = insn_data[d->icode].operand[0].mode; - machine_mode mode0 = insn_data[d->icode].operand[1].mode; - machine_mode mode1 = insn_data[d->icode].operand[2].mode; - enum rtx_code comparison = d->comparison; - - if (VECTOR_MODE_P (mode0)) - op0 = safe_vector_operand (op0, mode0); - if (VECTOR_MODE_P (mode1)) - op1 = safe_vector_operand (op1, mode1); - - /* Swap operands if we have a comparison that isn't available in - hardware. */ - if (swap) - std::swap (op0, op1); - - if (optimize || !target - || GET_MODE (target) != tmode - || !insn_data[d->icode].operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - - if ((optimize && !register_operand (op0, mode0)) - || !insn_data[d->icode].operand[1].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if ((optimize && !register_operand (op1, mode1)) - || !insn_data[d->icode].operand[2].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1); - pat = GEN_FCN (d->icode) (target, op0, op1, op2); - if (! pat) - return 0; - emit_insn (pat); - return target; -} - -/* Subroutine of ix86_expand_builtin to take care of comi insns. */ - -static rtx -ix86_expand_sse_comi (const struct builtin_description *d, tree exp, - rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - machine_mode mode0 = insn_data[d->icode].operand[0].mode; - machine_mode mode1 = insn_data[d->icode].operand[1].mode; - enum rtx_code comparison = d->comparison; - - if (VECTOR_MODE_P (mode0)) - op0 = safe_vector_operand (op0, mode0); - if (VECTOR_MODE_P (mode1)) - op1 = safe_vector_operand (op1, mode1); - - /* Swap operands if we have a comparison that isn't available in - hardware. 
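ix86_expand_sse_compare above swaps the operands when the requested predicate has no direct encoding; legacy SSE compares only provide the "less" forms, so a greater-than intrinsic is typically emitted as the swapped less-than. A hedged example (illustrative, not part of the patch):

  #include <xmmintrin.h>

  __m128
  cmp_gt (__m128 a, __m128 b)
  {
    /* Commonly lowered to CMPLTPS with the operands reversed.  */
    return _mm_cmpgt_ps (a, b);
  }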
*/ - if (d->flag & BUILTIN_DESC_SWAP_OPERANDS) - std::swap (op0, op1); - - target = gen_reg_rtx (SImode); - emit_move_insn (target, const0_rtx); - target = gen_rtx_SUBREG (QImode, target, 0); - - if ((optimize && !register_operand (op0, mode0)) - || !insn_data[d->icode].operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if ((optimize && !register_operand (op1, mode1)) - || !insn_data[d->icode].operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - pat = GEN_FCN (d->icode) (op0, op1); - if (! pat) - return 0; - emit_insn (pat); - emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), - gen_rtx_fmt_ee (comparison, QImode, - SET_DEST (pat), - const0_rtx))); - - return SUBREG_REG (target); -} - -/* Subroutines of ix86_expand_args_builtin to take care of round insns. */ - -static rtx -ix86_expand_sse_round (const struct builtin_description *d, tree exp, - rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - rtx op1, op0 = expand_normal (arg0); - machine_mode tmode = insn_data[d->icode].operand[0].mode; - machine_mode mode0 = insn_data[d->icode].operand[1].mode; - - if (optimize || target == 0 - || GET_MODE (target) != tmode - || !insn_data[d->icode].operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - - if (VECTOR_MODE_P (mode0)) - op0 = safe_vector_operand (op0, mode0); - - if ((optimize && !register_operand (op0, mode0)) - || !insn_data[d->icode].operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - - op1 = GEN_INT (d->comparison); - - pat = GEN_FCN (d->icode) (target, op0, op1); - if (! pat) - return 0; - emit_insn (pat); - return target; -} - -static rtx -ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d, - tree exp, rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - rtx op2; - machine_mode tmode = insn_data[d->icode].operand[0].mode; - machine_mode mode0 = insn_data[d->icode].operand[1].mode; - machine_mode mode1 = insn_data[d->icode].operand[2].mode; - - if (optimize || target == 0 - || GET_MODE (target) != tmode - || !insn_data[d->icode].operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - - op0 = safe_vector_operand (op0, mode0); - op1 = safe_vector_operand (op1, mode1); - - if ((optimize && !register_operand (op0, mode0)) - || !insn_data[d->icode].operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if ((optimize && !register_operand (op1, mode1)) - || !insn_data[d->icode].operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - op2 = GEN_INT (d->comparison); - - pat = GEN_FCN (d->icode) (target, op0, op1, op2); - if (! pat) - return 0; - emit_insn (pat); - return target; -} - -/* Subroutine of ix86_expand_builtin to take care of ptest insns. 
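The COMI expander above emits the compare and then materializes the flags result as a 0/1 value through a QImode subreg, which is what the scalar comparison intrinsics return. For instance (illustrative, not part of the patch):

  #include <xmmintrin.h>

  int
  scalar_eq (__m128 a, __m128 b)
  {
    return _mm_comieq_ss (a, b);   /* 1 if the low elements compare equal, else 0 */
  }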
*/ - -static rtx -ix86_expand_sse_ptest (const struct builtin_description *d, tree exp, - rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - machine_mode mode0 = insn_data[d->icode].operand[0].mode; - machine_mode mode1 = insn_data[d->icode].operand[1].mode; - enum rtx_code comparison = d->comparison; - - if (VECTOR_MODE_P (mode0)) - op0 = safe_vector_operand (op0, mode0); - if (VECTOR_MODE_P (mode1)) - op1 = safe_vector_operand (op1, mode1); - - target = gen_reg_rtx (SImode); - emit_move_insn (target, const0_rtx); - target = gen_rtx_SUBREG (QImode, target, 0); - - if ((optimize && !register_operand (op0, mode0)) - || !insn_data[d->icode].operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if ((optimize && !register_operand (op1, mode1)) - || !insn_data[d->icode].operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - pat = GEN_FCN (d->icode) (op0, op1); - if (! pat) - return 0; - emit_insn (pat); - emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), - gen_rtx_fmt_ee (comparison, QImode, - SET_DEST (pat), - const0_rtx))); - - return SUBREG_REG (target); -} - -/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */ - -static rtx -ix86_expand_sse_pcmpestr (const struct builtin_description *d, - tree exp, rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - tree arg2 = CALL_EXPR_ARG (exp, 2); - tree arg3 = CALL_EXPR_ARG (exp, 3); - tree arg4 = CALL_EXPR_ARG (exp, 4); - rtx scratch0, scratch1; - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - rtx op2 = expand_normal (arg2); - rtx op3 = expand_normal (arg3); - rtx op4 = expand_normal (arg4); - machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm; - - tmode0 = insn_data[d->icode].operand[0].mode; - tmode1 = insn_data[d->icode].operand[1].mode; - modev2 = insn_data[d->icode].operand[2].mode; - modei3 = insn_data[d->icode].operand[3].mode; - modev4 = insn_data[d->icode].operand[4].mode; - modei5 = insn_data[d->icode].operand[5].mode; - modeimm = insn_data[d->icode].operand[6].mode; - - if (VECTOR_MODE_P (modev2)) - op0 = safe_vector_operand (op0, modev2); - if (VECTOR_MODE_P (modev4)) - op2 = safe_vector_operand (op2, modev4); - - if (!insn_data[d->icode].operand[2].predicate (op0, modev2)) - op0 = copy_to_mode_reg (modev2, op0); - if (!insn_data[d->icode].operand[3].predicate (op1, modei3)) - op1 = copy_to_mode_reg (modei3, op1); - if ((optimize && !register_operand (op2, modev4)) - || !insn_data[d->icode].operand[4].predicate (op2, modev4)) - op2 = copy_to_mode_reg (modev4, op2); - if (!insn_data[d->icode].operand[5].predicate (op3, modei5)) - op3 = copy_to_mode_reg (modei5, op3); - - if (!insn_data[d->icode].operand[6].predicate (op4, modeimm)) - { - error ("the fifth argument must be an 8-bit immediate"); - return const0_rtx; - } - - if (d->code == IX86_BUILTIN_PCMPESTRI128) - { - if (optimize || !target - || GET_MODE (target) != tmode0 - || !insn_data[d->icode].operand[0].predicate (target, tmode0)) - target = gen_reg_rtx (tmode0); - - scratch1 = gen_reg_rtx (tmode1); - - pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4); - } - else if (d->code == IX86_BUILTIN_PCMPESTRM128) - { - if (optimize || !target - || GET_MODE (target) != tmode1 - || !insn_data[d->icode].operand[1].predicate (target, tmode1)) - target = gen_reg_rtx (tmode1); - - 
scratch0 = gen_reg_rtx (tmode0); - - pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4); - } - else - { - gcc_assert (d->flag); - - scratch0 = gen_reg_rtx (tmode0); - scratch1 = gen_reg_rtx (tmode1); - - pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4); - } - - if (! pat) - return 0; - - emit_insn (pat); - - if (d->flag) - { - target = gen_reg_rtx (SImode); - emit_move_insn (target, const0_rtx); - target = gen_rtx_SUBREG (QImode, target, 0); - - emit_insn - (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), - gen_rtx_fmt_ee (EQ, QImode, - gen_rtx_REG ((machine_mode) d->flag, - FLAGS_REG), - const0_rtx))); - return SUBREG_REG (target); - } - else - return target; -} - - -/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */ - -static rtx -ix86_expand_sse_pcmpistr (const struct builtin_description *d, - tree exp, rtx target) -{ - rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - tree arg2 = CALL_EXPR_ARG (exp, 2); - rtx scratch0, scratch1; - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - rtx op2 = expand_normal (arg2); - machine_mode tmode0, tmode1, modev2, modev3, modeimm; - - tmode0 = insn_data[d->icode].operand[0].mode; - tmode1 = insn_data[d->icode].operand[1].mode; - modev2 = insn_data[d->icode].operand[2].mode; - modev3 = insn_data[d->icode].operand[3].mode; - modeimm = insn_data[d->icode].operand[4].mode; - - if (VECTOR_MODE_P (modev2)) - op0 = safe_vector_operand (op0, modev2); - if (VECTOR_MODE_P (modev3)) - op1 = safe_vector_operand (op1, modev3); - - if (!insn_data[d->icode].operand[2].predicate (op0, modev2)) - op0 = copy_to_mode_reg (modev2, op0); - if ((optimize && !register_operand (op1, modev3)) - || !insn_data[d->icode].operand[3].predicate (op1, modev3)) - op1 = copy_to_mode_reg (modev3, op1); - - if (!insn_data[d->icode].operand[4].predicate (op2, modeimm)) - { - error ("the third argument must be an 8-bit immediate"); - return const0_rtx; - } - - if (d->code == IX86_BUILTIN_PCMPISTRI128) - { - if (optimize || !target - || GET_MODE (target) != tmode0 - || !insn_data[d->icode].operand[0].predicate (target, tmode0)) - target = gen_reg_rtx (tmode0); - - scratch1 = gen_reg_rtx (tmode1); - - pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2); - } - else if (d->code == IX86_BUILTIN_PCMPISTRM128) - { - if (optimize || !target - || GET_MODE (target) != tmode1 - || !insn_data[d->icode].operand[1].predicate (target, tmode1)) - target = gen_reg_rtx (tmode1); - - scratch0 = gen_reg_rtx (tmode0); - - pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2); - } - else - { - gcc_assert (d->flag); - - scratch0 = gen_reg_rtx (tmode0); - scratch1 = gen_reg_rtx (tmode1); - - pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2); - } - - if (! pat) - return 0; - - emit_insn (pat); - - if (d->flag) - { - target = gen_reg_rtx (SImode); - emit_move_insn (target, const0_rtx); - target = gen_rtx_SUBREG (QImode, target, 0); - - emit_insn - (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), - gen_rtx_fmt_ee (EQ, QImode, - gen_rtx_REG ((machine_mode) d->flag, - FLAGS_REG), - const0_rtx))); - return SUBREG_REG (target); - } - else - return target; -} - -/* Subroutine of ix86_expand_builtin to take care of insns with - variable number of operands. 
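The PCMPESTR/PCMPISTR expanders above produce either the index result, the mask result, or one of the flag bits, depending on d->code and d->flag. From the user's side these are the SSE4.2 string intrinsics; a hedged example assuming -msse4.2 (illustrative, not part of the patch):

  #include <nmmintrin.h>

  /* Index of the first byte in CHUNK matching any byte of SET,
     or 16 when there is no match.  */
  int
  find_any_of (__m128i set, __m128i chunk)
  {
    return _mm_cmpistri (set, chunk,
                         _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY
                         | _SIDD_LEAST_SIGNIFICANT);
  }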
*/ - -static rtx -ix86_expand_args_builtin (const struct builtin_description *d, - tree exp, rtx target) -{ - rtx pat, real_target; - unsigned int i, nargs; - unsigned int nargs_constant = 0; - unsigned int mask_pos = 0; - int num_memory = 0; - struct - { - rtx op; - machine_mode mode; - } args[6]; - bool second_arg_count = false; - enum insn_code icode = d->icode; - const struct insn_data_d *insn_p = &insn_data[icode]; - machine_mode tmode = insn_p->operand[0].mode; - machine_mode rmode = VOIDmode; - bool swap = false; - enum rtx_code comparison = d->comparison; - - switch ((enum ix86_builtin_func_type) d->flag) - { - case V2DF_FTYPE_V2DF_ROUND: - case V4DF_FTYPE_V4DF_ROUND: - case V8DF_FTYPE_V8DF_ROUND: - case V4SF_FTYPE_V4SF_ROUND: - case V8SF_FTYPE_V8SF_ROUND: - case V16SF_FTYPE_V16SF_ROUND: - case V4SI_FTYPE_V4SF_ROUND: - case V8SI_FTYPE_V8SF_ROUND: - case V16SI_FTYPE_V16SF_ROUND: - return ix86_expand_sse_round (d, exp, target); - case V4SI_FTYPE_V2DF_V2DF_ROUND: - case V8SI_FTYPE_V4DF_V4DF_ROUND: - case V16SI_FTYPE_V8DF_V8DF_ROUND: - return ix86_expand_sse_round_vec_pack_sfix (d, exp, target); - case INT_FTYPE_V8SF_V8SF_PTEST: - case INT_FTYPE_V4DI_V4DI_PTEST: - case INT_FTYPE_V4DF_V4DF_PTEST: - case INT_FTYPE_V4SF_V4SF_PTEST: - case INT_FTYPE_V2DI_V2DI_PTEST: - case INT_FTYPE_V2DF_V2DF_PTEST: - return ix86_expand_sse_ptest (d, exp, target); - case FLOAT128_FTYPE_FLOAT128: - case FLOAT_FTYPE_FLOAT: - case INT_FTYPE_INT: - case UINT_FTYPE_UINT: - case UINT16_FTYPE_UINT16: - case UINT64_FTYPE_INT: - case UINT64_FTYPE_UINT64: - case INT64_FTYPE_INT64: - case INT64_FTYPE_V4SF: - case INT64_FTYPE_V2DF: - case INT_FTYPE_V16QI: - case INT_FTYPE_V8QI: - case INT_FTYPE_V8SF: - case INT_FTYPE_V4DF: - case INT_FTYPE_V4SF: - case INT_FTYPE_V2DF: - case INT_FTYPE_V32QI: - case V16QI_FTYPE_V16QI: - case V8SI_FTYPE_V8SF: - case V8SI_FTYPE_V4SI: - case V8HI_FTYPE_V8HI: - case V8HI_FTYPE_V16QI: - case V8QI_FTYPE_V8QI: - case V8SF_FTYPE_V8SF: - case V8SF_FTYPE_V8SI: - case V8SF_FTYPE_V4SF: - case V8SF_FTYPE_V8HI: - case V4SI_FTYPE_V4SI: - case V4SI_FTYPE_V16QI: - case V4SI_FTYPE_V4SF: - case V4SI_FTYPE_V8SI: - case V4SI_FTYPE_V8HI: - case V4SI_FTYPE_V4DF: - case V4SI_FTYPE_V2DF: - case V4HI_FTYPE_V4HI: - case V4DF_FTYPE_V4DF: - case V4DF_FTYPE_V4SI: - case V4DF_FTYPE_V4SF: - case V4DF_FTYPE_V2DF: - case V4SF_FTYPE_V4SF: - case V4SF_FTYPE_V4SI: - case V4SF_FTYPE_V8SF: - case V4SF_FTYPE_V4DF: - case V4SF_FTYPE_V8HI: - case V4SF_FTYPE_V2DF: - case V2DI_FTYPE_V2DI: - case V2DI_FTYPE_V16QI: - case V2DI_FTYPE_V8HI: - case V2DI_FTYPE_V4SI: - case V2DF_FTYPE_V2DF: - case V2DF_FTYPE_V4SI: - case V2DF_FTYPE_V4DF: - case V2DF_FTYPE_V4SF: - case V2DF_FTYPE_V2SI: - case V2SI_FTYPE_V2SI: - case V2SI_FTYPE_V4SF: - case V2SI_FTYPE_V2SF: - case V2SI_FTYPE_V2DF: - case V2SF_FTYPE_V2SF: - case V2SF_FTYPE_V2SI: - case V32QI_FTYPE_V32QI: - case V32QI_FTYPE_V16QI: - case V16HI_FTYPE_V16HI: - case V16HI_FTYPE_V8HI: - case V8SI_FTYPE_V8SI: - case V16HI_FTYPE_V16QI: - case V8SI_FTYPE_V16QI: - case V4DI_FTYPE_V16QI: - case V8SI_FTYPE_V8HI: - case V4DI_FTYPE_V8HI: - case V4DI_FTYPE_V4SI: - case V4DI_FTYPE_V2DI: - case UQI_FTYPE_UQI: - case UHI_FTYPE_UHI: - case USI_FTYPE_USI: - case USI_FTYPE_UQI: - case USI_FTYPE_UHI: - case UDI_FTYPE_UDI: - case UHI_FTYPE_V16QI: - case USI_FTYPE_V32QI: - case UDI_FTYPE_V64QI: - case V16QI_FTYPE_UHI: - case V32QI_FTYPE_USI: - case V64QI_FTYPE_UDI: - case V8HI_FTYPE_UQI: - case V16HI_FTYPE_UHI: - case V32HI_FTYPE_USI: - case V4SI_FTYPE_UQI: - case V8SI_FTYPE_UQI: - case V4SI_FTYPE_UHI: - case 
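Function types ending in _PTEST above are routed to ix86_expand_sse_ptest, which turns the CC-flag result of PTEST into a 0/1 value. A typical consumer, assuming -msse4.1 (illustrative, not part of the patch):

  #include <smmintrin.h>

  int
  all_zero (__m128i x)
  {
    return _mm_testz_si128 (x, x);   /* 1 iff every bit of x is zero */
  }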
V8SI_FTYPE_UHI: - case UQI_FTYPE_V8HI: - case UHI_FTYPE_V16HI: - case USI_FTYPE_V32HI: - case UQI_FTYPE_V4SI: - case UQI_FTYPE_V8SI: - case UHI_FTYPE_V16SI: - case UQI_FTYPE_V2DI: - case UQI_FTYPE_V4DI: - case UQI_FTYPE_V8DI: - case V16SI_FTYPE_UHI: - case V2DI_FTYPE_UQI: - case V4DI_FTYPE_UQI: - case V16SI_FTYPE_INT: - case V16SF_FTYPE_V8SF: - case V16SI_FTYPE_V8SI: - case V16SF_FTYPE_V4SF: - case V16SI_FTYPE_V4SI: - case V16SI_FTYPE_V16SF: - case V16SI_FTYPE_V16SI: - case V64QI_FTYPE_V64QI: - case V32HI_FTYPE_V32HI: - case V16SF_FTYPE_V16SF: - case V8DI_FTYPE_UQI: - case V8DI_FTYPE_V8DI: - case V8DF_FTYPE_V4DF: - case V8DF_FTYPE_V2DF: - case V8DF_FTYPE_V8DF: - case V4DI_FTYPE_V4DI: - nargs = 1; - break; - case V4SF_FTYPE_V4SF_VEC_MERGE: - case V2DF_FTYPE_V2DF_VEC_MERGE: - return ix86_expand_unop_vec_merge_builtin (icode, exp, target); - case FLOAT128_FTYPE_FLOAT128_FLOAT128: - case V16QI_FTYPE_V16QI_V16QI: - case V16QI_FTYPE_V8HI_V8HI: - case V16SF_FTYPE_V16SF_V16SF: - case V8QI_FTYPE_V8QI_V8QI: - case V8QI_FTYPE_V4HI_V4HI: - case V8HI_FTYPE_V8HI_V8HI: - case V8HI_FTYPE_V16QI_V16QI: - case V8HI_FTYPE_V4SI_V4SI: - case V8SF_FTYPE_V8SF_V8SF: - case V8SF_FTYPE_V8SF_V8SI: - case V8DF_FTYPE_V8DF_V8DF: - case V4SI_FTYPE_V4SI_V4SI: - case V4SI_FTYPE_V8HI_V8HI: - case V4SI_FTYPE_V2DF_V2DF: - case V4HI_FTYPE_V4HI_V4HI: - case V4HI_FTYPE_V8QI_V8QI: - case V4HI_FTYPE_V2SI_V2SI: - case V4DF_FTYPE_V4DF_V4DF: - case V4DF_FTYPE_V4DF_V4DI: - case V4SF_FTYPE_V4SF_V4SF: - case V4SF_FTYPE_V4SF_V4SI: - case V4SF_FTYPE_V4SF_V2SI: - case V4SF_FTYPE_V4SF_V2DF: - case V4SF_FTYPE_V4SF_UINT: - case V4SF_FTYPE_V4SF_DI: - case V4SF_FTYPE_V4SF_SI: - case V2DI_FTYPE_V2DI_V2DI: - case V2DI_FTYPE_V16QI_V16QI: - case V2DI_FTYPE_V4SI_V4SI: - case V2DI_FTYPE_V2DI_V16QI: - case V2SI_FTYPE_V2SI_V2SI: - case V2SI_FTYPE_V4HI_V4HI: - case V2SI_FTYPE_V2SF_V2SF: - case V2DF_FTYPE_V2DF_V2DF: - case V2DF_FTYPE_V2DF_V4SF: - case V2DF_FTYPE_V2DF_V2DI: - case V2DF_FTYPE_V2DF_DI: - case V2DF_FTYPE_V2DF_SI: - case V2DF_FTYPE_V2DF_UINT: - case V2SF_FTYPE_V2SF_V2SF: - case V1DI_FTYPE_V1DI_V1DI: - case V1DI_FTYPE_V8QI_V8QI: - case V1DI_FTYPE_V2SI_V2SI: - case V32QI_FTYPE_V16HI_V16HI: - case V16HI_FTYPE_V8SI_V8SI: - case V64QI_FTYPE_V64QI_V64QI: - case V32QI_FTYPE_V32QI_V32QI: - case V16HI_FTYPE_V32QI_V32QI: - case V16HI_FTYPE_V16HI_V16HI: - case V8SI_FTYPE_V4DF_V4DF: - case V8SI_FTYPE_V8SI_V8SI: - case V8SI_FTYPE_V16HI_V16HI: - case V4DI_FTYPE_V4DI_V4DI: - case V4DI_FTYPE_V8SI_V8SI: - case V8DI_FTYPE_V64QI_V64QI: - if (comparison == UNKNOWN) - return ix86_expand_binop_builtin (icode, exp, target); - nargs = 2; - break; - case V4SF_FTYPE_V4SF_V4SF_SWAP: - case V2DF_FTYPE_V2DF_V2DF_SWAP: - gcc_assert (comparison != UNKNOWN); - nargs = 2; - swap = true; - break; - case V16HI_FTYPE_V16HI_V8HI_COUNT: - case V16HI_FTYPE_V16HI_SI_COUNT: - case V8SI_FTYPE_V8SI_V4SI_COUNT: - case V8SI_FTYPE_V8SI_SI_COUNT: - case V4DI_FTYPE_V4DI_V2DI_COUNT: - case V4DI_FTYPE_V4DI_INT_COUNT: - case V8HI_FTYPE_V8HI_V8HI_COUNT: - case V8HI_FTYPE_V8HI_SI_COUNT: - case V4SI_FTYPE_V4SI_V4SI_COUNT: - case V4SI_FTYPE_V4SI_SI_COUNT: - case V4HI_FTYPE_V4HI_V4HI_COUNT: - case V4HI_FTYPE_V4HI_SI_COUNT: - case V2DI_FTYPE_V2DI_V2DI_COUNT: - case V2DI_FTYPE_V2DI_SI_COUNT: - case V2SI_FTYPE_V2SI_V2SI_COUNT: - case V2SI_FTYPE_V2SI_SI_COUNT: - case V1DI_FTYPE_V1DI_V1DI_COUNT: - case V1DI_FTYPE_V1DI_SI_COUNT: - nargs = 2; - second_arg_count = true; - break; - case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT: - case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT: - case 
V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT: - case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT: - case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT: - case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT: - case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT: - case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT: - case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT: - case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT: - case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT: - case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT: - case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT: - case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT: - case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT: - case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT: - case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT: - case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT: - nargs = 4; - second_arg_count = true; - break; - case UINT64_FTYPE_UINT64_UINT64: - case UINT_FTYPE_UINT_UINT: - case UINT_FTYPE_UINT_USHORT: - case UINT_FTYPE_UINT_UCHAR: - case UINT16_FTYPE_UINT16_INT: - case UINT8_FTYPE_UINT8_INT: - case UQI_FTYPE_UQI_UQI: - case UHI_FTYPE_UHI_UHI: - case USI_FTYPE_USI_USI: - case UDI_FTYPE_UDI_UDI: - case V16SI_FTYPE_V8DF_V8DF: - nargs = 2; - break; - case V2DI_FTYPE_V2DI_INT_CONVERT: - nargs = 2; - rmode = V1TImode; - nargs_constant = 1; - break; - case V4DI_FTYPE_V4DI_INT_CONVERT: - nargs = 2; - rmode = V2TImode; - nargs_constant = 1; - break; - case V8DI_FTYPE_V8DI_INT_CONVERT: - nargs = 2; - rmode = V4TImode; - nargs_constant = 1; - break; - case V8HI_FTYPE_V8HI_INT: - case V8HI_FTYPE_V8SF_INT: - case V16HI_FTYPE_V16SF_INT: - case V8HI_FTYPE_V4SF_INT: - case V8SF_FTYPE_V8SF_INT: - case V4SF_FTYPE_V16SF_INT: - case V16SF_FTYPE_V16SF_INT: - case V4SI_FTYPE_V4SI_INT: - case V4SI_FTYPE_V8SI_INT: - case V4HI_FTYPE_V4HI_INT: - case V4DF_FTYPE_V4DF_INT: - case V4DF_FTYPE_V8DF_INT: - case V4SF_FTYPE_V4SF_INT: - case V4SF_FTYPE_V8SF_INT: - case V2DI_FTYPE_V2DI_INT: - case V2DF_FTYPE_V2DF_INT: - case V2DF_FTYPE_V4DF_INT: - case V16HI_FTYPE_V16HI_INT: - case V8SI_FTYPE_V8SI_INT: - case V16SI_FTYPE_V16SI_INT: - case V4SI_FTYPE_V16SI_INT: - case V4DI_FTYPE_V4DI_INT: - case V2DI_FTYPE_V4DI_INT: - case V4DI_FTYPE_V8DI_INT: - case QI_FTYPE_V4SF_INT: - case QI_FTYPE_V2DF_INT: - case UQI_FTYPE_UQI_UQI_CONST: - case UHI_FTYPE_UHI_UQI: - case USI_FTYPE_USI_UQI: - case UDI_FTYPE_UDI_UQI: - nargs = 2; - nargs_constant = 1; - break; - case V16QI_FTYPE_V16QI_V16QI_V16QI: - case V8SF_FTYPE_V8SF_V8SF_V8SF: - case V4DF_FTYPE_V4DF_V4DF_V4DF: - case V4SF_FTYPE_V4SF_V4SF_V4SF: - case V2DF_FTYPE_V2DF_V2DF_V2DF: - case V32QI_FTYPE_V32QI_V32QI_V32QI: - case UHI_FTYPE_V16SI_V16SI_UHI: - case UQI_FTYPE_V8DI_V8DI_UQI: - case V16HI_FTYPE_V16SI_V16HI_UHI: - case V16QI_FTYPE_V16SI_V16QI_UHI: - case V16QI_FTYPE_V8DI_V16QI_UQI: - case V16SF_FTYPE_V16SF_V16SF_UHI: - case V16SF_FTYPE_V4SF_V16SF_UHI: - case V16SI_FTYPE_SI_V16SI_UHI: - case V16SI_FTYPE_V16HI_V16SI_UHI: - case V16SI_FTYPE_V16QI_V16SI_UHI: - case V8SF_FTYPE_V4SF_V8SF_UQI: - case V4DF_FTYPE_V2DF_V4DF_UQI: - case V8SI_FTYPE_V4SI_V8SI_UQI: - case V8SI_FTYPE_SI_V8SI_UQI: - case V4SI_FTYPE_V4SI_V4SI_UQI: - case V4SI_FTYPE_SI_V4SI_UQI: - case V4DI_FTYPE_V2DI_V4DI_UQI: - case V4DI_FTYPE_DI_V4DI_UQI: - case V2DI_FTYPE_V2DI_V2DI_UQI: - case V2DI_FTYPE_DI_V2DI_UQI: - case V64QI_FTYPE_V64QI_V64QI_UDI: - case V64QI_FTYPE_V16QI_V64QI_UDI: - case V64QI_FTYPE_QI_V64QI_UDI: - case V32QI_FTYPE_V32QI_V32QI_USI: - case V32QI_FTYPE_V16QI_V32QI_USI: - case V32QI_FTYPE_QI_V32QI_USI: - case V16QI_FTYPE_V16QI_V16QI_UHI: - case V16QI_FTYPE_QI_V16QI_UHI: - case V32HI_FTYPE_V8HI_V32HI_USI: - case V32HI_FTYPE_HI_V32HI_USI: - case V16HI_FTYPE_V8HI_V16HI_UHI: - case 
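The ..._COUNT function types above (the second_arg_count classification) cover the vector shifts whose count operand is either a plain integer or the low quadword of an XMM register; both intrinsic flavours go through the same expander. For example (illustrative, not part of the patch):

  #include <emmintrin.h>

  __m128i
  shift_both_ways (__m128i v, __m128i cnt)
  {
    __m128i a = _mm_slli_epi32 (v, 3);    /* integer count */
    __m128i b = _mm_sll_epi32 (v, cnt);   /* count in an XMM register */
    return _mm_add_epi32 (a, b);
  }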
V16HI_FTYPE_HI_V16HI_UHI: - case V8HI_FTYPE_V8HI_V8HI_UQI: - case V8HI_FTYPE_HI_V8HI_UQI: - case V8SF_FTYPE_V8HI_V8SF_UQI: - case V4SF_FTYPE_V8HI_V4SF_UQI: - case V8SI_FTYPE_V8SF_V8SI_UQI: - case V4SI_FTYPE_V4SF_V4SI_UQI: - case V4DI_FTYPE_V4SF_V4DI_UQI: - case V2DI_FTYPE_V4SF_V2DI_UQI: - case V4SF_FTYPE_V4DI_V4SF_UQI: - case V4SF_FTYPE_V2DI_V4SF_UQI: - case V4DF_FTYPE_V4DI_V4DF_UQI: - case V2DF_FTYPE_V2DI_V2DF_UQI: - case V16QI_FTYPE_V8HI_V16QI_UQI: - case V16QI_FTYPE_V16HI_V16QI_UHI: - case V16QI_FTYPE_V4SI_V16QI_UQI: - case V16QI_FTYPE_V8SI_V16QI_UQI: - case V8HI_FTYPE_V4SI_V8HI_UQI: - case V8HI_FTYPE_V8SI_V8HI_UQI: - case V16QI_FTYPE_V2DI_V16QI_UQI: - case V16QI_FTYPE_V4DI_V16QI_UQI: - case V8HI_FTYPE_V2DI_V8HI_UQI: - case V8HI_FTYPE_V4DI_V8HI_UQI: - case V4SI_FTYPE_V2DI_V4SI_UQI: - case V4SI_FTYPE_V4DI_V4SI_UQI: - case V32QI_FTYPE_V32HI_V32QI_USI: - case UHI_FTYPE_V16QI_V16QI_UHI: - case USI_FTYPE_V32QI_V32QI_USI: - case UDI_FTYPE_V64QI_V64QI_UDI: - case UQI_FTYPE_V8HI_V8HI_UQI: - case UHI_FTYPE_V16HI_V16HI_UHI: - case USI_FTYPE_V32HI_V32HI_USI: - case UQI_FTYPE_V4SI_V4SI_UQI: - case UQI_FTYPE_V8SI_V8SI_UQI: - case UQI_FTYPE_V2DI_V2DI_UQI: - case UQI_FTYPE_V4DI_V4DI_UQI: - case V4SF_FTYPE_V2DF_V4SF_UQI: - case V4SF_FTYPE_V4DF_V4SF_UQI: - case V16SI_FTYPE_V16SI_V16SI_UHI: - case V16SI_FTYPE_V4SI_V16SI_UHI: - case V2DI_FTYPE_V4SI_V2DI_UQI: - case V2DI_FTYPE_V8HI_V2DI_UQI: - case V2DI_FTYPE_V16QI_V2DI_UQI: - case V4DI_FTYPE_V4DI_V4DI_UQI: - case V4DI_FTYPE_V4SI_V4DI_UQI: - case V4DI_FTYPE_V8HI_V4DI_UQI: - case V4DI_FTYPE_V16QI_V4DI_UQI: - case V4DI_FTYPE_V4DF_V4DI_UQI: - case V2DI_FTYPE_V2DF_V2DI_UQI: - case V4SI_FTYPE_V4DF_V4SI_UQI: - case V4SI_FTYPE_V2DF_V4SI_UQI: - case V4SI_FTYPE_V8HI_V4SI_UQI: - case V4SI_FTYPE_V16QI_V4SI_UQI: - case V4DI_FTYPE_V4DI_V4DI_V4DI: - case V8DF_FTYPE_V2DF_V8DF_UQI: - case V8DF_FTYPE_V4DF_V8DF_UQI: - case V8DF_FTYPE_V8DF_V8DF_UQI: - case V8SF_FTYPE_V8SF_V8SF_UQI: - case V8SF_FTYPE_V8SI_V8SF_UQI: - case V4DF_FTYPE_V4DF_V4DF_UQI: - case V4SF_FTYPE_V4SF_V4SF_UQI: - case V2DF_FTYPE_V2DF_V2DF_UQI: - case V2DF_FTYPE_V4SF_V2DF_UQI: - case V2DF_FTYPE_V4SI_V2DF_UQI: - case V4SF_FTYPE_V4SI_V4SF_UQI: - case V4DF_FTYPE_V4SF_V4DF_UQI: - case V4DF_FTYPE_V4SI_V4DF_UQI: - case V8SI_FTYPE_V8SI_V8SI_UQI: - case V8SI_FTYPE_V8HI_V8SI_UQI: - case V8SI_FTYPE_V16QI_V8SI_UQI: - case V8DF_FTYPE_V8SI_V8DF_UQI: - case V8DI_FTYPE_DI_V8DI_UQI: - case V16SF_FTYPE_V8SF_V16SF_UHI: - case V16SI_FTYPE_V8SI_V16SI_UHI: - case V16HI_FTYPE_V16HI_V16HI_UHI: - case V8HI_FTYPE_V16QI_V8HI_UQI: - case V16HI_FTYPE_V16QI_V16HI_UHI: - case V32HI_FTYPE_V32HI_V32HI_USI: - case V32HI_FTYPE_V32QI_V32HI_USI: - case V8DI_FTYPE_V16QI_V8DI_UQI: - case V8DI_FTYPE_V2DI_V8DI_UQI: - case V8DI_FTYPE_V4DI_V8DI_UQI: - case V8DI_FTYPE_V8DI_V8DI_UQI: - case V8DI_FTYPE_V8HI_V8DI_UQI: - case V8DI_FTYPE_V8SI_V8DI_UQI: - case V8HI_FTYPE_V8DI_V8HI_UQI: - case V8SI_FTYPE_V8DI_V8SI_UQI: - case V4SI_FTYPE_V4SI_V4SI_V4SI: - case V16SI_FTYPE_V16SI_V16SI_V16SI: - case V8DI_FTYPE_V8DI_V8DI_V8DI: - case V32HI_FTYPE_V32HI_V32HI_V32HI: - case V2DI_FTYPE_V2DI_V2DI_V2DI: - case V16HI_FTYPE_V16HI_V16HI_V16HI: - case V8SI_FTYPE_V8SI_V8SI_V8SI: - case V8HI_FTYPE_V8HI_V8HI_V8HI: - nargs = 3; - break; - case V32QI_FTYPE_V32QI_V32QI_INT: - case V16HI_FTYPE_V16HI_V16HI_INT: - case V16QI_FTYPE_V16QI_V16QI_INT: - case V4DI_FTYPE_V4DI_V4DI_INT: - case V8HI_FTYPE_V8HI_V8HI_INT: - case V8SI_FTYPE_V8SI_V8SI_INT: - case V8SI_FTYPE_V8SI_V4SI_INT: - case V8SF_FTYPE_V8SF_V8SF_INT: - case V8SF_FTYPE_V8SF_V4SF_INT: - case V4SI_FTYPE_V4SI_V4SI_INT: - 
case V4DF_FTYPE_V4DF_V4DF_INT: - case V16SF_FTYPE_V16SF_V16SF_INT: - case V16SF_FTYPE_V16SF_V4SF_INT: - case V16SI_FTYPE_V16SI_V4SI_INT: - case V4DF_FTYPE_V4DF_V2DF_INT: - case V4SF_FTYPE_V4SF_V4SF_INT: - case V2DI_FTYPE_V2DI_V2DI_INT: - case V4DI_FTYPE_V4DI_V2DI_INT: - case V2DF_FTYPE_V2DF_V2DF_INT: - case UQI_FTYPE_V8DI_V8UDI_INT: - case UQI_FTYPE_V8DF_V8DF_INT: - case UQI_FTYPE_V2DF_V2DF_INT: - case UQI_FTYPE_V4SF_V4SF_INT: - case UHI_FTYPE_V16SI_V16SI_INT: - case UHI_FTYPE_V16SF_V16SF_INT: - case V64QI_FTYPE_V64QI_V64QI_INT: - case V32HI_FTYPE_V32HI_V32HI_INT: - case V16SI_FTYPE_V16SI_V16SI_INT: - case V8DI_FTYPE_V8DI_V8DI_INT: - nargs = 3; - nargs_constant = 1; - break; - case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT: - nargs = 3; - rmode = V4DImode; - nargs_constant = 1; - break; - case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT: - nargs = 3; - rmode = V2DImode; - nargs_constant = 1; - break; - case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT: - nargs = 3; - rmode = DImode; - nargs_constant = 1; - break; - case V2DI_FTYPE_V2DI_UINT_UINT: - nargs = 3; - nargs_constant = 2; - break; - case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT: - nargs = 3; - rmode = V8DImode; - nargs_constant = 1; - break; - case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT: - nargs = 5; - rmode = V8DImode; - mask_pos = 2; - nargs_constant = 1; - break; - case QI_FTYPE_V8DF_INT_UQI: - case QI_FTYPE_V4DF_INT_UQI: - case QI_FTYPE_V2DF_INT_UQI: - case HI_FTYPE_V16SF_INT_UHI: - case QI_FTYPE_V8SF_INT_UQI: - case QI_FTYPE_V4SF_INT_UQI: - case V4SI_FTYPE_V4SI_V4SI_UHI: - case V8SI_FTYPE_V8SI_V8SI_UHI: - nargs = 3; - mask_pos = 1; - nargs_constant = 1; - break; - case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT: - nargs = 5; - rmode = V4DImode; - mask_pos = 2; - nargs_constant = 1; - break; - case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT: - nargs = 5; - rmode = V2DImode; - mask_pos = 2; - nargs_constant = 1; - break; - case V32QI_FTYPE_V32QI_V32QI_V32QI_USI: - case V32HI_FTYPE_V32HI_V32HI_V32HI_USI: - case V32HI_FTYPE_V64QI_V64QI_V32HI_USI: - case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI: - case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI: - case V32HI_FTYPE_V32HI_V8HI_V32HI_USI: - case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI: - case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI: - case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI: - case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI: - case V32QI_FTYPE_V16HI_V16HI_V32QI_USI: - case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI: - case V32HI_FTYPE_V16SI_V16SI_V32HI_USI: - case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI: - case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI: - case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI: - case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI: - case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI: - case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI: - case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI: - case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI: - case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI: - case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI: - case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI: - case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI: - case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI: - case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI: - case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI: - case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI: - case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI: - case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI: - case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI: - case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI: - case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI: - case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI: - case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI: - case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI: - case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI: - case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI: - case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI: - case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI: - case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI: - case 
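The masked function types above, whose trailing _UQI/_UHI/_USI/_UDI operand is the write mask, correspond to the AVX-512 masked operations; for example a masked 256-bit add is assumed here to be typed V8SF_FTYPE_V8SF_V8SF_V8SF_UQI. An illustrative use, assuming -mavx512vl (not part of the patch):

  #include <immintrin.h>

  __m256
  masked_add (__m256 src, __mmask8 k, __m256 a, __m256 b)
  {
    /* Lanes whose mask bit is clear keep the value from SRC.  */
    return _mm256_mask_add_ps (src, k, a, b);
  }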
V8DF_FTYPE_V8DF_V8DI_V8DF_UQI: - case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI: - case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI: - case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI: - case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI: - case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI: - case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI: - case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI: - case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI: - nargs = 4; - break; - case V2DF_FTYPE_V2DF_V2DF_V2DI_INT: - case V4DF_FTYPE_V4DF_V4DF_V4DI_INT: - case V4SF_FTYPE_V4SF_V4SF_V4SI_INT: - case V8SF_FTYPE_V8SF_V8SF_V8SI_INT: - case V16SF_FTYPE_V16SF_V16SF_V16SI_INT: - nargs = 4; - nargs_constant = 1; - break; - case UQI_FTYPE_V4DI_V4DI_INT_UQI: - case UQI_FTYPE_V8SI_V8SI_INT_UQI: - case QI_FTYPE_V4DF_V4DF_INT_UQI: - case QI_FTYPE_V8SF_V8SF_INT_UQI: - case UQI_FTYPE_V2DI_V2DI_INT_UQI: - case UQI_FTYPE_V4SI_V4SI_INT_UQI: - case UQI_FTYPE_V2DF_V2DF_INT_UQI: - case UQI_FTYPE_V4SF_V4SF_INT_UQI: - case UDI_FTYPE_V64QI_V64QI_INT_UDI: - case USI_FTYPE_V32QI_V32QI_INT_USI: - case UHI_FTYPE_V16QI_V16QI_INT_UHI: - case USI_FTYPE_V32HI_V32HI_INT_USI: - case UHI_FTYPE_V16HI_V16HI_INT_UHI: - case UQI_FTYPE_V8HI_V8HI_INT_UQI: - case V32HI_FTYPE_V32HI_V32HI_V32HI_INT: - case V16HI_FTYPE_V16HI_V16HI_V16HI_INT: - case V8HI_FTYPE_V8HI_V8HI_V8HI_INT: - case V8SI_FTYPE_V8SI_V8SI_V8SI_INT: - case V4DI_FTYPE_V4DI_V4DI_V4DI_INT: - case V8DI_FTYPE_V8DI_V8DI_V8DI_INT: - case V16SI_FTYPE_V16SI_V16SI_V16SI_INT: - case V2DI_FTYPE_V2DI_V2DI_V2DI_INT: - case V4SI_FTYPE_V4SI_V4SI_V4SI_INT: - nargs = 4; - mask_pos = 1; - nargs_constant = 1; - break; - case V2DI_FTYPE_V2DI_V2DI_UINT_UINT: - nargs = 4; - nargs_constant = 2; - break; - case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED: - case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG: - nargs = 4; - break; - case UQI_FTYPE_V8DI_V8DI_INT_UQI: - case UHI_FTYPE_V16SI_V16SI_INT_UHI: - mask_pos = 1; - nargs = 4; - nargs_constant = 1; - break; - case V8SF_FTYPE_V8SF_INT_V8SF_UQI: - case V4SF_FTYPE_V4SF_INT_V4SF_UQI: - case V2DF_FTYPE_V4DF_INT_V2DF_UQI: - case V2DI_FTYPE_V4DI_INT_V2DI_UQI: - case V8SF_FTYPE_V16SF_INT_V8SF_UQI: - case V8SI_FTYPE_V16SI_INT_V8SI_UQI: - case V2DF_FTYPE_V8DF_INT_V2DF_UQI: - case V2DI_FTYPE_V8DI_INT_V2DI_UQI: - case V4SF_FTYPE_V8SF_INT_V4SF_UQI: - case V4SI_FTYPE_V8SI_INT_V4SI_UQI: - case V8HI_FTYPE_V8SF_INT_V8HI_UQI: - case V8HI_FTYPE_V4SF_INT_V8HI_UQI: - case V32HI_FTYPE_V32HI_INT_V32HI_USI: - case V16HI_FTYPE_V16HI_INT_V16HI_UHI: - case V8HI_FTYPE_V8HI_INT_V8HI_UQI: - case V4DI_FTYPE_V4DI_INT_V4DI_UQI: - case V2DI_FTYPE_V2DI_INT_V2DI_UQI: - case V8SI_FTYPE_V8SI_INT_V8SI_UQI: - case V4SI_FTYPE_V4SI_INT_V4SI_UQI: - case V4DF_FTYPE_V4DF_INT_V4DF_UQI: - case V2DF_FTYPE_V2DF_INT_V2DF_UQI: - case V8DF_FTYPE_V8DF_INT_V8DF_UQI: - case V16SF_FTYPE_V16SF_INT_V16SF_UHI: - case V16HI_FTYPE_V16SF_INT_V16HI_UHI: - case V16SI_FTYPE_V16SI_INT_V16SI_UHI: - case V4SI_FTYPE_V16SI_INT_V4SI_UQI: - case V4DI_FTYPE_V8DI_INT_V4DI_UQI: - case V4DF_FTYPE_V8DF_INT_V4DF_UQI: - case V4SF_FTYPE_V16SF_INT_V4SF_UQI: - case V8DI_FTYPE_V8DI_INT_V8DI_UQI: - nargs = 4; - mask_pos = 2; - nargs_constant = 1; - break; - case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI: - case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI: - case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI: - case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI: - case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI: - case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI: - case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI: - case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI: - case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI: - case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI: - case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI: - case 
V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI: - case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI: - case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI: - case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI: - case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI: - case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI: - case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI: - case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI: - case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI: - case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI: - case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI: - case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI: - case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI: - case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI: - case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI: - case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI: - nargs = 5; - mask_pos = 2; - nargs_constant = 1; - break; - case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI: - case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI: - case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI: - case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI: - case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI: - case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI: - case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI: - case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI: - case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI: - case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI: - nargs = 5; - mask_pos = 1; - nargs_constant = 1; - break; - case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI: - case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI: - case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI: - case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT: - case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT: - case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT: - case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT: - case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT: - case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT: - case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT: - case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT: - case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT: - nargs = 5; - mask_pos = 1; - nargs_constant = 2; - break; - - default: - gcc_unreachable (); - } - - gcc_assert (nargs <= ARRAY_SIZE (args)); - - if (comparison != UNKNOWN) - { - gcc_assert (nargs == 2); - return ix86_expand_sse_compare (d, exp, target, swap); - } - - if (rmode == VOIDmode || rmode == tmode) - { - if (optimize - || target == 0 - || GET_MODE (target) != tmode - || !insn_p->operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - else if (memory_operand (target, tmode)) - num_memory++; - real_target = target; - } - else - { - real_target = gen_reg_rtx (tmode); - target = lowpart_subreg (rmode, real_target, tmode); - } - - for (i = 0; i < nargs; i++) - { - tree arg = CALL_EXPR_ARG (exp, i); - rtx op = expand_normal (arg); - machine_mode mode = insn_p->operand[i + 1].mode; - bool match = insn_p->operand[i + 1].predicate (op, mode); - - if (second_arg_count && i == 1) - { - /* SIMD shift insns take either an 8-bit immediate or - register as count. But builtin functions take int as - count. If count doesn't match, we put it in register. - The instructions are using 64-bit count, if op is just - 32-bit, zero-extend it, as negative shift counts - are undefined behavior and zero-extension is more - efficient. 
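/* The zero-extension rule described in the comment above is visible at the
   intrinsic level: the PSLL/PSRL family takes its count either as an
   immediate or as the low 64 bits of an XMM register, and counts outside
   the element width give zero rather than wrapping.  A minimal user-level
   sketch, assuming SSE2 is available (-msse2); the names used here are
   illustrative and not part of this patch.  */
#include <emmintrin.h>
#include <stdio.h>

int
main (void)
{
  __m128i v = _mm_set1_epi32 (1);
  __m128i by_imm = _mm_slli_epi32 (v, 3);                      /* immediate count */
  __m128i by_reg = _mm_sll_epi32 (v, _mm_cvtsi32_si128 (3));   /* register count  */
  int a[4], b[4];
  _mm_storeu_si128 ((__m128i *) a, by_imm);
  _mm_storeu_si128 ((__m128i *) b, by_reg);
  printf ("%d %d\n", a[0], b[0]);    /* both print 8 */
  return 0;
}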
*/ - if (!match) - { - if (SCALAR_INT_MODE_P (GET_MODE (op))) - op = convert_modes (mode, GET_MODE (op), op, 1); - else - op = lowpart_subreg (mode, op, GET_MODE (op)); - if (!insn_p->operand[i + 1].predicate (op, mode)) - op = copy_to_reg (op); - } - } - else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) || - (!mask_pos && (nargs - i) <= nargs_constant)) - { - if (!match) - switch (icode) - { - case CODE_FOR_avx_vinsertf128v4di: - case CODE_FOR_avx_vextractf128v4di: - error ("the last argument must be an 1-bit immediate"); - return const0_rtx; - - case CODE_FOR_avx512f_cmpv8di3_mask: - case CODE_FOR_avx512f_cmpv16si3_mask: - case CODE_FOR_avx512f_ucmpv8di3_mask: - case CODE_FOR_avx512f_ucmpv16si3_mask: - case CODE_FOR_avx512vl_cmpv4di3_mask: - case CODE_FOR_avx512vl_cmpv8si3_mask: - case CODE_FOR_avx512vl_ucmpv4di3_mask: - case CODE_FOR_avx512vl_ucmpv8si3_mask: - case CODE_FOR_avx512vl_cmpv2di3_mask: - case CODE_FOR_avx512vl_cmpv4si3_mask: - case CODE_FOR_avx512vl_ucmpv2di3_mask: - case CODE_FOR_avx512vl_ucmpv4si3_mask: - error ("the last argument must be a 3-bit immediate"); - return const0_rtx; - - case CODE_FOR_sse4_1_roundsd: - case CODE_FOR_sse4_1_roundss: - - case CODE_FOR_sse4_1_roundpd: - case CODE_FOR_sse4_1_roundps: - case CODE_FOR_avx_roundpd256: - case CODE_FOR_avx_roundps256: - - case CODE_FOR_sse4_1_roundpd_vec_pack_sfix: - case CODE_FOR_sse4_1_roundps_sfix: - case CODE_FOR_avx_roundpd_vec_pack_sfix256: - case CODE_FOR_avx_roundps_sfix256: - - case CODE_FOR_sse4_1_blendps: - case CODE_FOR_avx_blendpd256: - case CODE_FOR_avx_vpermilv4df: - case CODE_FOR_avx_vpermilv4df_mask: - case CODE_FOR_avx512f_getmantv8df_mask: - case CODE_FOR_avx512f_getmantv16sf_mask: - case CODE_FOR_avx512vl_getmantv8sf_mask: - case CODE_FOR_avx512vl_getmantv4df_mask: - case CODE_FOR_avx512vl_getmantv4sf_mask: - case CODE_FOR_avx512vl_getmantv2df_mask: - case CODE_FOR_avx512dq_rangepv8df_mask_round: - case CODE_FOR_avx512dq_rangepv16sf_mask_round: - case CODE_FOR_avx512dq_rangepv4df_mask: - case CODE_FOR_avx512dq_rangepv8sf_mask: - case CODE_FOR_avx512dq_rangepv2df_mask: - case CODE_FOR_avx512dq_rangepv4sf_mask: - case CODE_FOR_avx_shufpd256_mask: - error ("the last argument must be a 4-bit immediate"); - return const0_rtx; - - case CODE_FOR_sha1rnds4: - case CODE_FOR_sse4_1_blendpd: - case CODE_FOR_avx_vpermilv2df: - case CODE_FOR_avx_vpermilv2df_mask: - case CODE_FOR_xop_vpermil2v2df3: - case CODE_FOR_xop_vpermil2v4sf3: - case CODE_FOR_xop_vpermil2v4df3: - case CODE_FOR_xop_vpermil2v8sf3: - case CODE_FOR_avx512f_vinsertf32x4_mask: - case CODE_FOR_avx512f_vinserti32x4_mask: - case CODE_FOR_avx512f_vextractf32x4_mask: - case CODE_FOR_avx512f_vextracti32x4_mask: - case CODE_FOR_sse2_shufpd: - case CODE_FOR_sse2_shufpd_mask: - case CODE_FOR_avx512dq_shuf_f64x2_mask: - case CODE_FOR_avx512dq_shuf_i64x2_mask: - case CODE_FOR_avx512vl_shuf_i32x4_mask: - case CODE_FOR_avx512vl_shuf_f32x4_mask: - error ("the last argument must be a 2-bit immediate"); - return const0_rtx; - - case CODE_FOR_avx_vextractf128v4df: - case CODE_FOR_avx_vextractf128v8sf: - case CODE_FOR_avx_vextractf128v8si: - case CODE_FOR_avx_vinsertf128v4df: - case CODE_FOR_avx_vinsertf128v8sf: - case CODE_FOR_avx_vinsertf128v8si: - case CODE_FOR_avx512f_vinsertf64x4_mask: - case CODE_FOR_avx512f_vinserti64x4_mask: - case CODE_FOR_avx512f_vextractf64x4_mask: - case CODE_FOR_avx512f_vextracti64x4_mask: - case CODE_FOR_avx512dq_vinsertf32x8_mask: - case CODE_FOR_avx512dq_vinserti32x8_mask: - case CODE_FOR_avx512vl_vinsertv4df: - case 
CODE_FOR_avx512vl_vinsertv4di: - case CODE_FOR_avx512vl_vinsertv8sf: - case CODE_FOR_avx512vl_vinsertv8si: - error ("the last argument must be a 1-bit immediate"); - return const0_rtx; - - case CODE_FOR_avx_vmcmpv2df3: - case CODE_FOR_avx_vmcmpv4sf3: - case CODE_FOR_avx_cmpv2df3: - case CODE_FOR_avx_cmpv4sf3: - case CODE_FOR_avx_cmpv4df3: - case CODE_FOR_avx_cmpv8sf3: - case CODE_FOR_avx512f_cmpv8df3_mask: - case CODE_FOR_avx512f_cmpv16sf3_mask: - case CODE_FOR_avx512f_vmcmpv2df3_mask: - case CODE_FOR_avx512f_vmcmpv4sf3_mask: - error ("the last argument must be a 5-bit immediate"); - return const0_rtx; - - default: - switch (nargs_constant) - { - case 2: - if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) || - (!mask_pos && (nargs - i) == nargs_constant)) - { - error ("the next to last argument must be an 8-bit immediate"); - break; - } - /* FALLTHRU */ - case 1: - error ("the last argument must be an 8-bit immediate"); - break; - default: - gcc_unreachable (); - } - return const0_rtx; - } - } - else - { - if (VECTOR_MODE_P (mode)) - op = safe_vector_operand (op, mode); - - /* If we aren't optimizing, only allow one memory operand to - be generated. */ - if (memory_operand (op, mode)) - num_memory++; - - op = fixup_modeless_constant (op, mode); - - if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) - { - if (optimize || !match || num_memory > 1) - op = copy_to_mode_reg (mode, op); - } - else - { - op = copy_to_reg (op); - op = lowpart_subreg (mode, op, GET_MODE (op)); - } - } - - args[i].op = op; - args[i].mode = mode; - } - - switch (nargs) - { - case 1: - pat = GEN_FCN (icode) (real_target, args[0].op); - break; - case 2: - pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op); - break; - case 3: - pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, - args[2].op); - break; - case 4: - pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, - args[2].op, args[3].op); - break; - case 5: - pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, - args[2].op, args[3].op, args[4].op); - break; - case 6: - pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op, - args[2].op, args[3].op, args[4].op, - args[5].op); - break; - default: - gcc_unreachable (); - } - - if (! pat) - return 0; - - emit_insn (pat); - return target; -} - -/* Transform pattern of following layout: - (set A - (unspec [B C] UNSPEC_EMBEDDED_ROUNDING)) - ) - into: - (set (A B)) */ - -static rtx -ix86_erase_embedded_rounding (rtx pat) -{ - if (GET_CODE (pat) == INSN) - pat = PATTERN (pat); - - gcc_assert (GET_CODE (pat) == SET); - rtx src = SET_SRC (pat); - gcc_assert (XVECLEN (src, 0) == 2); - rtx p0 = XVECEXP (src, 0, 0); - gcc_assert (GET_CODE (src) == UNSPEC - && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING); - rtx res = gen_rtx_SET (SET_DEST (pat), p0); - return res; -} - -/* Subroutine of ix86_expand_round_builtin to take care of comi insns - with rounding. 
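/* The n-bit immediate diagnostics above mirror how wide a selector the
   underlying instruction encodes: ROUNDPD takes a 4-bit rounding control,
   the VEX CMPPD predicates are 5 bits, VINSERTF128 lane selectors 1 bit,
   and the generic fallback is the usual 8-bit immediate.  A user-level
   sketch, assuming AVX is enabled (-mavx); the helper name is illustrative.  */
#include <immintrin.h>

__m256d
round_then_mask (__m256d x, __m256d y)
{
  /* 4-bit immediate: rounding mode ORed with exception suppression.  */
  __m256d r = _mm256_round_pd (x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  /* 5-bit immediate: one of the 32 VEX comparison predicates.  */
  __m256d m = _mm256_cmp_pd (r, y, _CMP_LT_OQ);
  return _mm256_and_pd (r, m);
}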
*/ -static rtx -ix86_expand_sse_comi_round (const struct builtin_description *d, - tree exp, rtx target) -{ - rtx pat, set_dst; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - tree arg2 = CALL_EXPR_ARG (exp, 2); - tree arg3 = CALL_EXPR_ARG (exp, 3); - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - rtx op2 = expand_normal (arg2); - rtx op3 = expand_normal (arg3); - enum insn_code icode = d->icode; - const struct insn_data_d *insn_p = &insn_data[icode]; - machine_mode mode0 = insn_p->operand[0].mode; - machine_mode mode1 = insn_p->operand[1].mode; - enum rtx_code comparison = UNEQ; - bool need_ucomi = false; - - /* See avxintrin.h for values. */ - enum rtx_code comi_comparisons[32] = - { - UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT, - UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE, - UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT - }; - bool need_ucomi_values[32] = - { - true, false, false, true, true, false, false, true, - true, false, false, true, true, false, false, true, - false, true, true, false, false, true, true, false, - false, true, true, false, false, true, true, false - }; - - if (!CONST_INT_P (op2)) - { - error ("the third argument must be comparison constant"); - return const0_rtx; - } - if (INTVAL (op2) < 0 || INTVAL (op2) >= 32) - { - error ("incorrect comparison mode"); - return const0_rtx; - } - - if (!insn_p->operand[2].predicate (op3, SImode)) - { - error ("incorrect rounding operand"); - return const0_rtx; - } - - comparison = comi_comparisons[INTVAL (op2)]; - need_ucomi = need_ucomi_values[INTVAL (op2)]; - - if (VECTOR_MODE_P (mode0)) - op0 = safe_vector_operand (op0, mode0); - if (VECTOR_MODE_P (mode1)) - op1 = safe_vector_operand (op1, mode1); - - target = gen_reg_rtx (SImode); - emit_move_insn (target, const0_rtx); - target = gen_rtx_SUBREG (QImode, target, 0); - - if ((optimize && !register_operand (op0, mode0)) - || !insn_p->operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if ((optimize && !register_operand (op1, mode1)) - || !insn_p->operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - if (need_ucomi) - icode = icode == CODE_FOR_sse_comi_round - ? CODE_FOR_sse_ucomi_round - : CODE_FOR_sse2_ucomi_round; - - pat = GEN_FCN (icode) (op0, op1, op3); - if (! pat) - return 0; - - /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */ - if (INTVAL (op3) == NO_ROUND) - { - pat = ix86_erase_embedded_rounding (pat); - if (! 
pat) - return 0; - - set_dst = SET_DEST (pat); - } - else - { - gcc_assert (GET_CODE (pat) == SET); - set_dst = SET_DEST (pat); - } - - emit_insn (pat); - emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target), - gen_rtx_fmt_ee (comparison, QImode, - set_dst, - const0_rtx))); - - return SUBREG_REG (target); -} - -static rtx -ix86_expand_round_builtin (const struct builtin_description *d, - tree exp, rtx target) -{ - rtx pat; - unsigned int i, nargs; - struct - { - rtx op; - machine_mode mode; - } args[6]; - enum insn_code icode = d->icode; - const struct insn_data_d *insn_p = &insn_data[icode]; - machine_mode tmode = insn_p->operand[0].mode; - unsigned int nargs_constant = 0; - unsigned int redundant_embed_rnd = 0; - - switch ((enum ix86_builtin_func_type) d->flag) - { - case UINT64_FTYPE_V2DF_INT: - case UINT64_FTYPE_V4SF_INT: - case UINT_FTYPE_V2DF_INT: - case UINT_FTYPE_V4SF_INT: - case INT64_FTYPE_V2DF_INT: - case INT64_FTYPE_V4SF_INT: - case INT_FTYPE_V2DF_INT: - case INT_FTYPE_V4SF_INT: - nargs = 2; - break; - case V4SF_FTYPE_V4SF_UINT_INT: - case V4SF_FTYPE_V4SF_UINT64_INT: - case V2DF_FTYPE_V2DF_UINT64_INT: - case V4SF_FTYPE_V4SF_INT_INT: - case V4SF_FTYPE_V4SF_INT64_INT: - case V2DF_FTYPE_V2DF_INT64_INT: - case V4SF_FTYPE_V4SF_V4SF_INT: - case V2DF_FTYPE_V2DF_V2DF_INT: - case V4SF_FTYPE_V4SF_V2DF_INT: - case V2DF_FTYPE_V2DF_V4SF_INT: - nargs = 3; - break; - case V8SF_FTYPE_V8DF_V8SF_QI_INT: - case V8DF_FTYPE_V8DF_V8DF_QI_INT: - case V8SI_FTYPE_V8DF_V8SI_QI_INT: - case V8DI_FTYPE_V8DF_V8DI_QI_INT: - case V8SF_FTYPE_V8DI_V8SF_QI_INT: - case V8DF_FTYPE_V8DI_V8DF_QI_INT: - case V16SF_FTYPE_V16SF_V16SF_HI_INT: - case V8DI_FTYPE_V8SF_V8DI_QI_INT: - case V16SF_FTYPE_V16SI_V16SF_HI_INT: - case V16SI_FTYPE_V16SF_V16SI_HI_INT: - case V8DF_FTYPE_V8SF_V8DF_QI_INT: - case V16SF_FTYPE_V16HI_V16SF_HI_INT: - case V2DF_FTYPE_V2DF_V2DF_V2DF_INT: - case V4SF_FTYPE_V4SF_V4SF_V4SF_INT: - nargs = 4; - break; - case V4SF_FTYPE_V4SF_V4SF_INT_INT: - case V2DF_FTYPE_V2DF_V2DF_INT_INT: - nargs_constant = 2; - nargs = 4; - break; - case INT_FTYPE_V4SF_V4SF_INT_INT: - case INT_FTYPE_V2DF_V2DF_INT_INT: - return ix86_expand_sse_comi_round (d, exp, target); - case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT: - case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT: - case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT: - case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT: - case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT: - case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT: - case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT: - case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT: - nargs = 5; - break; - case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT: - case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT: - nargs_constant = 4; - nargs = 5; - break; - case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT: - case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT: - case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT: - case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT: - nargs_constant = 3; - nargs = 5; - break; - case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT: - case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT: - case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT: - case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT: - case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT: - case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT: - nargs = 6; - nargs_constant = 4; - break; - case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT: - case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT: - case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT: - case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT: - nargs = 6; - nargs_constant = 3; - break; - default: - gcc_unreachable (); - } - gcc_assert (nargs <= ARRAY_SIZE (args)); - - if (optimize - || target == 0 - || GET_MODE 
(target) != tmode - || !insn_p->operand[0].predicate (target, tmode)) - target = gen_reg_rtx (tmode); - - for (i = 0; i < nargs; i++) - { - tree arg = CALL_EXPR_ARG (exp, i); - rtx op = expand_normal (arg); - machine_mode mode = insn_p->operand[i + 1].mode; - bool match = insn_p->operand[i + 1].predicate (op, mode); - - if (i == nargs - nargs_constant) - { - if (!match) - { - switch (icode) - { - case CODE_FOR_avx512f_getmantv8df_mask_round: - case CODE_FOR_avx512f_getmantv16sf_mask_round: - case CODE_FOR_avx512f_vgetmantv2df_round: - case CODE_FOR_avx512f_vgetmantv2df_mask_round: - case CODE_FOR_avx512f_vgetmantv4sf_round: - case CODE_FOR_avx512f_vgetmantv4sf_mask_round: - error ("the immediate argument must be a 4-bit immediate"); - return const0_rtx; - case CODE_FOR_avx512f_cmpv8df3_mask_round: - case CODE_FOR_avx512f_cmpv16sf3_mask_round: - case CODE_FOR_avx512f_vmcmpv2df3_mask_round: - case CODE_FOR_avx512f_vmcmpv4sf3_mask_round: - error ("the immediate argument must be a 5-bit immediate"); - return const0_rtx; - default: - error ("the immediate argument must be an 8-bit immediate"); - return const0_rtx; - } - } - } - else if (i == nargs-1) - { - if (!insn_p->operand[nargs].predicate (op, SImode)) - { - error ("incorrect rounding operand"); - return const0_rtx; - } - - /* If there is no rounding use normal version of the pattern. */ - if (INTVAL (op) == NO_ROUND) - redundant_embed_rnd = 1; - } - else - { - if (VECTOR_MODE_P (mode)) - op = safe_vector_operand (op, mode); - - op = fixup_modeless_constant (op, mode); - - if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) - { - if (optimize || !match) - op = copy_to_mode_reg (mode, op); - } - else - { - op = copy_to_reg (op); - op = lowpart_subreg (mode, op, GET_MODE (op)); - } - } - - args[i].op = op; - args[i].mode = mode; - } - - switch (nargs) - { - case 1: - pat = GEN_FCN (icode) (target, args[0].op); - break; - case 2: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op); - break; - case 3: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, - args[2].op); - break; - case 4: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, - args[2].op, args[3].op); - break; - case 5: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, - args[2].op, args[3].op, args[4].op); - break; - case 6: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, - args[2].op, args[3].op, args[4].op, - args[5].op); - break; - default: - gcc_unreachable (); - } - - if (!pat) - return 0; - - if (redundant_embed_rnd) - pat = ix86_erase_embedded_rounding (pat); - - emit_insn (pat); - return target; -} - -/* Subroutine of ix86_expand_builtin to take care of special insns - with variable number of operands. 
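/* In ix86_expand_round_builtin above the rounding control is always the
   last operand; when it is NO_ROUND (_MM_FROUND_CUR_DIRECTION at the user
   level) the embedded-rounding wrapper is redundant and the plain pattern
   is used instead.  A sketch, assuming AVX-512F (-mavx512f); the function
   name is illustrative.  */
#include <immintrin.h>

__m512d
add_with_rounding (__m512d a, __m512d b)
{
  /* Explicit static rounding plus SAE: keeps the embedded-rounding form.  */
  __m512d t = _mm512_add_round_pd (a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
  /* Current MXCSR rounding: expands through the ordinary add pattern.  */
  return _mm512_add_round_pd (t, b, _MM_FROUND_CUR_DIRECTION);
}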
*/ - -static rtx -ix86_expand_special_args_builtin (const struct builtin_description *d, - tree exp, rtx target) -{ - tree arg; - rtx pat, op; - unsigned int i, nargs, arg_adjust, memory; - bool aligned_mem = false; - struct - { - rtx op; - machine_mode mode; - } args[3]; - enum insn_code icode = d->icode; - bool last_arg_constant = false; - const struct insn_data_d *insn_p = &insn_data[icode]; - machine_mode tmode = insn_p->operand[0].mode; - enum { load, store } klass; - - switch ((enum ix86_builtin_func_type) d->flag) - { - case VOID_FTYPE_VOID: - emit_insn (GEN_FCN (icode) (target)); - return 0; - case VOID_FTYPE_UINT64: - case VOID_FTYPE_UNSIGNED: - nargs = 0; - klass = store; - memory = 0; - break; - - case INT_FTYPE_VOID: - case USHORT_FTYPE_VOID: - case UINT64_FTYPE_VOID: - case UINT_FTYPE_VOID: - case UNSIGNED_FTYPE_VOID: - nargs = 0; - klass = load; - memory = 0; - break; - case UINT64_FTYPE_PUNSIGNED: - case V2DI_FTYPE_PV2DI: - case V4DI_FTYPE_PV4DI: - case V32QI_FTYPE_PCCHAR: - case V16QI_FTYPE_PCCHAR: - case V8SF_FTYPE_PCV4SF: - case V8SF_FTYPE_PCFLOAT: - case V4SF_FTYPE_PCFLOAT: - case V4DF_FTYPE_PCV2DF: - case V4DF_FTYPE_PCDOUBLE: - case V2DF_FTYPE_PCDOUBLE: - case VOID_FTYPE_PVOID: - case V8DI_FTYPE_PV8DI: - nargs = 1; - klass = load; - memory = 0; - switch (icode) - { - case CODE_FOR_sse4_1_movntdqa: - case CODE_FOR_avx2_movntdqa: - case CODE_FOR_avx512f_movntdqa: - aligned_mem = true; - break; - default: - break; - } - break; - case VOID_FTYPE_PV2SF_V4SF: - case VOID_FTYPE_PV8DI_V8DI: - case VOID_FTYPE_PV4DI_V4DI: - case VOID_FTYPE_PV2DI_V2DI: - case VOID_FTYPE_PCHAR_V32QI: - case VOID_FTYPE_PCHAR_V16QI: - case VOID_FTYPE_PFLOAT_V16SF: - case VOID_FTYPE_PFLOAT_V8SF: - case VOID_FTYPE_PFLOAT_V4SF: - case VOID_FTYPE_PDOUBLE_V8DF: - case VOID_FTYPE_PDOUBLE_V4DF: - case VOID_FTYPE_PDOUBLE_V2DF: - case VOID_FTYPE_PLONGLONG_LONGLONG: - case VOID_FTYPE_PULONGLONG_ULONGLONG: - case VOID_FTYPE_PUNSIGNED_UNSIGNED: - case VOID_FTYPE_PINT_INT: - nargs = 1; - klass = store; - /* Reserve memory operand for target. */ - memory = ARRAY_SIZE (args); - switch (icode) - { - /* These builtins and instructions require the memory - to be properly aligned. 
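/* aligned_mem is set above for the MOVNTDQA loads and (below) for the
   MOVNT* stores because the non-temporal instructions fault on misaligned
   operands; the expander therefore raises MEM_ALIGN to the mode alignment.
   A user-level sketch, assuming AVX2 (-mavx2); buffer and function names
   are illustrative.  */
#include <immintrin.h>

static int buf[8] __attribute__ ((aligned (32)));

__m256i
stream_round_trip (__m256i v)
{
  _mm256_stream_si256 ((__m256i *) buf, v);            /* VMOVNTDQ: 32-byte aligned  */
  _mm_sfence ();                                       /* order the WC store         */
  return _mm256_stream_load_si256 ((__m256i *) buf);   /* VMOVNTDQA: 32-byte aligned */
}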
*/ - case CODE_FOR_avx_movntv4di: - case CODE_FOR_sse2_movntv2di: - case CODE_FOR_avx_movntv8sf: - case CODE_FOR_sse_movntv4sf: - case CODE_FOR_sse4a_vmmovntv4sf: - case CODE_FOR_avx_movntv4df: - case CODE_FOR_sse2_movntv2df: - case CODE_FOR_sse4a_vmmovntv2df: - case CODE_FOR_sse2_movntidi: - case CODE_FOR_sse_movntq: - case CODE_FOR_sse2_movntisi: - case CODE_FOR_avx512f_movntv16sf: - case CODE_FOR_avx512f_movntv8df: - case CODE_FOR_avx512f_movntv8di: - aligned_mem = true; - break; - default: - break; - } - break; - case VOID_FTYPE_PVOID_PCVOID: - nargs = 1; - klass = store; - memory = 0; - - break; - case V4SF_FTYPE_V4SF_PCV2SF: - case V2DF_FTYPE_V2DF_PCDOUBLE: - nargs = 2; - klass = load; - memory = 1; - break; - case V8SF_FTYPE_PCV8SF_V8SI: - case V4DF_FTYPE_PCV4DF_V4DI: - case V4SF_FTYPE_PCV4SF_V4SI: - case V2DF_FTYPE_PCV2DF_V2DI: - case V8SI_FTYPE_PCV8SI_V8SI: - case V4DI_FTYPE_PCV4DI_V4DI: - case V4SI_FTYPE_PCV4SI_V4SI: - case V2DI_FTYPE_PCV2DI_V2DI: - case VOID_FTYPE_INT_INT64: - nargs = 2; - klass = load; - memory = 0; - break; - case VOID_FTYPE_PV8DF_V8DF_UQI: - case VOID_FTYPE_PV4DF_V4DF_UQI: - case VOID_FTYPE_PV2DF_V2DF_UQI: - case VOID_FTYPE_PV16SF_V16SF_UHI: - case VOID_FTYPE_PV8SF_V8SF_UQI: - case VOID_FTYPE_PV4SF_V4SF_UQI: - case VOID_FTYPE_PV8DI_V8DI_UQI: - case VOID_FTYPE_PV4DI_V4DI_UQI: - case VOID_FTYPE_PV2DI_V2DI_UQI: - case VOID_FTYPE_PV16SI_V16SI_UHI: - case VOID_FTYPE_PV8SI_V8SI_UQI: - case VOID_FTYPE_PV4SI_V4SI_UQI: - case VOID_FTYPE_PV64QI_V64QI_UDI: - case VOID_FTYPE_PV32HI_V32HI_USI: - case VOID_FTYPE_PV32QI_V32QI_USI: - case VOID_FTYPE_PV16QI_V16QI_UHI: - case VOID_FTYPE_PV16HI_V16HI_UHI: - case VOID_FTYPE_PV8HI_V8HI_UQI: - switch (icode) - { - /* These builtins and instructions require the memory - to be properly aligned. 
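/* The PCV*DF_V*DI / PCV*SF_V*SI signatures above are the AVX VMASKMOV
   loads: only the lanes whose mask sign bit is set are touched, so no
   alignment beyond the element is needed and masked-off elements never
   fault.  A sketch, assuming AVX (-mavx); names are illustrative.  */
#include <immintrin.h>

__m256d
load_first_three (const double *p)
{
  /* Sign bit of each 64-bit lane selects whether that element is loaded.  */
  __m256i mask = _mm256_setr_epi64x (-1, -1, -1, 0);
  return _mm256_maskload_pd (p, mask);
}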
*/ - case CODE_FOR_avx512f_storev16sf_mask: - case CODE_FOR_avx512f_storev16si_mask: - case CODE_FOR_avx512f_storev8df_mask: - case CODE_FOR_avx512f_storev8di_mask: - case CODE_FOR_avx512vl_storev8sf_mask: - case CODE_FOR_avx512vl_storev8si_mask: - case CODE_FOR_avx512vl_storev4df_mask: - case CODE_FOR_avx512vl_storev4di_mask: - case CODE_FOR_avx512vl_storev4sf_mask: - case CODE_FOR_avx512vl_storev4si_mask: - case CODE_FOR_avx512vl_storev2df_mask: - case CODE_FOR_avx512vl_storev2di_mask: - aligned_mem = true; - break; - default: - break; - } - /* FALLTHRU */ - case VOID_FTYPE_PV8SF_V8SI_V8SF: - case VOID_FTYPE_PV4DF_V4DI_V4DF: - case VOID_FTYPE_PV4SF_V4SI_V4SF: - case VOID_FTYPE_PV2DF_V2DI_V2DF: - case VOID_FTYPE_PV8SI_V8SI_V8SI: - case VOID_FTYPE_PV4DI_V4DI_V4DI: - case VOID_FTYPE_PV4SI_V4SI_V4SI: - case VOID_FTYPE_PV2DI_V2DI_V2DI: - case VOID_FTYPE_PV8SI_V8DI_UQI: - case VOID_FTYPE_PV8HI_V8DI_UQI: - case VOID_FTYPE_PV16HI_V16SI_UHI: - case VOID_FTYPE_PV16QI_V8DI_UQI: - case VOID_FTYPE_PV16QI_V16SI_UHI: - case VOID_FTYPE_PV4SI_V4DI_UQI: - case VOID_FTYPE_PV4SI_V2DI_UQI: - case VOID_FTYPE_PV8HI_V4DI_UQI: - case VOID_FTYPE_PV8HI_V2DI_UQI: - case VOID_FTYPE_PV8HI_V8SI_UQI: - case VOID_FTYPE_PV8HI_V4SI_UQI: - case VOID_FTYPE_PV16QI_V4DI_UQI: - case VOID_FTYPE_PV16QI_V2DI_UQI: - case VOID_FTYPE_PV16QI_V8SI_UQI: - case VOID_FTYPE_PV16QI_V4SI_UQI: - case VOID_FTYPE_PCHAR_V64QI_UDI: - case VOID_FTYPE_PCHAR_V32QI_USI: - case VOID_FTYPE_PCHAR_V16QI_UHI: - case VOID_FTYPE_PSHORT_V32HI_USI: - case VOID_FTYPE_PSHORT_V16HI_UHI: - case VOID_FTYPE_PSHORT_V8HI_UQI: - case VOID_FTYPE_PINT_V16SI_UHI: - case VOID_FTYPE_PINT_V8SI_UQI: - case VOID_FTYPE_PINT_V4SI_UQI: - case VOID_FTYPE_PINT64_V8DI_UQI: - case VOID_FTYPE_PINT64_V4DI_UQI: - case VOID_FTYPE_PINT64_V2DI_UQI: - case VOID_FTYPE_PDOUBLE_V8DF_UQI: - case VOID_FTYPE_PDOUBLE_V4DF_UQI: - case VOID_FTYPE_PDOUBLE_V2DF_UQI: - case VOID_FTYPE_PFLOAT_V16SF_UHI: - case VOID_FTYPE_PFLOAT_V8SF_UQI: - case VOID_FTYPE_PFLOAT_V4SF_UQI: - case VOID_FTYPE_PV32QI_V32HI_USI: - case VOID_FTYPE_PV16QI_V16HI_UHI: - case VOID_FTYPE_PV8QI_V8HI_UQI: - nargs = 2; - klass = store; - /* Reserve memory operand for target. */ - memory = ARRAY_SIZE (args); - break; - case V4SF_FTYPE_PCV4SF_V4SF_UQI: - case V8SF_FTYPE_PCV8SF_V8SF_UQI: - case V16SF_FTYPE_PCV16SF_V16SF_UHI: - case V4SI_FTYPE_PCV4SI_V4SI_UQI: - case V8SI_FTYPE_PCV8SI_V8SI_UQI: - case V16SI_FTYPE_PCV16SI_V16SI_UHI: - case V2DF_FTYPE_PCV2DF_V2DF_UQI: - case V4DF_FTYPE_PCV4DF_V4DF_UQI: - case V8DF_FTYPE_PCV8DF_V8DF_UQI: - case V2DI_FTYPE_PCV2DI_V2DI_UQI: - case V4DI_FTYPE_PCV4DI_V4DI_UQI: - case V8DI_FTYPE_PCV8DI_V8DI_UQI: - case V64QI_FTYPE_PCV64QI_V64QI_UDI: - case V32HI_FTYPE_PCV32HI_V32HI_USI: - case V32QI_FTYPE_PCV32QI_V32QI_USI: - case V16QI_FTYPE_PCV16QI_V16QI_UHI: - case V16HI_FTYPE_PCV16HI_V16HI_UHI: - case V8HI_FTYPE_PCV8HI_V8HI_UQI: - switch (icode) - { - /* These builtins and instructions require the memory - to be properly aligned. 
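/* Only the avx512*_store*_mask icodes listed above set aligned_mem: the
   aligned masked-store intrinsics fault on a misaligned pointer, while the
   *_storeu_* variants (expanded through different icodes) carry no
   alignment requirement.  A sketch, assuming AVX-512F and AVX-512VL
   (-mavx512f -mavx512vl); names are illustrative.  */
#include <immintrin.h>

void
store_low_pair (double *aligned32, double *anywhere, __m256d v)
{
  __mmask8 k = 0x3;                          /* write elements 0 and 1 only */
  _mm256_mask_store_pd (aligned32, k, v);    /* needs 32-byte alignment     */
  _mm256_mask_storeu_pd (anywhere, k, v);    /* no alignment requirement    */
}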
*/ - case CODE_FOR_avx512f_loadv16sf_mask: - case CODE_FOR_avx512f_loadv16si_mask: - case CODE_FOR_avx512f_loadv8df_mask: - case CODE_FOR_avx512f_loadv8di_mask: - case CODE_FOR_avx512vl_loadv8sf_mask: - case CODE_FOR_avx512vl_loadv8si_mask: - case CODE_FOR_avx512vl_loadv4df_mask: - case CODE_FOR_avx512vl_loadv4di_mask: - case CODE_FOR_avx512vl_loadv4sf_mask: - case CODE_FOR_avx512vl_loadv4si_mask: - case CODE_FOR_avx512vl_loadv2df_mask: - case CODE_FOR_avx512vl_loadv2di_mask: - case CODE_FOR_avx512bw_loadv64qi_mask: - case CODE_FOR_avx512vl_loadv32qi_mask: - case CODE_FOR_avx512vl_loadv16qi_mask: - case CODE_FOR_avx512bw_loadv32hi_mask: - case CODE_FOR_avx512vl_loadv16hi_mask: - case CODE_FOR_avx512vl_loadv8hi_mask: - aligned_mem = true; - break; - default: - break; - } - /* FALLTHRU */ - case V64QI_FTYPE_PCCHAR_V64QI_UDI: - case V32QI_FTYPE_PCCHAR_V32QI_USI: - case V16QI_FTYPE_PCCHAR_V16QI_UHI: - case V32HI_FTYPE_PCSHORT_V32HI_USI: - case V16HI_FTYPE_PCSHORT_V16HI_UHI: - case V8HI_FTYPE_PCSHORT_V8HI_UQI: - case V16SI_FTYPE_PCINT_V16SI_UHI: - case V8SI_FTYPE_PCINT_V8SI_UQI: - case V4SI_FTYPE_PCINT_V4SI_UQI: - case V8DI_FTYPE_PCINT64_V8DI_UQI: - case V4DI_FTYPE_PCINT64_V4DI_UQI: - case V2DI_FTYPE_PCINT64_V2DI_UQI: - case V8DF_FTYPE_PCDOUBLE_V8DF_UQI: - case V4DF_FTYPE_PCDOUBLE_V4DF_UQI: - case V2DF_FTYPE_PCDOUBLE_V2DF_UQI: - case V16SF_FTYPE_PCFLOAT_V16SF_UHI: - case V8SF_FTYPE_PCFLOAT_V8SF_UQI: - case V4SF_FTYPE_PCFLOAT_V4SF_UQI: - nargs = 3; - klass = load; - memory = 0; - break; - case VOID_FTYPE_UINT_UINT_UINT: - case VOID_FTYPE_UINT64_UINT_UINT: - case UCHAR_FTYPE_UINT_UINT_UINT: - case UCHAR_FTYPE_UINT64_UINT_UINT: - nargs = 3; - klass = load; - memory = ARRAY_SIZE (args); - last_arg_constant = true; - break; - default: - gcc_unreachable (); - } - - gcc_assert (nargs <= ARRAY_SIZE (args)); - - if (klass == store) - { - arg = CALL_EXPR_ARG (exp, 0); - op = expand_normal (arg); - gcc_assert (target == 0); - if (memory) - { - op = ix86_zero_extend_to_Pmode (op); - target = gen_rtx_MEM (tmode, op); - /* target at this point has just BITS_PER_UNIT MEM_ALIGN - on it. Try to improve it using get_pointer_alignment, - and if the special builtin is one that requires strict - mode alignment, also from it's GET_MODE_ALIGNMENT. - Failure to do so could lead to ix86_legitimate_combined_insn - rejecting all changes to such insns. */ - unsigned int align = get_pointer_alignment (arg); - if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode)) - align = GET_MODE_ALIGNMENT (tmode); - if (MEM_ALIGN (target) < align) - set_mem_align (target, align); - } - else - target = force_reg (tmode, op); - arg_adjust = 1; - } - else - { - arg_adjust = 0; - if (optimize - || target == 0 - || !register_operand (target, tmode) - || GET_MODE (target) != tmode) - target = gen_reg_rtx (tmode); - } - - for (i = 0; i < nargs; i++) - { - machine_mode mode = insn_p->operand[i + 1].mode; - bool match; - - arg = CALL_EXPR_ARG (exp, i + arg_adjust); - op = expand_normal (arg); - match = insn_p->operand[i + 1].predicate (op, mode); - - if (last_arg_constant && (i + 1) == nargs) - { - if (!match) - { - if (icode == CODE_FOR_lwp_lwpvalsi3 - || icode == CODE_FOR_lwp_lwpinssi3 - || icode == CODE_FOR_lwp_lwpvaldi3 - || icode == CODE_FOR_lwp_lwpinsdi3) - error ("the last argument must be a 32-bit immediate"); - else - error ("the last argument must be an 8-bit immediate"); - return const0_rtx; - } - } - else - { - if (i == memory) - { - /* This must be the memory operand. 
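/* The expander above improves MEM_ALIGN from get_pointer_alignment, so
   whatever alignment the middle end can prove about the pointer argument
   carries over to the MEM built for the builtin.  One way user code can
   supply that information is the generic __builtin_assume_aligned hint,
   sketched here assuming AVX-512F (-mavx512f); treat the pairing with this
   particular intrinsic as an assumption, not something this patch states.  */
#include <immintrin.h>

__m512d
load_block (const double *p)
{
  const double *q = (const double *) __builtin_assume_aligned (p, 64);
  /* With the hint the aligned masked load is safe by construction.  */
  return _mm512_mask_load_pd (_mm512_setzero_pd (), 0xFF, q);
}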
*/ - op = ix86_zero_extend_to_Pmode (op); - op = gen_rtx_MEM (mode, op); - /* op at this point has just BITS_PER_UNIT MEM_ALIGN - on it. Try to improve it using get_pointer_alignment, - and if the special builtin is one that requires strict - mode alignment, also from it's GET_MODE_ALIGNMENT. - Failure to do so could lead to ix86_legitimate_combined_insn - rejecting all changes to such insns. */ - unsigned int align = get_pointer_alignment (arg); - if (aligned_mem && align < GET_MODE_ALIGNMENT (mode)) - align = GET_MODE_ALIGNMENT (mode); - if (MEM_ALIGN (op) < align) - set_mem_align (op, align); - } - else - { - /* This must be register. */ - if (VECTOR_MODE_P (mode)) - op = safe_vector_operand (op, mode); - - op = fixup_modeless_constant (op, mode); - - if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) - op = copy_to_mode_reg (mode, op); - else - { - op = copy_to_reg (op); - op = lowpart_subreg (mode, op, GET_MODE (op)); - } - } - } - - args[i].op = op; - args[i].mode = mode; - } - - switch (nargs) - { - case 0: - pat = GEN_FCN (icode) (target); - break; - case 1: - pat = GEN_FCN (icode) (target, args[0].op); - break; - case 2: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op); - break; - case 3: - pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op); - break; - default: - gcc_unreachable (); - } - - if (! pat) - return 0; - emit_insn (pat); - return klass == store ? 0 : target; -} - -/* Return the integer constant in ARG. Constrain it to be in the range - of the subparts of VEC_TYPE; issue an error if not. */ - -static int -get_element_number (tree vec_type, tree arg) -{ - unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1; - - if (!tree_fits_uhwi_p (arg) - || (elt = tree_to_uhwi (arg), elt > max)) - { - error ("selector must be an integer constant in the range 0..%wi", max); - return 0; - } - - return elt; -} - -/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around - ix86_expand_vector_init. We DO have language-level syntax for this, in - the form of (type){ init-list }. Except that since we can't place emms - instructions from inside the compiler, we can't allow the use of MMX - registers unless the user explicitly asks for it. So we do *not* define - vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead - we have builtins invoked by mmintrin.h that gives us license to emit - these sorts of instructions. */ - -static rtx -ix86_expand_vec_init_builtin (tree type, tree exp, rtx target) -{ - machine_mode tmode = TYPE_MODE (type); - machine_mode inner_mode = GET_MODE_INNER (tmode); - int i, n_elt = GET_MODE_NUNITS (tmode); - rtvec v = rtvec_alloc (n_elt); - - gcc_assert (VECTOR_MODE_P (tmode)); - gcc_assert (call_expr_nargs (exp) == n_elt); - - for (i = 0; i < n_elt; ++i) - { - rtx x = expand_normal (CALL_EXPR_ARG (exp, i)); - RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x); - } - - if (!target || !register_operand (target, tmode)) - target = gen_reg_rtx (tmode); - - ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v)); - return target; -} - -/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around - ix86_expand_vector_extract. They would be redundant (for non-MMX) if we - had a language-level syntax for referencing vector elements. 
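/* get_element_number above requires the selector to be an integer constant
   no larger than the number of vector elements, which is exactly the
   constraint the vec_ext/vec_set builtins behind _mm_extract_* and
   _mm_insert_* place on their last argument.  A sketch, assuming SSE2
   (-msse2); the function name is illustrative.  */
#include <emmintrin.h>

int
third_halfword (__m128i v)
{
  /* The selector must be a literal in 0..7; anything else is diagnosed.  */
  return _mm_extract_epi16 (v, 2);
}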
*/ - -static rtx -ix86_expand_vec_ext_builtin (tree exp, rtx target) -{ - machine_mode tmode, mode0; - tree arg0, arg1; - int elt; - rtx op0; - - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - - op0 = expand_normal (arg0); - elt = get_element_number (TREE_TYPE (arg0), arg1); - - tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); - mode0 = TYPE_MODE (TREE_TYPE (arg0)); - gcc_assert (VECTOR_MODE_P (mode0)); - - op0 = force_reg (mode0, op0); - - if (optimize || !target || !register_operand (target, tmode)) - target = gen_reg_rtx (tmode); - - ix86_expand_vector_extract (true, target, op0, elt); - - return target; -} - -/* A subroutine of ix86_expand_builtin. These builtins are a wrapper around - ix86_expand_vector_set. They would be redundant (for non-MMX) if we had - a language-level syntax for referencing vector elements. */ - -static rtx -ix86_expand_vec_set_builtin (tree exp) -{ - machine_mode tmode, mode1; - tree arg0, arg1, arg2; - int elt; - rtx op0, op1, target; - - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - arg2 = CALL_EXPR_ARG (exp, 2); - - tmode = TYPE_MODE (TREE_TYPE (arg0)); - mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); - gcc_assert (VECTOR_MODE_P (tmode)); - - op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL); - op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL); - elt = get_element_number (TREE_TYPE (arg0), arg2); - - if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode) - op1 = convert_modes (mode1, GET_MODE (op1), op1, true); - - op0 = force_reg (tmode, op0); - op1 = force_reg (mode1, op1); - - /* OP0 is the source of these builtin functions and shouldn't be - modified. Create a copy, use it and return it as target. */ - target = gen_reg_rtx (tmode); - emit_move_insn (target, op0); - ix86_expand_vector_set (true, target, op1, elt); - - return target; -} - -/* Expand an expression EXP that calls a built-in function, - with result going to TARGET if that's convenient - (and in mode MODE if that's convenient). - SUBTARGET may be used as the target for computing one of EXP's operands. - IGNORE is nonzero if the value is to be ignored. */ - -static rtx -ix86_expand_builtin (tree exp, rtx target, rtx subtarget, - machine_mode mode, int ignore) -{ - size_t i; - enum insn_code icode, icode2; - tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); - tree arg0, arg1, arg2, arg3, arg4; - rtx op0, op1, op2, op3, op4, pat, pat2, insn; - machine_mode mode0, mode1, mode2, mode3, mode4; - unsigned int fcode = DECL_FUNCTION_CODE (fndecl); - - /* For CPU builtins that can be folded, fold first and expand the fold. */ - switch (fcode) - { - case IX86_BUILTIN_CPU_INIT: - { - /* Make it call __cpu_indicator_init in libgcc. */ - tree call_expr, fndecl, type; - type = build_function_type_list (integer_type_node, NULL_TREE); - fndecl = build_fn_decl ("__cpu_indicator_init", type); - call_expr = build_call_expr (fndecl, 0); - return expand_expr (call_expr, target, mode, EXPAND_NORMAL); - } - case IX86_BUILTIN_CPU_IS: - case IX86_BUILTIN_CPU_SUPPORTS: - { - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree fold_expr = fold_builtin_cpu (fndecl, &arg0); - gcc_assert (fold_expr != NULL_TREE); - return expand_expr (fold_expr, target, mode, EXPAND_NORMAL); - } - } - - HOST_WIDE_INT isa = ix86_isa_flags; - HOST_WIDE_INT isa2 = ix86_isa_flags2; - HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa; - HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2; - /* The general case is we require all the ISAs specified in bisa{,2} - to be enabled. 
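/* IX86_BUILTIN_CPU_IS and IX86_BUILTIN_CPU_SUPPORTS are folded by
   fold_builtin_cpu and the folded tree is what gets expanded, so at run
   time they are plain reads of the model data that __cpu_indicator_init
   fills in.  Typical use of the corresponding GCC builtins:  */
#include <stdio.h>

int
main (void)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("avx2"))
    puts ("avx2 path");
  else
    puts ("generic path");
  return __builtin_cpu_is ("intel") ? 0 : 1;
}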
- The exceptions are: - OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A - OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 - OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4 - where for each this pair it is sufficient if either of the ISAs is - enabled, plus if it is ored with other options also those others. */ - if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) - == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) - && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0) - isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A); - if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) - == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) - && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0) - isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32); - if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) - == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) - && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0) - isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4); - if ((bisa & isa) != bisa || (bisa2 & isa2) != bisa2) - { - bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT; - if (TARGET_ABI_X32) - bisa |= OPTION_MASK_ABI_X32; - else - bisa |= OPTION_MASK_ABI_64; - char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL, - (enum fpmath_unit) 0, false, add_abi_p); - if (!opts) - error ("%qE needs unknown isa option", fndecl); - else - { - gcc_assert (opts != NULL); - error ("%qE needs isa option %s", fndecl, opts); - free (opts); - } - return expand_call (exp, target, ignore); - } - - switch (fcode) - { - case IX86_BUILTIN_MASKMOVQ: - case IX86_BUILTIN_MASKMOVDQU: - icode = (fcode == IX86_BUILTIN_MASKMOVQ - ? CODE_FOR_mmx_maskmovq - : CODE_FOR_sse2_maskmovdqu); - /* Note the arg order is different from the operand order. */ - arg1 = CALL_EXPR_ARG (exp, 0); - arg2 = CALL_EXPR_ARG (exp, 1); - arg0 = CALL_EXPR_ARG (exp, 2); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - op2 = expand_normal (arg2); - mode0 = insn_data[icode].operand[0].mode; - mode1 = insn_data[icode].operand[1].mode; - mode2 = insn_data[icode].operand[2].mode; - - op0 = ix86_zero_extend_to_Pmode (op0); - op0 = gen_rtx_MEM (mode1, op0); - - if (!insn_data[icode].operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if (!insn_data[icode].operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - if (!insn_data[icode].operand[2].predicate (op2, mode2)) - op2 = copy_to_mode_reg (mode2, op2); - pat = GEN_FCN (icode) (op0, op1, op2); - if (! 
pat) - return 0; - emit_insn (pat); - return 0; - - case IX86_BUILTIN_LDMXCSR: - op0 = expand_normal (CALL_EXPR_ARG (exp, 0)); - target = assign_386_stack_local (SImode, SLOT_TEMP); - emit_move_insn (target, op0); - emit_insn (gen_sse_ldmxcsr (target)); - return 0; - - case IX86_BUILTIN_STMXCSR: - target = assign_386_stack_local (SImode, SLOT_TEMP); - emit_insn (gen_sse_stmxcsr (target)); - return copy_to_mode_reg (SImode, target); - - case IX86_BUILTIN_CLFLUSH: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - icode = CODE_FOR_sse2_clflush; - if (!insn_data[icode].operand[0].predicate (op0, Pmode)) - op0 = ix86_zero_extend_to_Pmode (op0); - - emit_insn (gen_sse2_clflush (op0)); - return 0; - - case IX86_BUILTIN_CLWB: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - icode = CODE_FOR_clwb; - if (!insn_data[icode].operand[0].predicate (op0, Pmode)) - op0 = ix86_zero_extend_to_Pmode (op0); - - emit_insn (gen_clwb (op0)); - return 0; - - case IX86_BUILTIN_CLFLUSHOPT: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - icode = CODE_FOR_clflushopt; - if (!insn_data[icode].operand[0].predicate (op0, Pmode)) - op0 = ix86_zero_extend_to_Pmode (op0); - - emit_insn (gen_clflushopt (op0)); - return 0; - - case IX86_BUILTIN_MONITOR: - case IX86_BUILTIN_MONITORX: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - arg2 = CALL_EXPR_ARG (exp, 2); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - op2 = expand_normal (arg2); - if (!REG_P (op0)) - op0 = ix86_zero_extend_to_Pmode (op0); - if (!REG_P (op1)) - op1 = copy_to_mode_reg (SImode, op1); - if (!REG_P (op2)) - op2 = copy_to_mode_reg (SImode, op2); - - emit_insn (fcode == IX86_BUILTIN_MONITOR - ? ix86_gen_monitor (op0, op1, op2) - : ix86_gen_monitorx (op0, op1, op2)); - return 0; - - case IX86_BUILTIN_MWAIT: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - if (!REG_P (op0)) - op0 = copy_to_mode_reg (SImode, op0); - if (!REG_P (op1)) - op1 = copy_to_mode_reg (SImode, op1); - emit_insn (gen_sse3_mwait (op0, op1)); - return 0; - - case IX86_BUILTIN_MWAITX: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - arg2 = CALL_EXPR_ARG (exp, 2); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - op2 = expand_normal (arg2); - if (!REG_P (op0)) - op0 = copy_to_mode_reg (SImode, op0); - if (!REG_P (op1)) - op1 = copy_to_mode_reg (SImode, op1); - if (!REG_P (op2)) - op2 = copy_to_mode_reg (SImode, op2); - emit_insn (gen_mwaitx (op0, op1, op2)); - return 0; - - case IX86_BUILTIN_UMONITOR: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - - op0 = ix86_zero_extend_to_Pmode (op0); - - insn = (TARGET_64BIT - ? 
gen_umonitor_di (op0) - : gen_umonitor_si (op0)); - - emit_insn (insn); - return 0; - - case IX86_BUILTIN_UMWAIT: - case IX86_BUILTIN_TPAUSE: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - - if (!REG_P (op0)) - op0 = copy_to_mode_reg (SImode, op0); - - op1 = force_reg (DImode, op1); - - if (TARGET_64BIT) - { - op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), - NULL, 1, OPTAB_DIRECT); - switch (fcode) - { - case IX86_BUILTIN_UMWAIT: - icode = CODE_FOR_umwait_rex64; - break; - case IX86_BUILTIN_TPAUSE: - icode = CODE_FOR_tpause_rex64; - break; - default: - gcc_unreachable (); - } - - op2 = gen_lowpart (SImode, op2); - op1 = gen_lowpart (SImode, op1); - pat = GEN_FCN (icode) (op0, op1, op2); - } - else - { - switch (fcode) - { - case IX86_BUILTIN_UMWAIT: - icode = CODE_FOR_umwait; - break; - case IX86_BUILTIN_TPAUSE: - icode = CODE_FOR_tpause; - break; - default: - gcc_unreachable (); - } - pat = GEN_FCN (icode) (op0, op1); - } - - if (!pat) - return 0; - - emit_insn (pat); - - if (target == 0 - || !register_operand (target, QImode)) - target = gen_reg_rtx (QImode); - - pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG), - const0_rtx); - emit_insn (gen_rtx_SET (target, pat)); - - return target; - - case IX86_BUILTIN_CLZERO: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - if (!REG_P (op0)) - op0 = ix86_zero_extend_to_Pmode (op0); - emit_insn (ix86_gen_clzero (op0)); - return 0; - - case IX86_BUILTIN_CLDEMOTE: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - icode = CODE_FOR_cldemote; - if (!insn_data[icode].operand[0].predicate (op0, Pmode)) - op0 = ix86_zero_extend_to_Pmode (op0); - - emit_insn (gen_cldemote (op0)); - return 0; - - case IX86_BUILTIN_VEC_INIT_V2SI: - case IX86_BUILTIN_VEC_INIT_V4HI: - case IX86_BUILTIN_VEC_INIT_V8QI: - return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target); - - case IX86_BUILTIN_VEC_EXT_V2DF: - case IX86_BUILTIN_VEC_EXT_V2DI: - case IX86_BUILTIN_VEC_EXT_V4SF: - case IX86_BUILTIN_VEC_EXT_V4SI: - case IX86_BUILTIN_VEC_EXT_V8HI: - case IX86_BUILTIN_VEC_EXT_V2SI: - case IX86_BUILTIN_VEC_EXT_V4HI: - case IX86_BUILTIN_VEC_EXT_V16QI: - return ix86_expand_vec_ext_builtin (exp, target); - - case IX86_BUILTIN_VEC_SET_V2DI: - case IX86_BUILTIN_VEC_SET_V4SF: - case IX86_BUILTIN_VEC_SET_V4SI: - case IX86_BUILTIN_VEC_SET_V8HI: - case IX86_BUILTIN_VEC_SET_V4HI: - case IX86_BUILTIN_VEC_SET_V16QI: - return ix86_expand_vec_set_builtin (exp); - - case IX86_BUILTIN_NANQ: - case IX86_BUILTIN_NANSQ: - return expand_call (exp, target, ignore); - - case IX86_BUILTIN_RDPID: - - op0 = gen_reg_rtx (word_mode); - - if (TARGET_64BIT) - { - insn = gen_rdpid_rex64 (op0); - op0 = convert_to_mode (SImode, op0, 1); - } - else - insn = gen_rdpid (op0); - - emit_insn (insn); - - if (target == 0 - || !register_operand (target, SImode)) - target = gen_reg_rtx (SImode); - - emit_move_insn (target, op0); - return target; - - case IX86_BUILTIN_RDPMC: - case IX86_BUILTIN_RDTSC: - case IX86_BUILTIN_RDTSCP: - case IX86_BUILTIN_XGETBV: - - op0 = gen_reg_rtx (DImode); - op1 = gen_reg_rtx (DImode); - - if (fcode == IX86_BUILTIN_RDPMC) - { - arg0 = CALL_EXPR_ARG (exp, 0); - op2 = expand_normal (arg0); - if (!register_operand (op2, SImode)) - op2 = copy_to_mode_reg (SImode, op2); - - insn = (TARGET_64BIT - ? 
gen_rdpmc_rex64 (op0, op1, op2) - : gen_rdpmc (op0, op2)); - emit_insn (insn); - } - else if (fcode == IX86_BUILTIN_XGETBV) - { - arg0 = CALL_EXPR_ARG (exp, 0); - op2 = expand_normal (arg0); - if (!register_operand (op2, SImode)) - op2 = copy_to_mode_reg (SImode, op2); - - insn = (TARGET_64BIT - ? gen_xgetbv_rex64 (op0, op1, op2) - : gen_xgetbv (op0, op2)); - emit_insn (insn); - } - else if (fcode == IX86_BUILTIN_RDTSC) - { - insn = (TARGET_64BIT - ? gen_rdtsc_rex64 (op0, op1) - : gen_rdtsc (op0)); - emit_insn (insn); - } - else - { - op2 = gen_reg_rtx (SImode); - - insn = (TARGET_64BIT - ? gen_rdtscp_rex64 (op0, op1, op2) - : gen_rdtscp (op0, op2)); - emit_insn (insn); - - arg0 = CALL_EXPR_ARG (exp, 0); - op4 = expand_normal (arg0); - if (!address_operand (op4, VOIDmode)) - { - op4 = convert_memory_address (Pmode, op4); - op4 = copy_addr_to_reg (op4); - } - emit_move_insn (gen_rtx_MEM (SImode, op4), op2); - } - - if (target == 0 - || !register_operand (target, DImode)) - target = gen_reg_rtx (DImode); - - if (TARGET_64BIT) - { - op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32), - op1, 1, OPTAB_DIRECT); - op0 = expand_simple_binop (DImode, IOR, op0, op1, - op0, 1, OPTAB_DIRECT); - } - - emit_move_insn (target, op0); - return target; - - case IX86_BUILTIN_MOVDIR64B: - - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - - op0 = ix86_zero_extend_to_Pmode (op0); - if (!address_operand (op1, VOIDmode)) - { - op1 = convert_memory_address (Pmode, op1); - op1 = copy_addr_to_reg (op1); - } - op1 = gen_rtx_MEM (XImode, op1); - - insn = (TARGET_64BIT - ? gen_movdir64b_di (op0, op1) - : gen_movdir64b_si (op0, op1)); - emit_insn (insn); - return 0; - - case IX86_BUILTIN_FXSAVE: - case IX86_BUILTIN_FXRSTOR: - case IX86_BUILTIN_FXSAVE64: - case IX86_BUILTIN_FXRSTOR64: - case IX86_BUILTIN_FNSTENV: - case IX86_BUILTIN_FLDENV: - mode0 = BLKmode; - switch (fcode) - { - case IX86_BUILTIN_FXSAVE: - icode = CODE_FOR_fxsave; - break; - case IX86_BUILTIN_FXRSTOR: - icode = CODE_FOR_fxrstor; - break; - case IX86_BUILTIN_FXSAVE64: - icode = CODE_FOR_fxsave64; - break; - case IX86_BUILTIN_FXRSTOR64: - icode = CODE_FOR_fxrstor64; - break; - case IX86_BUILTIN_FNSTENV: - icode = CODE_FOR_fnstenv; - break; - case IX86_BUILTIN_FLDENV: - icode = CODE_FOR_fldenv; - break; - default: - gcc_unreachable (); - } - - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - - if (!address_operand (op0, VOIDmode)) - { - op0 = convert_memory_address (Pmode, op0); - op0 = copy_addr_to_reg (op0); - } - op0 = gen_rtx_MEM (mode0, op0); - - pat = GEN_FCN (icode) (op0); - if (pat) - emit_insn (pat); - return 0; - - case IX86_BUILTIN_XSETBV: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - - if (!REG_P (op0)) - op0 = copy_to_mode_reg (SImode, op0); - - op1 = force_reg (DImode, op1); - - if (TARGET_64BIT) - { - op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), - NULL, 1, OPTAB_DIRECT); - - icode = CODE_FOR_xsetbv_rex64; - - op2 = gen_lowpart (SImode, op2); - op1 = gen_lowpart (SImode, op1); - pat = GEN_FCN (icode) (op0, op1, op2); - } - else - { - icode = CODE_FOR_xsetbv; - - pat = GEN_FCN (icode) (op0, op1); - } - if (pat) - emit_insn (pat); - return 0; - - case IX86_BUILTIN_XSAVE: - case IX86_BUILTIN_XRSTOR: - case IX86_BUILTIN_XSAVE64: - case IX86_BUILTIN_XRSTOR64: - case IX86_BUILTIN_XSAVEOPT: - case IX86_BUILTIN_XSAVEOPT64: - case 
IX86_BUILTIN_XSAVES: - case IX86_BUILTIN_XRSTORS: - case IX86_BUILTIN_XSAVES64: - case IX86_BUILTIN_XRSTORS64: - case IX86_BUILTIN_XSAVEC: - case IX86_BUILTIN_XSAVEC64: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - - if (!address_operand (op0, VOIDmode)) - { - op0 = convert_memory_address (Pmode, op0); - op0 = copy_addr_to_reg (op0); - } - op0 = gen_rtx_MEM (BLKmode, op0); - - op1 = force_reg (DImode, op1); - - if (TARGET_64BIT) - { - op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32), - NULL, 1, OPTAB_DIRECT); - switch (fcode) - { - case IX86_BUILTIN_XSAVE: - icode = CODE_FOR_xsave_rex64; - break; - case IX86_BUILTIN_XRSTOR: - icode = CODE_FOR_xrstor_rex64; - break; - case IX86_BUILTIN_XSAVE64: - icode = CODE_FOR_xsave64; - break; - case IX86_BUILTIN_XRSTOR64: - icode = CODE_FOR_xrstor64; - break; - case IX86_BUILTIN_XSAVEOPT: - icode = CODE_FOR_xsaveopt_rex64; - break; - case IX86_BUILTIN_XSAVEOPT64: - icode = CODE_FOR_xsaveopt64; - break; - case IX86_BUILTIN_XSAVES: - icode = CODE_FOR_xsaves_rex64; - break; - case IX86_BUILTIN_XRSTORS: - icode = CODE_FOR_xrstors_rex64; - break; - case IX86_BUILTIN_XSAVES64: - icode = CODE_FOR_xsaves64; - break; - case IX86_BUILTIN_XRSTORS64: - icode = CODE_FOR_xrstors64; - break; - case IX86_BUILTIN_XSAVEC: - icode = CODE_FOR_xsavec_rex64; - break; - case IX86_BUILTIN_XSAVEC64: - icode = CODE_FOR_xsavec64; - break; - default: - gcc_unreachable (); - } - - op2 = gen_lowpart (SImode, op2); - op1 = gen_lowpart (SImode, op1); - pat = GEN_FCN (icode) (op0, op1, op2); - } - else - { - switch (fcode) - { - case IX86_BUILTIN_XSAVE: - icode = CODE_FOR_xsave; - break; - case IX86_BUILTIN_XRSTOR: - icode = CODE_FOR_xrstor; - break; - case IX86_BUILTIN_XSAVEOPT: - icode = CODE_FOR_xsaveopt; - break; - case IX86_BUILTIN_XSAVES: - icode = CODE_FOR_xsaves; - break; - case IX86_BUILTIN_XRSTORS: - icode = CODE_FOR_xrstors; - break; - case IX86_BUILTIN_XSAVEC: - icode = CODE_FOR_xsavec; - break; - default: - gcc_unreachable (); - } - pat = GEN_FCN (icode) (op0, op1); - } - - if (pat) - emit_insn (pat); - return 0; - - case IX86_BUILTIN_LLWPCB: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - icode = CODE_FOR_lwp_llwpcb; - if (!insn_data[icode].operand[0].predicate (op0, Pmode)) - op0 = ix86_zero_extend_to_Pmode (op0); - emit_insn (gen_lwp_llwpcb (op0)); - return 0; - - case IX86_BUILTIN_SLWPCB: - icode = CODE_FOR_lwp_slwpcb; - if (!target - || !insn_data[icode].operand[0].predicate (target, Pmode)) - target = gen_reg_rtx (Pmode); - emit_insn (gen_lwp_slwpcb (target)); - return target; - - case IX86_BUILTIN_BEXTRI32: - case IX86_BUILTIN_BEXTRI64: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - icode = (fcode == IX86_BUILTIN_BEXTRI32 - ? 
CODE_FOR_tbm_bextri_si - : CODE_FOR_tbm_bextri_di); - if (!CONST_INT_P (op1)) - { - error ("last argument must be an immediate"); - return const0_rtx; - } - else - { - unsigned char length = (INTVAL (op1) >> 8) & 0xFF; - unsigned char lsb_index = INTVAL (op1) & 0xFF; - op1 = GEN_INT (length); - op2 = GEN_INT (lsb_index); - - mode1 = insn_data[icode].operand[1].mode; - if (!insn_data[icode].operand[1].predicate (op0, mode1)) - op0 = copy_to_mode_reg (mode1, op0); - - mode0 = insn_data[icode].operand[0].mode; - if (target == 0 - || !register_operand (target, mode0)) - target = gen_reg_rtx (mode0); - - pat = GEN_FCN (icode) (target, op0, op1, op2); - if (pat) - emit_insn (pat); - return target; - } - - case IX86_BUILTIN_RDRAND16_STEP: - icode = CODE_FOR_rdrandhi_1; - mode0 = HImode; - goto rdrand_step; - - case IX86_BUILTIN_RDRAND32_STEP: - icode = CODE_FOR_rdrandsi_1; - mode0 = SImode; - goto rdrand_step; - - case IX86_BUILTIN_RDRAND64_STEP: - icode = CODE_FOR_rdranddi_1; - mode0 = DImode; - -rdrand_step: - arg0 = CALL_EXPR_ARG (exp, 0); - op1 = expand_normal (arg0); - if (!address_operand (op1, VOIDmode)) - { - op1 = convert_memory_address (Pmode, op1); - op1 = copy_addr_to_reg (op1); - } - - op0 = gen_reg_rtx (mode0); - emit_insn (GEN_FCN (icode) (op0)); - - emit_move_insn (gen_rtx_MEM (mode0, op1), op0); - - op1 = gen_reg_rtx (SImode); - emit_move_insn (op1, CONST1_RTX (SImode)); - - /* Emit SImode conditional move. */ - if (mode0 == HImode) - { - if (TARGET_ZERO_EXTEND_WITH_AND - && optimize_function_for_speed_p (cfun)) - { - op2 = force_reg (SImode, const0_rtx); - - emit_insn (gen_movstricthi - (gen_lowpart (HImode, op2), op0)); - } - else - { - op2 = gen_reg_rtx (SImode); - - emit_insn (gen_zero_extendhisi2 (op2, op0)); - } - } - else if (mode0 == SImode) - op2 = op0; - else - op2 = gen_rtx_SUBREG (SImode, op0, 0); - - if (target == 0 - || !register_operand (target, SImode)) - target = gen_reg_rtx (SImode); - - pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG), - const0_rtx); - emit_insn (gen_rtx_SET (target, - gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1))); - return target; - - case IX86_BUILTIN_RDSEED16_STEP: - icode = CODE_FOR_rdseedhi_1; - mode0 = HImode; - goto rdseed_step; - - case IX86_BUILTIN_RDSEED32_STEP: - icode = CODE_FOR_rdseedsi_1; - mode0 = SImode; - goto rdseed_step; - - case IX86_BUILTIN_RDSEED64_STEP: - icode = CODE_FOR_rdseeddi_1; - mode0 = DImode; - -rdseed_step: - arg0 = CALL_EXPR_ARG (exp, 0); - op1 = expand_normal (arg0); - if (!address_operand (op1, VOIDmode)) - { - op1 = convert_memory_address (Pmode, op1); - op1 = copy_addr_to_reg (op1); - } - - op0 = gen_reg_rtx (mode0); - emit_insn (GEN_FCN (icode) (op0)); - - emit_move_insn (gen_rtx_MEM (mode0, op1), op0); - - op2 = gen_reg_rtx (QImode); - - pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG), - const0_rtx); - emit_insn (gen_rtx_SET (op2, pat)); - - if (target == 0 - || !register_operand (target, SImode)) - target = gen_reg_rtx (SImode); - - emit_insn (gen_zero_extendqisi2 (target, op2)); - return target; - - case IX86_BUILTIN_SBB32: - icode = CODE_FOR_subborrowsi; - icode2 = CODE_FOR_subborrowsi_0; - mode0 = SImode; - mode1 = DImode; - mode2 = CCmode; - goto handlecarry; - - case IX86_BUILTIN_SBB64: - icode = CODE_FOR_subborrowdi; - icode2 = CODE_FOR_subborrowdi_0; - mode0 = DImode; - mode1 = TImode; - mode2 = CCmode; - goto handlecarry; - - case IX86_BUILTIN_ADDCARRYX32: - icode = CODE_FOR_addcarrysi; - icode2 = CODE_FOR_addcarrysi_0; - mode0 = SImode; - mode1 = DImode; - mode2 = 
CCCmode; - goto handlecarry; - - case IX86_BUILTIN_ADDCARRYX64: - icode = CODE_FOR_addcarrydi; - icode2 = CODE_FOR_addcarrydi_0; - mode0 = DImode; - mode1 = TImode; - mode2 = CCCmode; - - handlecarry: - arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */ - arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */ - arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */ - arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */ - - op1 = expand_normal (arg0); - if (!integer_zerop (arg0)) - op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1)); - - op2 = expand_normal (arg1); - if (!register_operand (op2, mode0)) - op2 = copy_to_mode_reg (mode0, op2); - - op3 = expand_normal (arg2); - if (!register_operand (op3, mode0)) - op3 = copy_to_mode_reg (mode0, op3); - - op4 = expand_normal (arg3); - if (!address_operand (op4, VOIDmode)) - { - op4 = convert_memory_address (Pmode, op4); - op4 = copy_addr_to_reg (op4); - } - - op0 = gen_reg_rtx (mode0); - if (integer_zerop (arg0)) - { - /* If arg0 is 0, optimize right away into add or sub - instruction that sets CCCmode flags. */ - op1 = gen_rtx_REG (mode2, FLAGS_REG); - emit_insn (GEN_FCN (icode2) (op0, op2, op3)); - } - else - { - /* Generate CF from input operand. */ - emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx)); - - /* Generate instruction that consumes CF. */ - op1 = gen_rtx_REG (CCCmode, FLAGS_REG); - pat = gen_rtx_LTU (mode1, op1, const0_rtx); - pat2 = gen_rtx_LTU (mode0, op1, const0_rtx); - emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2)); - } - - /* Return current CF value. */ - if (target == 0) - target = gen_reg_rtx (QImode); - - pat = gen_rtx_LTU (QImode, op1, const0_rtx); - emit_insn (gen_rtx_SET (target, pat)); - - /* Store the result. */ - emit_move_insn (gen_rtx_MEM (mode0, op4), op0); - - return target; - - case IX86_BUILTIN_READ_FLAGS: - emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG))); - - if (optimize - || target == NULL_RTX - || !nonimmediate_operand (target, word_mode) - || GET_MODE (target) != word_mode) - target = gen_reg_rtx (word_mode); - - emit_insn (gen_pop (target)); - return target; - - case IX86_BUILTIN_WRITE_FLAGS: - - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - if (!general_no_elim_operand (op0, word_mode)) - op0 = copy_to_mode_reg (word_mode, op0); - - emit_insn (gen_push (op0)); - emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG))); - return 0; - - case IX86_BUILTIN_KTESTC8: - icode = CODE_FOR_ktestqi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KTESTZ8: - icode = CODE_FOR_ktestqi; - mode3 = CCZmode; - goto kortest; - - case IX86_BUILTIN_KTESTC16: - icode = CODE_FOR_ktesthi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KTESTZ16: - icode = CODE_FOR_ktesthi; - mode3 = CCZmode; - goto kortest; - - case IX86_BUILTIN_KTESTC32: - icode = CODE_FOR_ktestsi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KTESTZ32: - icode = CODE_FOR_ktestsi; - mode3 = CCZmode; - goto kortest; - - case IX86_BUILTIN_KTESTC64: - icode = CODE_FOR_ktestdi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KTESTZ64: - icode = CODE_FOR_ktestdi; - mode3 = CCZmode; - goto kortest; - - case IX86_BUILTIN_KORTESTC8: - icode = CODE_FOR_kortestqi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KORTESTZ8: - icode = CODE_FOR_kortestqi; - mode3 = CCZmode; - goto kortest; - - case IX86_BUILTIN_KORTESTC16: - icode = CODE_FOR_kortesthi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KORTESTZ16: - icode = CODE_FOR_kortesthi; - 
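For context, the handlecarry expansion above implements the add-with-carry/subtract-with-borrow builtins that sit behind the <immintrin.h> carry intrinsics. A minimal sketch of the usage pattern it supports (the multi-word addition is my own illustration, not code from this patch; _addcarry_u32 is assumed to map onto IX86_BUILTIN_ADDCARRYX32):

    #include <immintrin.h>

    /* Chain three 32-bit additions through the carry flag.  With a constant
       0 carry-in (after inlining) the expander can take the integer_zerop
       (arg0) fast path above and emit the plain flag-setting add (the
       icode2/_0 pattern); the later calls exercise the pattern that both
       consumes and produces CF.  The return value is the final carry, read
       back from CF with the LTU construct above.  */
    unsigned char
    add96 (const unsigned int a[3], const unsigned int b[3], unsigned int out[3])
    {
      unsigned char c;
      c = _addcarry_u32 (0, a[0], b[0], &out[0]);
      c = _addcarry_u32 (c, a[1], b[1], &out[1]);
      c = _addcarry_u32 (c, a[2], b[2], &out[2]);
      return c;
    }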
mode3 = CCZmode; - goto kortest; - - case IX86_BUILTIN_KORTESTC32: - icode = CODE_FOR_kortestsi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KORTESTZ32: - icode = CODE_FOR_kortestsi; - mode3 = CCZmode; - goto kortest; - - case IX86_BUILTIN_KORTESTC64: - icode = CODE_FOR_kortestdi; - mode3 = CCCmode; - goto kortest; - - case IX86_BUILTIN_KORTESTZ64: - icode = CODE_FOR_kortestdi; - mode3 = CCZmode; - - kortest: - arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */ - arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */ - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - - mode0 = insn_data[icode].operand[0].mode; - mode1 = insn_data[icode].operand[1].mode; - - if (GET_MODE (op0) != VOIDmode) - op0 = force_reg (GET_MODE (op0), op0); - - op0 = gen_lowpart (mode0, op0); - - if (!insn_data[icode].operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - - if (GET_MODE (op1) != VOIDmode) - op1 = force_reg (GET_MODE (op1), op1); - - op1 = gen_lowpart (mode1, op1); - - if (!insn_data[icode].operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - target = gen_reg_rtx (QImode); - - /* Emit kortest. */ - emit_insn (GEN_FCN (icode) (op0, op1)); - /* And use setcc to return result from flags. */ - ix86_expand_setcc (target, EQ, - gen_rtx_REG (mode3, FLAGS_REG), const0_rtx); - return target; - - case IX86_BUILTIN_GATHERSIV2DF: - icode = CODE_FOR_avx2_gathersiv2df; - goto gather_gen; - case IX86_BUILTIN_GATHERSIV4DF: - icode = CODE_FOR_avx2_gathersiv4df; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV2DF: - icode = CODE_FOR_avx2_gatherdiv2df; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV4DF: - icode = CODE_FOR_avx2_gatherdiv4df; - goto gather_gen; - case IX86_BUILTIN_GATHERSIV4SF: - icode = CODE_FOR_avx2_gathersiv4sf; - goto gather_gen; - case IX86_BUILTIN_GATHERSIV8SF: - icode = CODE_FOR_avx2_gathersiv8sf; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV4SF: - icode = CODE_FOR_avx2_gatherdiv4sf; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV8SF: - icode = CODE_FOR_avx2_gatherdiv8sf; - goto gather_gen; - case IX86_BUILTIN_GATHERSIV2DI: - icode = CODE_FOR_avx2_gathersiv2di; - goto gather_gen; - case IX86_BUILTIN_GATHERSIV4DI: - icode = CODE_FOR_avx2_gathersiv4di; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV2DI: - icode = CODE_FOR_avx2_gatherdiv2di; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV4DI: - icode = CODE_FOR_avx2_gatherdiv4di; - goto gather_gen; - case IX86_BUILTIN_GATHERSIV4SI: - icode = CODE_FOR_avx2_gathersiv4si; - goto gather_gen; - case IX86_BUILTIN_GATHERSIV8SI: - icode = CODE_FOR_avx2_gathersiv8si; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV4SI: - icode = CODE_FOR_avx2_gatherdiv4si; - goto gather_gen; - case IX86_BUILTIN_GATHERDIV8SI: - icode = CODE_FOR_avx2_gatherdiv8si; - goto gather_gen; - case IX86_BUILTIN_GATHERALTSIV4DF: - icode = CODE_FOR_avx2_gathersiv4df; - goto gather_gen; - case IX86_BUILTIN_GATHERALTDIV8SF: - icode = CODE_FOR_avx2_gatherdiv8sf; - goto gather_gen; - case IX86_BUILTIN_GATHERALTSIV4DI: - icode = CODE_FOR_avx2_gathersiv4di; - goto gather_gen; - case IX86_BUILTIN_GATHERALTDIV8SI: - icode = CODE_FOR_avx2_gatherdiv8si; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV16SF: - icode = CODE_FOR_avx512f_gathersiv16sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV8DF: - icode = CODE_FOR_avx512f_gathersiv8df; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV16SF: - icode = CODE_FOR_avx512f_gatherdiv16sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV8DF: - icode = 
CODE_FOR_avx512f_gatherdiv8df; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV16SI: - icode = CODE_FOR_avx512f_gathersiv16si; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV8DI: - icode = CODE_FOR_avx512f_gathersiv8di; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV16SI: - icode = CODE_FOR_avx512f_gatherdiv16si; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV8DI: - icode = CODE_FOR_avx512f_gatherdiv8di; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTSIV8DF: - icode = CODE_FOR_avx512f_gathersiv8df; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTDIV16SF: - icode = CODE_FOR_avx512f_gatherdiv16sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTSIV8DI: - icode = CODE_FOR_avx512f_gathersiv8di; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTDIV16SI: - icode = CODE_FOR_avx512f_gatherdiv16si; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV2DF: - icode = CODE_FOR_avx512vl_gathersiv2df; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV4DF: - icode = CODE_FOR_avx512vl_gathersiv4df; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV2DF: - icode = CODE_FOR_avx512vl_gatherdiv2df; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV4DF: - icode = CODE_FOR_avx512vl_gatherdiv4df; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV4SF: - icode = CODE_FOR_avx512vl_gathersiv4sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV8SF: - icode = CODE_FOR_avx512vl_gathersiv8sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV4SF: - icode = CODE_FOR_avx512vl_gatherdiv4sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV8SF: - icode = CODE_FOR_avx512vl_gatherdiv8sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV2DI: - icode = CODE_FOR_avx512vl_gathersiv2di; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV4DI: - icode = CODE_FOR_avx512vl_gathersiv4di; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV2DI: - icode = CODE_FOR_avx512vl_gatherdiv2di; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV4DI: - icode = CODE_FOR_avx512vl_gatherdiv4di; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV4SI: - icode = CODE_FOR_avx512vl_gathersiv4si; - goto gather_gen; - case IX86_BUILTIN_GATHER3SIV8SI: - icode = CODE_FOR_avx512vl_gathersiv8si; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV4SI: - icode = CODE_FOR_avx512vl_gatherdiv4si; - goto gather_gen; - case IX86_BUILTIN_GATHER3DIV8SI: - icode = CODE_FOR_avx512vl_gatherdiv8si; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTSIV4DF: - icode = CODE_FOR_avx512vl_gathersiv4df; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTDIV8SF: - icode = CODE_FOR_avx512vl_gatherdiv8sf; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTSIV4DI: - icode = CODE_FOR_avx512vl_gathersiv4di; - goto gather_gen; - case IX86_BUILTIN_GATHER3ALTDIV8SI: - icode = CODE_FOR_avx512vl_gatherdiv8si; - goto gather_gen; - case IX86_BUILTIN_SCATTERSIV16SF: - icode = CODE_FOR_avx512f_scattersiv16sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV8DF: - icode = CODE_FOR_avx512f_scattersiv8df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV16SF: - icode = CODE_FOR_avx512f_scatterdiv16sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV8DF: - icode = CODE_FOR_avx512f_scatterdiv8df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV16SI: - icode = CODE_FOR_avx512f_scattersiv16si; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV8DI: - icode = CODE_FOR_avx512f_scattersiv8di; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV16SI: - icode = CODE_FOR_avx512f_scatterdiv16si; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV8DI: - icode = CODE_FOR_avx512f_scatterdiv8di; - goto scatter_gen; - case 
IX86_BUILTIN_SCATTERSIV8SF: - icode = CODE_FOR_avx512vl_scattersiv8sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV4SF: - icode = CODE_FOR_avx512vl_scattersiv4sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV4DF: - icode = CODE_FOR_avx512vl_scattersiv4df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV2DF: - icode = CODE_FOR_avx512vl_scattersiv2df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV8SF: - icode = CODE_FOR_avx512vl_scatterdiv8sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV4SF: - icode = CODE_FOR_avx512vl_scatterdiv4sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV4DF: - icode = CODE_FOR_avx512vl_scatterdiv4df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV2DF: - icode = CODE_FOR_avx512vl_scatterdiv2df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV8SI: - icode = CODE_FOR_avx512vl_scattersiv8si; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV4SI: - icode = CODE_FOR_avx512vl_scattersiv4si; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV4DI: - icode = CODE_FOR_avx512vl_scattersiv4di; - goto scatter_gen; - case IX86_BUILTIN_SCATTERSIV2DI: - icode = CODE_FOR_avx512vl_scattersiv2di; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV8SI: - icode = CODE_FOR_avx512vl_scatterdiv8si; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV4SI: - icode = CODE_FOR_avx512vl_scatterdiv4si; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV4DI: - icode = CODE_FOR_avx512vl_scatterdiv4di; - goto scatter_gen; - case IX86_BUILTIN_SCATTERDIV2DI: - icode = CODE_FOR_avx512vl_scatterdiv2di; - goto scatter_gen; - case IX86_BUILTIN_GATHERPFDPD: - icode = CODE_FOR_avx512pf_gatherpfv8sidf; - goto vec_prefetch_gen; - case IX86_BUILTIN_SCATTERALTSIV8DF: - icode = CODE_FOR_avx512f_scattersiv8df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTDIV16SF: - icode = CODE_FOR_avx512f_scatterdiv16sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTSIV8DI: - icode = CODE_FOR_avx512f_scattersiv8di; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTDIV16SI: - icode = CODE_FOR_avx512f_scatterdiv16si; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTSIV4DF: - icode = CODE_FOR_avx512vl_scattersiv4df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTDIV8SF: - icode = CODE_FOR_avx512vl_scatterdiv8sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTSIV4DI: - icode = CODE_FOR_avx512vl_scattersiv4di; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTDIV8SI: - icode = CODE_FOR_avx512vl_scatterdiv8si; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTSIV2DF: - icode = CODE_FOR_avx512vl_scattersiv2df; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTDIV4SF: - icode = CODE_FOR_avx512vl_scatterdiv4sf; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTSIV2DI: - icode = CODE_FOR_avx512vl_scattersiv2di; - goto scatter_gen; - case IX86_BUILTIN_SCATTERALTDIV4SI: - icode = CODE_FOR_avx512vl_scatterdiv4si; - goto scatter_gen; - case IX86_BUILTIN_GATHERPFDPS: - icode = CODE_FOR_avx512pf_gatherpfv16sisf; - goto vec_prefetch_gen; - case IX86_BUILTIN_GATHERPFQPD: - icode = CODE_FOR_avx512pf_gatherpfv8didf; - goto vec_prefetch_gen; - case IX86_BUILTIN_GATHERPFQPS: - icode = CODE_FOR_avx512pf_gatherpfv8disf; - goto vec_prefetch_gen; - case IX86_BUILTIN_SCATTERPFDPD: - icode = CODE_FOR_avx512pf_scatterpfv8sidf; - goto vec_prefetch_gen; - case IX86_BUILTIN_SCATTERPFDPS: - icode = CODE_FOR_avx512pf_scatterpfv16sisf; - goto vec_prefetch_gen; - case IX86_BUILTIN_SCATTERPFQPD: - icode = CODE_FOR_avx512pf_scatterpfv8didf; - goto vec_prefetch_gen; - case IX86_BUILTIN_SCATTERPFQPS: - icode = 
CODE_FOR_avx512pf_scatterpfv8disf; - goto vec_prefetch_gen; - - gather_gen: - rtx half; - rtx (*gen) (rtx, rtx); - - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - arg2 = CALL_EXPR_ARG (exp, 2); - arg3 = CALL_EXPR_ARG (exp, 3); - arg4 = CALL_EXPR_ARG (exp, 4); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - op2 = expand_normal (arg2); - op3 = expand_normal (arg3); - op4 = expand_normal (arg4); - /* Note the arg order is different from the operand order. */ - mode0 = insn_data[icode].operand[1].mode; - mode2 = insn_data[icode].operand[3].mode; - mode3 = insn_data[icode].operand[4].mode; - mode4 = insn_data[icode].operand[5].mode; - - if (target == NULL_RTX - || GET_MODE (target) != insn_data[icode].operand[0].mode - || !insn_data[icode].operand[0].predicate (target, - GET_MODE (target))) - subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode); - else - subtarget = target; - - switch (fcode) - { - case IX86_BUILTIN_GATHER3ALTSIV8DF: - case IX86_BUILTIN_GATHER3ALTSIV8DI: - half = gen_reg_rtx (V8SImode); - if (!nonimmediate_operand (op2, V16SImode)) - op2 = copy_to_mode_reg (V16SImode, op2); - emit_insn (gen_vec_extract_lo_v16si (half, op2)); - op2 = half; - break; - case IX86_BUILTIN_GATHER3ALTSIV4DF: - case IX86_BUILTIN_GATHER3ALTSIV4DI: - case IX86_BUILTIN_GATHERALTSIV4DF: - case IX86_BUILTIN_GATHERALTSIV4DI: - half = gen_reg_rtx (V4SImode); - if (!nonimmediate_operand (op2, V8SImode)) - op2 = copy_to_mode_reg (V8SImode, op2); - emit_insn (gen_vec_extract_lo_v8si (half, op2)); - op2 = half; - break; - case IX86_BUILTIN_GATHER3ALTDIV16SF: - case IX86_BUILTIN_GATHER3ALTDIV16SI: - half = gen_reg_rtx (mode0); - if (mode0 == V8SFmode) - gen = gen_vec_extract_lo_v16sf; - else - gen = gen_vec_extract_lo_v16si; - if (!nonimmediate_operand (op0, GET_MODE (op0))) - op0 = copy_to_mode_reg (GET_MODE (op0), op0); - emit_insn (gen (half, op0)); - op0 = half; - op3 = lowpart_subreg (QImode, op3, HImode); - break; - case IX86_BUILTIN_GATHER3ALTDIV8SF: - case IX86_BUILTIN_GATHER3ALTDIV8SI: - case IX86_BUILTIN_GATHERALTDIV8SF: - case IX86_BUILTIN_GATHERALTDIV8SI: - half = gen_reg_rtx (mode0); - if (mode0 == V4SFmode) - gen = gen_vec_extract_lo_v8sf; - else - gen = gen_vec_extract_lo_v8si; - if (!nonimmediate_operand (op0, GET_MODE (op0))) - op0 = copy_to_mode_reg (GET_MODE (op0), op0); - emit_insn (gen (half, op0)); - op0 = half; - if (VECTOR_MODE_P (GET_MODE (op3))) - { - half = gen_reg_rtx (mode0); - if (!nonimmediate_operand (op3, GET_MODE (op3))) - op3 = copy_to_mode_reg (GET_MODE (op3), op3); - emit_insn (gen (half, op3)); - op3 = half; - } - break; - default: - break; - } - - /* Force memory operand only with base register here. But we - don't want to do it on memory operand for other builtin - functions. 
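As an aside on the gather expansion above: these icodes back the AVX2/AVX-512 gather intrinsics, and the GATHER*ALT* variants handle the case where the index vector has twice as many elements as the data vector, which is why only the low half of the index (vec_extract_lo_*) is fed to the instruction. A minimal user-level sketch, assuming the usual <immintrin.h> wrapper around the V4DF/V4SI gather (illustrative only, not part of the patch):

    #include <immintrin.h>

    /* Gather base[idx[0..3]] into a 256-bit vector; compile with -mavx2.
       The scale argument must be a literal 1, 2, 4 or 8 -- anything else is
       rejected below with the "last argument must be scale 1, 2, 4, 8"
       error.  */
    __m256d
    gather4 (const double *base, __m128i idx)
    {
      return _mm256_i32gather_pd (base, idx, 8);
    }

The unmasked form of the intrinsic is, as far as I understand it, built by passing an all-ones mask produced by a self-compare, which is exactly the pattern the INTEGER_CST/VECTOR_CST/SSA_NAME mask recognition further down tries to spot so it can drop the merge input (op0 = pc_rtx).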
*/ - op1 = ix86_zero_extend_to_Pmode (op1); - - if (!insn_data[icode].operand[1].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if (!insn_data[icode].operand[2].predicate (op1, Pmode)) - op1 = copy_to_mode_reg (Pmode, op1); - if (!insn_data[icode].operand[3].predicate (op2, mode2)) - op2 = copy_to_mode_reg (mode2, op2); - - op3 = fixup_modeless_constant (op3, mode3); - - if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode) - { - if (!insn_data[icode].operand[4].predicate (op3, mode3)) - op3 = copy_to_mode_reg (mode3, op3); - } - else - { - op3 = copy_to_reg (op3); - op3 = lowpart_subreg (mode3, op3, GET_MODE (op3)); - } - if (!insn_data[icode].operand[5].predicate (op4, mode4)) - { - error ("the last argument must be scale 1, 2, 4, 8"); - return const0_rtx; - } - - /* Optimize. If mask is known to have all high bits set, - replace op0 with pc_rtx to signal that the instruction - overwrites the whole destination and doesn't use its - previous contents. */ - if (optimize) - { - if (TREE_CODE (arg3) == INTEGER_CST) - { - if (integer_all_onesp (arg3)) - op0 = pc_rtx; - } - else if (TREE_CODE (arg3) == VECTOR_CST) - { - unsigned int negative = 0; - for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i) - { - tree cst = VECTOR_CST_ELT (arg3, i); - if (TREE_CODE (cst) == INTEGER_CST - && tree_int_cst_sign_bit (cst)) - negative++; - else if (TREE_CODE (cst) == REAL_CST - && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst))) - negative++; - } - if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3))) - op0 = pc_rtx; - } - else if (TREE_CODE (arg3) == SSA_NAME - && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE) - { - /* Recognize also when mask is like: - __v2df src = _mm_setzero_pd (); - __v2df mask = _mm_cmpeq_pd (src, src); - or - __v8sf src = _mm256_setzero_ps (); - __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ); - as that is a cheaper way to load all ones into - a register than having to load a constant from - memory. */ - gimple *def_stmt = SSA_NAME_DEF_STMT (arg3); - if (is_gimple_call (def_stmt)) - { - tree fndecl = gimple_call_fndecl (def_stmt); - if (fndecl - && fndecl_built_in_p (fndecl, BUILT_IN_MD)) - switch ((unsigned int) DECL_FUNCTION_CODE (fndecl)) - { - case IX86_BUILTIN_CMPPD: - case IX86_BUILTIN_CMPPS: - case IX86_BUILTIN_CMPPD256: - case IX86_BUILTIN_CMPPS256: - if (!integer_zerop (gimple_call_arg (def_stmt, 2))) - break; - /* FALLTHRU */ - case IX86_BUILTIN_CMPEQPD: - case IX86_BUILTIN_CMPEQPS: - if (initializer_zerop (gimple_call_arg (def_stmt, 0)) - && initializer_zerop (gimple_call_arg (def_stmt, - 1))) - op0 = pc_rtx; - break; - default: - break; - } - } - } - } - - pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4); - if (! 
pat) - return const0_rtx; - emit_insn (pat); - - switch (fcode) - { - case IX86_BUILTIN_GATHER3DIV16SF: - if (target == NULL_RTX) - target = gen_reg_rtx (V8SFmode); - emit_insn (gen_vec_extract_lo_v16sf (target, subtarget)); - break; - case IX86_BUILTIN_GATHER3DIV16SI: - if (target == NULL_RTX) - target = gen_reg_rtx (V8SImode); - emit_insn (gen_vec_extract_lo_v16si (target, subtarget)); - break; - case IX86_BUILTIN_GATHER3DIV8SF: - case IX86_BUILTIN_GATHERDIV8SF: - if (target == NULL_RTX) - target = gen_reg_rtx (V4SFmode); - emit_insn (gen_vec_extract_lo_v8sf (target, subtarget)); - break; - case IX86_BUILTIN_GATHER3DIV8SI: - case IX86_BUILTIN_GATHERDIV8SI: - if (target == NULL_RTX) - target = gen_reg_rtx (V4SImode); - emit_insn (gen_vec_extract_lo_v8si (target, subtarget)); - break; - default: - target = subtarget; - break; - } - return target; - - scatter_gen: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - arg2 = CALL_EXPR_ARG (exp, 2); - arg3 = CALL_EXPR_ARG (exp, 3); - arg4 = CALL_EXPR_ARG (exp, 4); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - op2 = expand_normal (arg2); - op3 = expand_normal (arg3); - op4 = expand_normal (arg4); - mode1 = insn_data[icode].operand[1].mode; - mode2 = insn_data[icode].operand[2].mode; - mode3 = insn_data[icode].operand[3].mode; - mode4 = insn_data[icode].operand[4].mode; - - /* Scatter instruction stores operand op3 to memory with - indices from op2 and scale from op4 under writemask op1. - If index operand op2 has more elements then source operand - op3 one need to use only its low half. And vice versa. */ - switch (fcode) - { - case IX86_BUILTIN_SCATTERALTSIV8DF: - case IX86_BUILTIN_SCATTERALTSIV8DI: - half = gen_reg_rtx (V8SImode); - if (!nonimmediate_operand (op2, V16SImode)) - op2 = copy_to_mode_reg (V16SImode, op2); - emit_insn (gen_vec_extract_lo_v16si (half, op2)); - op2 = half; - break; - case IX86_BUILTIN_SCATTERALTDIV16SF: - case IX86_BUILTIN_SCATTERALTDIV16SI: - half = gen_reg_rtx (mode3); - if (mode3 == V8SFmode) - gen = gen_vec_extract_lo_v16sf; - else - gen = gen_vec_extract_lo_v16si; - if (!nonimmediate_operand (op3, GET_MODE (op3))) - op3 = copy_to_mode_reg (GET_MODE (op3), op3); - emit_insn (gen (half, op3)); - op3 = half; - break; - case IX86_BUILTIN_SCATTERALTSIV4DF: - case IX86_BUILTIN_SCATTERALTSIV4DI: - half = gen_reg_rtx (V4SImode); - if (!nonimmediate_operand (op2, V8SImode)) - op2 = copy_to_mode_reg (V8SImode, op2); - emit_insn (gen_vec_extract_lo_v8si (half, op2)); - op2 = half; - break; - case IX86_BUILTIN_SCATTERALTDIV8SF: - case IX86_BUILTIN_SCATTERALTDIV8SI: - half = gen_reg_rtx (mode3); - if (mode3 == V4SFmode) - gen = gen_vec_extract_lo_v8sf; - else - gen = gen_vec_extract_lo_v8si; - if (!nonimmediate_operand (op3, GET_MODE (op3))) - op3 = copy_to_mode_reg (GET_MODE (op3), op3); - emit_insn (gen (half, op3)); - op3 = half; - break; - case IX86_BUILTIN_SCATTERALTSIV2DF: - case IX86_BUILTIN_SCATTERALTSIV2DI: - if (!nonimmediate_operand (op2, V4SImode)) - op2 = copy_to_mode_reg (V4SImode, op2); - break; - case IX86_BUILTIN_SCATTERALTDIV4SF: - case IX86_BUILTIN_SCATTERALTDIV4SI: - if (!nonimmediate_operand (op3, GET_MODE (op3))) - op3 = copy_to_mode_reg (GET_MODE (op3), op3); - break; - default: - break; - } - - /* Force memory operand only with base register here. But we - don't want to do it on memory operand for other builtin - functions. 
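Similarly for the scatter side just above: the element-count mismatch handled by the SCATTERALT* cases (for example V8DF data with a V16SI index) is resolved by using only the low half of the wider operand, as the comment says. A minimal user-level sketch, assuming the usual <immintrin.h> wrapper around the V8DF/V8SI scatter (illustrative only):

    #include <immintrin.h>

    /* Store val[i] to base[idx[i]] for i = 0..7 under an implicit all-ones
       writemask; compile with -mavx512f.  Scale must again be a literal
       1, 2, 4 or 8.  */
    void
    scatter8 (double *base, __m256i idx, __m512d val)
    {
      _mm512_i32scatter_pd (base, idx, val, 8);
    }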
*/ - op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1)); - - if (!insn_data[icode].operand[0].predicate (op0, Pmode)) - op0 = copy_to_mode_reg (Pmode, op0); - - op1 = fixup_modeless_constant (op1, mode1); - - if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode) - { - if (!insn_data[icode].operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - } - else - { - op1 = copy_to_reg (op1); - op1 = lowpart_subreg (mode1, op1, GET_MODE (op1)); - } - - if (!insn_data[icode].operand[2].predicate (op2, mode2)) - op2 = copy_to_mode_reg (mode2, op2); - - if (!insn_data[icode].operand[3].predicate (op3, mode3)) - op3 = copy_to_mode_reg (mode3, op3); - - if (!insn_data[icode].operand[4].predicate (op4, mode4)) - { - error ("the last argument must be scale 1, 2, 4, 8"); - return const0_rtx; - } - - pat = GEN_FCN (icode) (op0, op1, op2, op3, op4); - if (! pat) - return const0_rtx; - - emit_insn (pat); - return 0; - - vec_prefetch_gen: - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - arg2 = CALL_EXPR_ARG (exp, 2); - arg3 = CALL_EXPR_ARG (exp, 3); - arg4 = CALL_EXPR_ARG (exp, 4); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - op2 = expand_normal (arg2); - op3 = expand_normal (arg3); - op4 = expand_normal (arg4); - mode0 = insn_data[icode].operand[0].mode; - mode1 = insn_data[icode].operand[1].mode; - mode3 = insn_data[icode].operand[3].mode; - mode4 = insn_data[icode].operand[4].mode; - - op0 = fixup_modeless_constant (op0, mode0); - - if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode) - { - if (!insn_data[icode].operand[0].predicate (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - } - else - { - op0 = copy_to_reg (op0); - op0 = lowpart_subreg (mode0, op0, GET_MODE (op0)); - } - - if (!insn_data[icode].operand[1].predicate (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - - /* Force memory operand only with base register here. But we - don't want to do it on memory operand for other builtin - functions. */ - op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1)); - - if (!insn_data[icode].operand[2].predicate (op2, Pmode)) - op2 = copy_to_mode_reg (Pmode, op2); - - if (!insn_data[icode].operand[3].predicate (op3, mode3)) - { - error ("the forth argument must be scale 1, 2, 4, 8"); - return const0_rtx; - } - - if (!insn_data[icode].operand[4].predicate (op4, mode4)) - { - error ("incorrect hint operand"); - return const0_rtx; - } - - pat = GEN_FCN (icode) (op0, op1, op2, op3, op4); - if (! pat) - return const0_rtx; - - emit_insn (pat); - - return 0; - - case IX86_BUILTIN_XABORT: - icode = CODE_FOR_xabort; - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - mode0 = insn_data[icode].operand[0].mode; - if (!insn_data[icode].operand[0].predicate (op0, mode0)) - { - error ("the argument to % intrinsic must " - "be an 8-bit immediate"); - return const0_rtx; - } - emit_insn (gen_xabort (op0)); - return 0; - - case IX86_BUILTIN_RSTORSSP: - case IX86_BUILTIN_CLRSSBSY: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - icode = (fcode == IX86_BUILTIN_RSTORSSP - ? 
CODE_FOR_rstorssp - : CODE_FOR_clrssbsy); - if (!address_operand (op0, VOIDmode)) - { - op1 = convert_memory_address (Pmode, op0); - op0 = copy_addr_to_reg (op1); - } - emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0))); - return 0; - - case IX86_BUILTIN_WRSSD: - case IX86_BUILTIN_WRSSQ: - case IX86_BUILTIN_WRUSSD: - case IX86_BUILTIN_WRUSSQ: - arg0 = CALL_EXPR_ARG (exp, 0); - op0 = expand_normal (arg0); - arg1 = CALL_EXPR_ARG (exp, 1); - op1 = expand_normal (arg1); - switch (fcode) - { - case IX86_BUILTIN_WRSSD: - icode = CODE_FOR_wrsssi; - mode = SImode; - break; - case IX86_BUILTIN_WRSSQ: - icode = CODE_FOR_wrssdi; - mode = DImode; - break; - case IX86_BUILTIN_WRUSSD: - icode = CODE_FOR_wrusssi; - mode = SImode; - break; - case IX86_BUILTIN_WRUSSQ: - icode = CODE_FOR_wrussdi; - mode = DImode; - break; - } - op0 = force_reg (mode, op0); - if (!address_operand (op1, VOIDmode)) - { - op2 = convert_memory_address (Pmode, op1); - op1 = copy_addr_to_reg (op2); - } - emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1))); - return 0; - - default: - break; - } - - if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST - && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST; - return ix86_expand_special_args_builtin (bdesc_special_args + i, exp, - target); - } - - if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST - && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST; - rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL; - rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx); - rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx); - int masked = 1; - machine_mode mode, wide_mode, nar_mode; - - nar_mode = V4SFmode; - mode = V16SFmode; - wide_mode = V64SFmode; - fcn_mask = gen_avx5124fmaddps_4fmaddps_mask; - fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz; - - switch (fcode) - { - case IX86_BUILTIN_4FMAPS: - fcn = gen_avx5124fmaddps_4fmaddps; - masked = 0; - goto v4fma_expand; - - case IX86_BUILTIN_4DPWSSD: - nar_mode = V4SImode; - mode = V16SImode; - wide_mode = V64SImode; - fcn = gen_avx5124vnniw_vp4dpwssd; - masked = 0; - goto v4fma_expand; - - case IX86_BUILTIN_4DPWSSDS: - nar_mode = V4SImode; - mode = V16SImode; - wide_mode = V64SImode; - fcn = gen_avx5124vnniw_vp4dpwssds; - masked = 0; - goto v4fma_expand; - - case IX86_BUILTIN_4FNMAPS: - fcn = gen_avx5124fmaddps_4fnmaddps; - masked = 0; - goto v4fma_expand; - - case IX86_BUILTIN_4FNMAPS_MASK: - fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask; - fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz; - goto v4fma_expand; - - case IX86_BUILTIN_4DPWSSD_MASK: - nar_mode = V4SImode; - mode = V16SImode; - wide_mode = V64SImode; - fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask; - fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz; - goto v4fma_expand; - - case IX86_BUILTIN_4DPWSSDS_MASK: - nar_mode = V4SImode; - mode = V16SImode; - wide_mode = V64SImode; - fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask; - fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz; - goto v4fma_expand; - - case IX86_BUILTIN_4FMAPS_MASK: - { - tree args[4]; - rtx ops[4]; - rtx wide_reg; - rtx accum; - rtx addr; - rtx mem; - -v4fma_expand: - wide_reg = gen_reg_rtx (wide_mode); - for (i = 0; i < 4; i++) - { - args[i] = CALL_EXPR_ARG (exp, i); - ops[i] = expand_normal (args[i]); - - emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64), - ops[i]); - } - - accum = expand_normal (CALL_EXPR_ARG (exp, 4)); - accum = force_reg (mode, accum); - - addr = expand_normal (CALL_EXPR_ARG (exp, 5)); - addr = force_reg (Pmode, addr); - - mem = gen_rtx_MEM 
(nar_mode, addr); - - target = gen_reg_rtx (mode); - - emit_move_insn (target, accum); - - if (! masked) - emit_insn (fcn (target, accum, wide_reg, mem)); - else - { - rtx merge, mask; - merge = expand_normal (CALL_EXPR_ARG (exp, 6)); - - mask = expand_normal (CALL_EXPR_ARG (exp, 7)); - - if (CONST_INT_P (mask)) - mask = fixup_modeless_constant (mask, HImode); - - mask = force_reg (HImode, mask); - - if (GET_MODE (mask) != HImode) - mask = gen_rtx_SUBREG (HImode, mask, 0); - - /* If merge is 0 then we're about to emit z-masked variant. */ - if (const0_operand (merge, mode)) - emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask)); - /* If merge is the same as accum then emit merge-masked variant. */ - else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4)) - { - merge = force_reg (mode, merge); - emit_insn (fcn_mask (target, wide_reg, mem, merge, mask)); - } - /* Merge with something unknown might happen if we z-mask w/ -O0. */ - else - { - target = gen_reg_rtx (mode); - emit_move_insn (target, merge); - emit_insn (fcn_mask (target, wide_reg, mem, target, mask)); - } - } - return target; - } - - case IX86_BUILTIN_4FNMASS: - fcn = gen_avx5124fmaddps_4fnmaddss; - masked = 0; - goto s4fma_expand; - - case IX86_BUILTIN_4FMASS: - fcn = gen_avx5124fmaddps_4fmaddss; - masked = 0; - goto s4fma_expand; - - case IX86_BUILTIN_4FNMASS_MASK: - fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask; - fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz; - goto s4fma_expand; - - case IX86_BUILTIN_4FMASS_MASK: - { - tree args[4]; - rtx ops[4]; - rtx wide_reg; - rtx accum; - rtx addr; - rtx mem; - - fcn_mask = gen_avx5124fmaddps_4fmaddss_mask; - fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz; - -s4fma_expand: - mode = V4SFmode; - wide_reg = gen_reg_rtx (V64SFmode); - for (i = 0; i < 4; i++) - { - rtx tmp; - args[i] = CALL_EXPR_ARG (exp, i); - ops[i] = expand_normal (args[i]); - - tmp = gen_reg_rtx (SFmode); - emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0)); - - emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64), - gen_rtx_SUBREG (V16SFmode, tmp, 0)); - } - - accum = expand_normal (CALL_EXPR_ARG (exp, 4)); - accum = force_reg (V4SFmode, accum); - - addr = expand_normal (CALL_EXPR_ARG (exp, 5)); - addr = force_reg (Pmode, addr); - - mem = gen_rtx_MEM (V4SFmode, addr); - - target = gen_reg_rtx (V4SFmode); - - emit_move_insn (target, accum); - - if (! masked) - emit_insn (fcn (target, accum, wide_reg, mem)); - else - { - rtx merge, mask; - merge = expand_normal (CALL_EXPR_ARG (exp, 6)); - - mask = expand_normal (CALL_EXPR_ARG (exp, 7)); - - if (CONST_INT_P (mask)) - mask = fixup_modeless_constant (mask, QImode); - - mask = force_reg (QImode, mask); - - if (GET_MODE (mask) != QImode) - mask = gen_rtx_SUBREG (QImode, mask, 0); - - /* If merge is 0 then we're about to emit z-masked variant. */ - if (const0_operand (merge, mode)) - emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask)); - /* If merge is the same as accum then emit merge-masked - variant. */ - else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4)) - { - merge = force_reg (mode, merge); - emit_insn (fcn_mask (target, wide_reg, mem, merge, mask)); - } - /* Merge with something unknown might happen if we z-mask - w/ -O0. 
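For orientation, the merge-vs-zero choice being made here (emit the _maskz pattern when the merge operand is literal zero, the _mask pattern when it equals the accumulator, and copy the merge value into the destination first otherwise) is the ordinary AVX-512 masking distinction. A generic illustration with plain AVX-512F intrinsics, not the 4FMAPS/4VNNIW intrinsics themselves (requires -mavx512f):

    #include <immintrin.h>

    /* Merge-masking keeps the src lane where the mask bit is clear;
       zero-masking clears it.  */
    __m512
    masked_add (__m512 src, __mmask16 m, __m512 a, __m512 b)
    {
      __m512 merge = _mm512_mask_add_ps (src, m, a, b);   /* _mask pattern  */
      __m512 zero  = _mm512_maskz_add_ps (m, a, b);       /* _maskz pattern */
      return _mm512_add_ps (merge, zero);
    }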
*/ - else - { - target = gen_reg_rtx (mode); - emit_move_insn (target, merge); - emit_insn (fcn_mask (target, wide_reg, mem, target, mask)); - } - } - return target; - } - case IX86_BUILTIN_RDPID: - return ix86_expand_special_args_builtin (bdesc_args + i, exp, - target); - case IX86_BUILTIN_FABSQ: - case IX86_BUILTIN_COPYSIGNQ: - if (!TARGET_SSE) - /* Emit a normal call if SSE isn't available. */ - return expand_call (exp, target, ignore); - /* FALLTHRU */ - default: - return ix86_expand_args_builtin (bdesc_args + i, exp, target); - } - } - - if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST - && fcode <= IX86_BUILTIN__BDESC_COMI_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST; - return ix86_expand_sse_comi (bdesc_comi + i, exp, target); - } - - if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST - && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST; - return ix86_expand_round_builtin (bdesc_round_args + i, exp, target); - } - - if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST - && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST; - return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target); - } - - if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST - && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST; - return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target); - } - - if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST - && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST; - const struct builtin_description *d = bdesc_multi_arg + i; - return ix86_expand_multi_arg_builtin (d->icode, exp, target, - (enum ix86_builtin_func_type) - d->flag, d->comparison); - } - - if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST - && fcode <= IX86_BUILTIN__BDESC_CET_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_CET_FIRST; - return ix86_expand_special_args_builtin (bdesc_cet + i, exp, - target); - } - - if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST - && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST) - { - i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST; - return ix86_expand_special_args_builtin (bdesc_cet_rdssp + i, exp, - target); - } - - gcc_unreachable (); -} - -/* This returns the target-specific builtin with code CODE if - current_function_decl has visibility on this builtin, which is checked - using isa flags. Returns NULL_TREE otherwise. */ - -static tree ix86_get_builtin (enum ix86_builtins code) -{ - struct cl_target_option *opts; - tree target_tree = NULL_TREE; - - /* Determine the isa flags of current_function_decl. */ - - if (current_function_decl) - target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl); - - if (target_tree == NULL) - target_tree = target_option_default_node; - - opts = TREE_TARGET_OPTION (target_tree); - - if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags) - || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2)) - return ix86_builtin_decl (code, true); - else - return NULL_TREE; -} - -/* Returns a function decl for a vectorized version of the combined function - with combined_fn code FN and the result vector type TYPE, or NULL_TREE - if it is not available. 
*/ - -static tree -ix86_builtin_vectorized_function (unsigned int fn, tree type_out, - tree type_in) -{ - machine_mode in_mode, out_mode; - int in_n, out_n; - - if (TREE_CODE (type_out) != VECTOR_TYPE - || TREE_CODE (type_in) != VECTOR_TYPE) - return NULL_TREE; - - out_mode = TYPE_MODE (TREE_TYPE (type_out)); - out_n = TYPE_VECTOR_SUBPARTS (type_out); - in_mode = TYPE_MODE (TREE_TYPE (type_in)); - in_n = TYPE_VECTOR_SUBPARTS (type_in); - - switch (fn) - { - CASE_CFN_EXP2: - if (out_mode == SFmode && in_mode == SFmode) - { - if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_EXP2PS); - } - break; - - CASE_CFN_IFLOOR: - CASE_CFN_LFLOOR: - CASE_CFN_LLFLOOR: - /* The round insn does not trap on denormals. */ - if (flag_trapping_math || !TARGET_SSE4_1) - break; - - if (out_mode == SImode && in_mode == DFmode) - { - if (out_n == 4 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX); - else if (out_n == 8 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256); - else if (out_n == 16 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512); - } - if (out_mode == SImode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256); - else if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512); - } - break; - - CASE_CFN_ICEIL: - CASE_CFN_LCEIL: - CASE_CFN_LLCEIL: - /* The round insn does not trap on denormals. */ - if (flag_trapping_math || !TARGET_SSE4_1) - break; - - if (out_mode == SImode && in_mode == DFmode) - { - if (out_n == 4 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX); - else if (out_n == 8 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256); - else if (out_n == 16 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512); - } - if (out_mode == SImode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256); - else if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512); - } - break; - - CASE_CFN_IRINT: - CASE_CFN_LRINT: - CASE_CFN_LLRINT: - if (out_mode == SImode && in_mode == DFmode) - { - if (out_n == 4 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX); - else if (out_n == 8 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256); - else if (out_n == 16 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512); - } - if (out_mode == SImode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256); - else if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512); - } - break; - - CASE_CFN_IROUND: - CASE_CFN_LROUND: - CASE_CFN_LLROUND: - /* The round insn does not trap on denormals. 
*/ - if (flag_trapping_math || !TARGET_SSE4_1) - break; - - if (out_mode == SImode && in_mode == DFmode) - { - if (out_n == 4 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX); - else if (out_n == 8 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256); - else if (out_n == 16 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512); - } - if (out_mode == SImode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256); - else if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512); - } - break; - - CASE_CFN_FLOOR: - /* The round insn does not trap on denormals. */ - if (flag_trapping_math || !TARGET_SSE4_1) - break; - - if (out_mode == DFmode && in_mode == DFmode) - { - if (out_n == 2 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_FLOORPD); - else if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_FLOORPD256); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_FLOORPD512); - } - if (out_mode == SFmode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_FLOORPS); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_FLOORPS256); - else if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_FLOORPS512); - } - break; - - CASE_CFN_CEIL: - /* The round insn does not trap on denormals. */ - if (flag_trapping_math || !TARGET_SSE4_1) - break; - - if (out_mode == DFmode && in_mode == DFmode) - { - if (out_n == 2 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_CEILPD); - else if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_CEILPD256); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_CEILPD512); - } - if (out_mode == SFmode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_CEILPS); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_CEILPS256); - else if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_CEILPS512); - } - break; - - CASE_CFN_TRUNC: - /* The round insn does not trap on denormals. */ - if (flag_trapping_math || !TARGET_SSE4_1) - break; - - if (out_mode == DFmode && in_mode == DFmode) - { - if (out_n == 2 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_TRUNCPD); - else if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512); - } - if (out_mode == SFmode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_TRUNCPS); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256); - else if (out_n == 16 && in_n == 16) - return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512); - } - break; - - CASE_CFN_RINT: - /* The round insn does not trap on denormals. 
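To make the CASE_CFN_FLOOR/CEIL/TRUNC mappings above concrete: when the target has SSE4.1 and trapping math is disabled, the vectorizer can ask for a vector form of floorf and be handed the packed-round builtin instead of a libm call. A small illustration (my example, not from the patch):

    #include <math.h>

    /* With something like -O3 -msse4.1 -fno-trapping-math this loop may be
       vectorized using IX86_BUILTIN_FLOORPS (i.e. ROUNDPS) via the
       CASE_CFN_FLOOR mapping above.  */
    void
    vec_floor (float *restrict out, const float *restrict in, int n)
    {
      for (int i = 0; i < n; i++)
        out[i] = floorf (in[i]);
    }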
*/ - if (flag_trapping_math || !TARGET_SSE4_1) - break; - - if (out_mode == DFmode && in_mode == DFmode) - { - if (out_n == 2 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_RINTPD); - else if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_RINTPD256); - } - if (out_mode == SFmode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_RINTPS); - else if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_RINTPS256); - } - break; - - CASE_CFN_FMA: - if (out_mode == DFmode && in_mode == DFmode) - { - if (out_n == 2 && in_n == 2) - return ix86_get_builtin (IX86_BUILTIN_VFMADDPD); - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256); - } - if (out_mode == SFmode && in_mode == SFmode) - { - if (out_n == 4 && in_n == 4) - return ix86_get_builtin (IX86_BUILTIN_VFMADDPS); - if (out_n == 8 && in_n == 8) - return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256); - } - break; - - default: - break; - } - - /* Dispatch to a handler for a vectorization library. */ - if (ix86_veclib_handler) - return ix86_veclib_handler (combined_fn (fn), type_out, type_in); - - return NULL_TREE; -} - -/* Handler for an SVML-style interface to - a library with vectorized intrinsics. */ - -static tree -ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in) -{ - char name[20]; - tree fntype, new_fndecl, args; - unsigned arity; - const char *bname; - machine_mode el_mode, in_mode; - int n, in_n; - - /* The SVML is suitable for unsafe math only. */ - if (!flag_unsafe_math_optimizations) - return NULL_TREE; - - el_mode = TYPE_MODE (TREE_TYPE (type_out)); - n = TYPE_VECTOR_SUBPARTS (type_out); - in_mode = TYPE_MODE (TREE_TYPE (type_in)); - in_n = TYPE_VECTOR_SUBPARTS (type_in); - if (el_mode != in_mode - || n != in_n) - return NULL_TREE; - - switch (fn) - { - CASE_CFN_EXP: - CASE_CFN_LOG: - CASE_CFN_LOG10: - CASE_CFN_POW: - CASE_CFN_TANH: - CASE_CFN_TAN: - CASE_CFN_ATAN: - CASE_CFN_ATAN2: - CASE_CFN_ATANH: - CASE_CFN_CBRT: - CASE_CFN_SINH: - CASE_CFN_SIN: - CASE_CFN_ASINH: - CASE_CFN_ASIN: - CASE_CFN_COSH: - CASE_CFN_COS: - CASE_CFN_ACOSH: - CASE_CFN_ACOS: - if ((el_mode != DFmode || n != 2) - && (el_mode != SFmode || n != 4)) - return NULL_TREE; - break; - - default: - return NULL_TREE; - } - - tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn); - bname = IDENTIFIER_POINTER (DECL_NAME (fndecl)); - - if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF) - strcpy (name, "vmlsLn4"); - else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG) - strcpy (name, "vmldLn2"); - else if (n == 4) - { - sprintf (name, "vmls%s", bname+10); - name[strlen (name)-1] = '4'; - } - else - sprintf (name, "vmld%s2", bname+10); - - /* Convert to uppercase. */ - name[4] &= ~0x20; - - arity = 0; - for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args)) - arity++; - - if (arity == 1) - fntype = build_function_type_list (type_out, type_in, NULL); - else - fntype = build_function_type_list (type_out, type_in, type_in, NULL); - - /* Build a function declaration for the vectorized function. */ - new_fndecl = build_decl (BUILTINS_LOCATION, - FUNCTION_DECL, get_identifier (name), fntype); - TREE_PUBLIC (new_fndecl) = 1; - DECL_EXTERNAL (new_fndecl) = 1; - DECL_IS_NOVOPS (new_fndecl) = 1; - TREE_READONLY (new_fndecl) = 1; - - return new_fndecl; -} - -/* Handler for an ACML-style interface to - a library with vectorized intrinsics. 
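For reference, the SVML name construction in ix86_veclibabi_svml above works out as follows for a 4 x float sinf (worked by hand from the code, not taken from any SVML documentation): bname is "__builtin_sinf", so bname + 10 is "sinf"; the sprintf gives "vmlssinf"; overwriting the last character with '4' gives "vmlssin4"; and clearing bit 0x20 of name[4] upper-cases it to "vmlsSin4". The 2 x double case produces "vmldSin2" the same way, while log is special-cased up front to "vmlsLn4"/"vmldLn2". A user-level sketch of when this triggers (illustrative only):

    #include <math.h>

    /* With -O3 -ffast-math -mveclibabi=svml and 128-bit vectors, this loop
       may end up calling vmlsSin4, which the SVML library is then expected
       to provide at link time.  */
    void
    apply_sin (float *x, int n)
    {
      for (int i = 0; i < n; i++)
        x[i] = sinf (x[i]);
    }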
*/ - -static tree -ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in) -{ - char name[20] = "__vr.._"; - tree fntype, new_fndecl, args; - unsigned arity; - const char *bname; - machine_mode el_mode, in_mode; - int n, in_n; - - /* The ACML is 64bits only and suitable for unsafe math only as - it does not correctly support parts of IEEE with the required - precision such as denormals. */ - if (!TARGET_64BIT - || !flag_unsafe_math_optimizations) - return NULL_TREE; - - el_mode = TYPE_MODE (TREE_TYPE (type_out)); - n = TYPE_VECTOR_SUBPARTS (type_out); - in_mode = TYPE_MODE (TREE_TYPE (type_in)); - in_n = TYPE_VECTOR_SUBPARTS (type_in); - if (el_mode != in_mode - || n != in_n) - return NULL_TREE; - - switch (fn) - { - CASE_CFN_SIN: - CASE_CFN_COS: - CASE_CFN_EXP: - CASE_CFN_LOG: - CASE_CFN_LOG2: - CASE_CFN_LOG10: - if (el_mode == DFmode && n == 2) - { - name[4] = 'd'; - name[5] = '2'; - } - else if (el_mode == SFmode && n == 4) - { - name[4] = 's'; - name[5] = '4'; - } - else - return NULL_TREE; - break; - - default: - return NULL_TREE; - } - - tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn); - bname = IDENTIFIER_POINTER (DECL_NAME (fndecl)); - sprintf (name + 7, "%s", bname+10); - - arity = 0; - for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args)) - arity++; - - if (arity == 1) - fntype = build_function_type_list (type_out, type_in, NULL); - else - fntype = build_function_type_list (type_out, type_in, type_in, NULL); - - /* Build a function declaration for the vectorized function. */ - new_fndecl = build_decl (BUILTINS_LOCATION, - FUNCTION_DECL, get_identifier (name), fntype); - TREE_PUBLIC (new_fndecl) = 1; - DECL_EXTERNAL (new_fndecl) = 1; - DECL_IS_NOVOPS (new_fndecl) = 1; - TREE_READONLY (new_fndecl) = 1; - - return new_fndecl; -} - -/* Returns a decl of a function that implements gather load with - memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE. - Return NULL_TREE if it is not available. */ - -static tree -ix86_vectorize_builtin_gather (const_tree mem_vectype, - const_tree index_type, int scale) -{ - bool si; - enum ix86_builtins code; - - if (! TARGET_AVX2 || !TARGET_USE_GATHER) - return NULL_TREE; - - if ((TREE_CODE (index_type) != INTEGER_TYPE - && !POINTER_TYPE_P (index_type)) - || (TYPE_MODE (index_type) != SImode - && TYPE_MODE (index_type) != DImode)) - return NULL_TREE; - - if (TYPE_PRECISION (index_type) > POINTER_SIZE) - return NULL_TREE; - - /* v*gather* insn sign extends index to pointer mode. */ - if (TYPE_PRECISION (index_type) < POINTER_SIZE - && TYPE_UNSIGNED (index_type)) - return NULL_TREE; - - if (scale <= 0 - || scale > 8 - || (scale & (scale - 1)) != 0) - return NULL_TREE; - - si = TYPE_MODE (index_type) == SImode; - switch (TYPE_MODE (mem_vectype)) - { - case E_V2DFmode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF; - else - code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF; - break; - case E_V4DFmode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF; - else - code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF; - break; - case E_V2DImode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI; - else - code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI; - break; - case E_V4DImode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI; - else - code = si ? 
IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI; - break; - case E_V4SFmode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF; - else - code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF; - break; - case E_V8SFmode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF; - else - code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF; - break; - case E_V4SImode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI; - else - code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI; - break; - case E_V8SImode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI; - else - code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI; - break; - case E_V8DFmode: - if (TARGET_AVX512F) - code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF; - else - return NULL_TREE; - break; - case E_V8DImode: - if (TARGET_AVX512F) - code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI; - else - return NULL_TREE; - break; - case E_V16SFmode: - if (TARGET_AVX512F) - code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF; - else - return NULL_TREE; - break; - case E_V16SImode: - if (TARGET_AVX512F) - code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI; - else - return NULL_TREE; - break; - default: - return NULL_TREE; - } - - return ix86_get_builtin (code); -} - -/* Returns a decl of a function that implements scatter store with - register type VECTYPE and index type INDEX_TYPE and SCALE. - Return NULL_TREE if it is not available. */ - -static tree -ix86_vectorize_builtin_scatter (const_tree vectype, - const_tree index_type, int scale) -{ - bool si; - enum ix86_builtins code; - - if (!TARGET_AVX512F) - return NULL_TREE; - - if ((TREE_CODE (index_type) != INTEGER_TYPE - && !POINTER_TYPE_P (index_type)) - || (TYPE_MODE (index_type) != SImode - && TYPE_MODE (index_type) != DImode)) - return NULL_TREE; - - if (TYPE_PRECISION (index_type) > POINTER_SIZE) - return NULL_TREE; - - /* v*scatter* insn sign extends index to pointer mode. */ - if (TYPE_PRECISION (index_type) < POINTER_SIZE - && TYPE_UNSIGNED (index_type)) - return NULL_TREE; - - /* Scale can be 1, 2, 4 or 8. */ - if (scale <= 0 - || scale > 8 - || (scale & (scale - 1)) != 0) - return NULL_TREE; - - si = TYPE_MODE (index_type) == SImode; - switch (TYPE_MODE (vectype)) - { - case E_V8DFmode: - code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF; - break; - case E_V8DImode: - code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI; - break; - case E_V16SFmode: - code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF; - break; - case E_V16SImode: - code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI; - break; - case E_V4DFmode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_SCATTERALTSIV4DF : IX86_BUILTIN_SCATTERDIV4DF; - else - return NULL_TREE; - break; - case E_V4DImode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_SCATTERALTSIV4DI : IX86_BUILTIN_SCATTERDIV4DI; - else - return NULL_TREE; - break; - case E_V8SFmode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_SCATTERSIV8SF : IX86_BUILTIN_SCATTERALTDIV8SF; - else - return NULL_TREE; - break; - case E_V8SImode: - if (TARGET_AVX512VL) - code = si ? 
IX86_BUILTIN_SCATTERSIV8SI : IX86_BUILTIN_SCATTERALTDIV8SI; - else - return NULL_TREE; - break; - case E_V2DFmode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_SCATTERALTSIV2DF : IX86_BUILTIN_SCATTERDIV2DF; - else - return NULL_TREE; - break; - case E_V2DImode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_SCATTERALTSIV2DI : IX86_BUILTIN_SCATTERDIV2DI; - else - return NULL_TREE; - break; - case E_V4SFmode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_SCATTERSIV4SF : IX86_BUILTIN_SCATTERALTDIV4SF; - else - return NULL_TREE; - break; - case E_V4SImode: - if (TARGET_AVX512VL) - code = si ? IX86_BUILTIN_SCATTERSIV4SI : IX86_BUILTIN_SCATTERALTDIV4SI; - else - return NULL_TREE; - break; - default: - return NULL_TREE; - } - - return ix86_builtins[code]; -} - -/* Return true if it is safe to use the rsqrt optabs to optimize - 1.0/sqrt. */ - -static bool -use_rsqrt_p () -{ - return (TARGET_SSE && TARGET_SSE_MATH - && flag_finite_math_only - && !flag_trapping_math - && flag_unsafe_math_optimizations); -} - -/* Returns a code for a target-specific builtin that implements - reciprocal of the function, or NULL_TREE if not available. */ - -static tree -ix86_builtin_reciprocal (tree fndecl) -{ - enum ix86_builtins fn_code - = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl); - switch (fn_code) - { - /* Vectorized version of sqrt to rsqrt conversion. */ - case IX86_BUILTIN_SQRTPS_NR: - return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR); - - case IX86_BUILTIN_SQRTPS_NR256: - return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256); - - default: - return NULL_TREE; - } -} - -/* Helper for avx_vpermilps256_operand et al. This is also used by - the expansion functions to turn the parallel back into a mask. - The return value is 0 for no match and the imm8+1 for a match. */ - -int -avx_vpermilp_parallel (rtx par, machine_mode mode) -{ - unsigned i, nelt = GET_MODE_NUNITS (mode); - unsigned mask = 0; - unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */ - - if (XVECLEN (par, 0) != (int) nelt) - return 0; - - /* Validate that all of the elements are constants, and not totally - out of range. Copy the data into an integral array to make the - subsequent checks easier. */ - for (i = 0; i < nelt; ++i) - { - rtx er = XVECEXP (par, 0, i); - unsigned HOST_WIDE_INT ei; - - if (!CONST_INT_P (er)) - return 0; - ei = INTVAL (er); - if (ei >= nelt) - return 0; - ipar[i] = ei; - } - - switch (mode) - { - case E_V8DFmode: - /* In the 512-bit DFmode case, we can only move elements within - a 128-bit lane. First fill the second part of the mask, - then fallthru. */ - for (i = 4; i < 6; ++i) - { - if (ipar[i] < 4 || ipar[i] >= 6) - return 0; - mask |= (ipar[i] - 4) << i; - } - for (i = 6; i < 8; ++i) - { - if (ipar[i] < 6) - return 0; - mask |= (ipar[i] - 6) << i; - } - /* FALLTHRU */ - - case E_V4DFmode: - /* In the 256-bit DFmode case, we can only move elements within - a 128-bit lane. */ - for (i = 0; i < 2; ++i) - { - if (ipar[i] >= 2) - return 0; - mask |= ipar[i] << i; - } - for (i = 2; i < 4; ++i) - { - if (ipar[i] < 2) - return 0; - mask |= (ipar[i] - 2) << i; - } - break; - - case E_V16SFmode: - /* In 512 bit SFmode case, permutation in the upper 256 bits - must mirror the permutation in the lower 256-bits. */ - for (i = 0; i < 8; ++i) - if (ipar[i] + 8 != ipar[i + 8]) - return 0; - /* FALLTHRU */ - - case E_V8SFmode: - /* In 256 bit SFmode case, we have full freedom of - movement within the low 128-bit lane, but the high 128-bit - lane must mirror the exact same pattern. 
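A concrete instance of the 128-bit branch just below (selector values chosen for illustration): for V4SFmode with element selector {1, 0, 3, 2}, nelt is 4, so each index occupies a 2-bit field and mask = 1 | (0 << 2) | (3 << 4) | (2 << 6) = 0xb1; the function returns mask + 1 = 0xb2, and 0xb1 is the vpermilps immediate that swaps adjacent elements. A hand-written check of just that branch (illustrative, mirrors the loop below):

    /* Recompute the V4SF vpermilps immediate from a 4-element selector,
       each sel[i] < 4; matches mask |= ipar[i] << (i * (nelt / 2)).  */
    unsigned
    v4sf_vpermilp_imm (const unsigned char sel[4])
    {
      unsigned mask = 0;
      for (unsigned i = 0; i < 4; ++i)
        mask |= sel[i] << (i * 2);
      return mask;              /* {1, 0, 3, 2} -> 0xb1 */
    }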
*/ - for (i = 0; i < 4; ++i) - if (ipar[i] + 4 != ipar[i + 4]) - return 0; - nelt = 4; - /* FALLTHRU */ - - case E_V2DFmode: - case E_V4SFmode: - /* In the 128-bit case, we've full freedom in the placement of - the elements from the source operand. */ - for (i = 0; i < nelt; ++i) - mask |= ipar[i] << (i * (nelt / 2)); - break; - - default: - gcc_unreachable (); - } - - /* Make sure success has a non-zero value by adding one. */ - return mask + 1; -} - -/* Helper for avx_vperm2f128_v4df_operand et al. This is also used by - the expansion functions to turn the parallel back into a mask. - The return value is 0 for no match and the imm8+1 for a match. */ - -int -avx_vperm2f128_parallel (rtx par, machine_mode mode) -{ - unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2; - unsigned mask = 0; - unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */ - - if (XVECLEN (par, 0) != (int) nelt) - return 0; - - /* Validate that all of the elements are constants, and not totally - out of range. Copy the data into an integral array to make the - subsequent checks easier. */ - for (i = 0; i < nelt; ++i) - { - rtx er = XVECEXP (par, 0, i); - unsigned HOST_WIDE_INT ei; - - if (!CONST_INT_P (er)) - return 0; - ei = INTVAL (er); - if (ei >= 2 * nelt) - return 0; - ipar[i] = ei; - } - - /* Validate that the halves of the permute are halves. */ - for (i = 0; i < nelt2 - 1; ++i) - if (ipar[i] + 1 != ipar[i + 1]) - return 0; - for (i = nelt2; i < nelt - 1; ++i) - if (ipar[i] + 1 != ipar[i + 1]) - return 0; - - /* Reconstruct the mask. */ - for (i = 0; i < 2; ++i) - { - unsigned e = ipar[i * nelt2]; - if (e % nelt2) - return 0; - e /= nelt2; - mask |= e << (i * 4); - } - - /* Make sure success has a non-zero value by adding one. */ - return mask + 1; -} - -/* Return a register priority for hard reg REGNO. */ -static int -ix86_register_priority (int hard_regno) -{ - /* ebp and r13 as the base always wants a displacement, r12 as the - base always wants an index. So discourage their usage in an - address. */ - if (hard_regno == R12_REG || hard_regno == R13_REG) - return 0; - if (hard_regno == BP_REG) - return 1; - /* New x86-64 int registers result in bigger code size. Discourage - them. */ - if (IN_RANGE (hard_regno, FIRST_REX_INT_REG, LAST_REX_INT_REG)) - return 2; - /* New x86-64 SSE registers result in bigger code size. Discourage - them. */ - if (IN_RANGE (hard_regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG)) - return 2; - if (IN_RANGE (hard_regno, FIRST_EXT_REX_SSE_REG, LAST_EXT_REX_SSE_REG)) - return 1; - /* Usage of AX register results in smaller code. Prefer it. */ - if (hard_regno == AX_REG) - return 4; - return 3; -} - -/* Implement TARGET_PREFERRED_RELOAD_CLASS. - - Put float CONST_DOUBLE in the constant pool instead of fp regs. - QImode must go into class Q_REGS. - Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and - movdf to do mem-to-mem moves through integer regs. */ - -static reg_class_t -ix86_preferred_reload_class (rtx x, reg_class_t regclass) -{ - machine_mode mode = GET_MODE (x); - - /* We're only allowed to return a subclass of CLASS. Many of the - following checks fail for NO_REGS, so eliminate that early. */ - if (regclass == NO_REGS) - return NO_REGS; - - /* All classes can load zeros. */ - if (x == CONST0_RTX (mode)) - return regclass; - - /* Force constants into memory if we are loading a (nonzero) constant into - an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK - instructions to load from a constant. 
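Stepping back to avx_vperm2f128_parallel above, a worked instance (selector chosen for illustration): for V4DFmode the parallel {2, 3, 4, 5} over the concatenated 8-element input means the low 128-bit half comes from elements 2-3 (the high lane of the first operand) and the high half from elements 4-5 (the low lane of the second operand); dividing each half's first index by nelt2 == 2 gives 1 and 2, so mask = 1 | (2 << 4) = 0x21 and the function returns 0x22. At the intrinsic level that selector corresponds to (illustrative, requires -mavx):

    #include <immintrin.h>

    /* imm8 0x21: low lane from a's high lane, high lane from b's low lane,
       i.e. { a[2], a[3], b[0], b[1] }.  */
    __m256d
    lane_mix (__m256d a, __m256d b)
    {
      return _mm256_permute2f128_pd (a, b, 0x21);
    }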
*/ - if (CONSTANT_P (x) - && (MAYBE_MMX_CLASS_P (regclass) - || MAYBE_SSE_CLASS_P (regclass) - || MAYBE_MASK_CLASS_P (regclass))) - return NO_REGS; - - /* Floating-point constants need more complex checks. */ - if (CONST_DOUBLE_P (x)) - { - /* General regs can load everything. */ - if (INTEGER_CLASS_P (regclass)) - return regclass; - - /* Floats can load 0 and 1 plus some others. Note that we eliminated - zero above. We only want to wind up preferring 80387 registers if - we plan on doing computation with them. */ - if (IS_STACK_MODE (mode) - && standard_80387_constant_p (x) > 0) - { - /* Limit class to FP regs. */ - if (FLOAT_CLASS_P (regclass)) - return FLOAT_REGS; - } - - return NO_REGS; - } - - /* Prefer SSE regs only, if we can use them for math. */ - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - return SSE_CLASS_P (regclass) ? regclass : NO_REGS; - - /* Generally when we see PLUS here, it's the function invariant - (plus soft-fp const_int). Which can only be computed into general - regs. */ - if (GET_CODE (x) == PLUS) - return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS; - - /* QImode constants are easy to load, but non-constant QImode data - must go into Q_REGS. */ - if (GET_MODE (x) == QImode && !CONSTANT_P (x)) - { - if (Q_CLASS_P (regclass)) - return regclass; - else if (reg_class_subset_p (Q_REGS, regclass)) - return Q_REGS; - else - return NO_REGS; - } - - return regclass; -} - -/* Discourage putting floating-point values in SSE registers unless - SSE math is being used, and likewise for the 387 registers. */ -static reg_class_t -ix86_preferred_output_reload_class (rtx x, reg_class_t regclass) -{ - machine_mode mode = GET_MODE (x); - - /* Restrict the output reload class to the register bank that we are doing - math on. If we would like not to return a subset of CLASS, reject this - alternative: if reload cannot do this, it will still use its choice. */ - mode = GET_MODE (x); - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS; - - if (IS_STACK_MODE (mode)) - return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS; - - return regclass; -} - -static reg_class_t -ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass, - machine_mode mode, secondary_reload_info *sri) -{ - /* Double-word spills from general registers to non-offsettable memory - references (zero-extended addresses) require special handling. */ - if (TARGET_64BIT - && MEM_P (x) - && GET_MODE_SIZE (mode) > UNITS_PER_WORD - && INTEGER_CLASS_P (rclass) - && !offsettable_memref_p (x)) - { - sri->icode = (in_p - ? CODE_FOR_reload_noff_load - : CODE_FOR_reload_noff_store); - /* Add the cost of moving address to a temporary. */ - sri->extra_cost = 1; - - return NO_REGS; - } - - /* QImode spills from non-QI registers require - intermediate register on 32bit targets. */ - if (mode == QImode - && ((!TARGET_64BIT && !in_p - && INTEGER_CLASS_P (rclass) - && MAYBE_NON_Q_CLASS_P (rclass)) - || (!TARGET_AVX512DQ - && MAYBE_MASK_CLASS_P (rclass)))) - { - int regno = true_regnum (x); - - /* Return Q_REGS if the operand is in memory. */ - if (regno == -1) - return Q_REGS; - - return NO_REGS; - } - - /* This condition handles corner case where an expression involving - pointers gets vectorized. We're trying to use the address of a - stack slot as a vector initializer. 
- - (set (reg:V2DI 74 [ vect_cst_.2 ]) - (vec_duplicate:V2DI (reg/f:DI 20 frame))) - - Eventually frame gets turned into sp+offset like this: - - (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) - (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp) - (const_int 392 [0x188])))) - - That later gets turned into: - - (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) - (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp) - (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64])))) - - We'll have the following reload recorded: - - Reload 0: reload_in (DI) = - (plus:DI (reg/f:DI 7 sp) - (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64])) - reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) - SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine - reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188])) - reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) - reload_reg_rtx: (reg:V2DI 22 xmm1) - - Which isn't going to work since SSE instructions can't handle scalar - additions. Returning GENERAL_REGS forces the addition into integer - register and reload can handle subsequent reloads without problems. */ - - if (in_p && GET_CODE (x) == PLUS - && SSE_CLASS_P (rclass) - && SCALAR_INT_MODE_P (mode)) - return GENERAL_REGS; - - return NO_REGS; -} - -/* Implement TARGET_CLASS_LIKELY_SPILLED_P. */ - -static bool -ix86_class_likely_spilled_p (reg_class_t rclass) -{ - switch (rclass) - { - case AREG: - case DREG: - case CREG: - case BREG: - case AD_REGS: - case SIREG: - case DIREG: - case SSE_FIRST_REG: - case FP_TOP_REG: - case FP_SECOND_REG: - return true; - - default: - break; - } - - return false; -} - -/* If we are copying between registers from different register sets - (e.g. FP and integer), we may need a memory location. - - The function can't work reliably when one of the CLASSES is a class - containing registers from multiple sets. We avoid this by never combining - different sets in a single alternative in the machine description. - Ensure that this constraint holds to avoid unexpected surprises. - - When STRICT is false, we are being called from REGISTER_MOVE_COST, - so do not enforce these sanity checks. - - To optimize register_move_cost performance, define inline variant. */ - -static inline bool -inline_secondary_memory_needed (machine_mode mode, reg_class_t class1, - reg_class_t class2, int strict) -{ - if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS)) - return false; - - if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1) - || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2) - || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1) - || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2) - || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1) - || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2) - || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1) - || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2)) - { - gcc_assert (!strict || lra_in_progress); - return true; - } - - if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2)) - return true; - - /* Between mask and general, we have moves no larger than word size. */ - if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2)) - && (GET_MODE_SIZE (mode) > UNITS_PER_WORD)) - return true; - - /* ??? This is a lie. We do have moves between mmx/general, and for - mmx/sse2. But by saying we need secondary memory we discourage the - register allocator from using the mmx registers unless needed. 
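
Stripped of the register-class macros, the test performed by inline_secondary_memory_needed reduces to a handful of register-file comparisons.  A much simplified model follows, separate from the patch; the enum, the flags and the parameters are stand-ins rather than the real target machinery.

  #include <stdbool.h>
  #include <stdio.h>

  enum reg_kind { GPR, X87, SSE, MMX, MASK_REG };

  /* Very reduced model of the decision above: moves that cross certain
     register files have to bounce through memory.  The flags stand in
     for TARGET_SSE2 and the TARGET_INTER_UNIT_MOVES_* tunings.  */
  static bool
  needs_memory (enum reg_kind from, enum reg_kind to, int mode_bytes,
                int word_bytes, bool have_sse2, bool cheap_inter_unit)
  {
    if ((from == X87) != (to == X87))
      return true;
    if ((from == MASK_REG) != (to == MASK_REG) && mode_bytes > word_bytes)
      return true;
    if ((from == MMX) != (to == MMX))
      return true;              /* discourage MMX, as noted above */
    if ((from == SSE) != (to == SSE))
      return !have_sse2 || !cheap_inter_unit || mode_bytes > word_bytes;
    return false;
  }

  int
  main (void)
  {
    printf ("GPR -> SSE, DImode, SSE2: %d\n",
            needs_memory (GPR, SSE, 8, 8, true, true));
    return 0;
  }
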
*/ - if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)) - return true; - - if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2)) - { - /* SSE1 doesn't have any direct moves from other classes. */ - if (!TARGET_SSE2) - return true; - - /* If the target says that inter-unit moves are more expensive - than moving through memory, then don't generate them. */ - if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC) - || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC)) - return true; - - /* Between SSE and general, we have moves no larger than word size. */ - if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) - return true; - } - - return false; -} - -/* Implement TARGET_SECONDARY_MEMORY_NEEDED. */ - -static bool -ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1, - reg_class_t class2) -{ - return inline_secondary_memory_needed (mode, class1, class2, true); -} - -/* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE. - - get_secondary_mem widens integral modes to BITS_PER_WORD. - There is no need to emit full 64 bit move on 64 bit targets - for integral modes that can be moved using 32 bit move. */ - -static machine_mode -ix86_secondary_memory_needed_mode (machine_mode mode) -{ - if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode)) - return mode_for_size (32, GET_MODE_CLASS (mode), 0).require (); - return mode; -} - -/* Implement the TARGET_CLASS_MAX_NREGS hook. - - On the 80386, this is the size of MODE in words, - except in the FP regs, where a single reg is always enough. */ - -static unsigned char -ix86_class_max_nregs (reg_class_t rclass, machine_mode mode) -{ - if (MAYBE_INTEGER_CLASS_P (rclass)) - { - if (mode == XFmode) - return (TARGET_64BIT ? 2 : 3); - else if (mode == XCmode) - return (TARGET_64BIT ? 4 : 6); - else - return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD); - } - else - { - if (COMPLEX_MODE_P (mode)) - return 2; - else - return 1; - } -} - -/* Implement TARGET_CAN_CHANGE_MODE_CLASS. */ - -static bool -ix86_can_change_mode_class (machine_mode from, machine_mode to, - reg_class_t regclass) -{ - if (from == to) - return true; - - /* x87 registers can't do subreg at all, as all values are reformatted - to extended precision. */ - if (MAYBE_FLOAT_CLASS_P (regclass)) - return false; - - if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass)) - { - /* Vector registers do not support QI or HImode loads. If we don't - disallow a change to these modes, reload will assume it's ok to - drop the subreg from (subreg:SI (reg:HI 100) 0). This affects - the vec_dupv4hi pattern. */ - if (GET_MODE_SIZE (from) < 4) - return false; - } - - return true; -} - -/* Return index of MODE in the sse load/store tables. */ - -static inline int -sse_store_index (machine_mode mode) -{ - switch (GET_MODE_SIZE (mode)) - { - case 4: - return 0; - case 8: - return 1; - case 16: - return 2; - case 32: - return 3; - case 64: - return 4; - default: - return -1; - } -} - -/* Return the cost of moving data of mode M between a - register and memory. A value of 2 is the default; this cost is - relative to those in `REGISTER_MOVE_COST'. - - This function is used extensively by register_move_cost that is used to - build tables at startup. Make it inline in this case. - When IN is 2, return maximum of in and out move cost. - - If moving between registers and memory is more expensive than - between two registers, you should define this macro to express the - relative cost. - - Model also increased moving costs of QImode registers in non - Q_REGS classes. 
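
All of the load/store cost lookups that follow go through the same size-to-slot mapping that sse_store_index implements.  A small sketch, separate from the patch; the table values are made up, the real ones come from the processor_costs table selected by -mtune.

  #include <stdio.h>

  /* Map an access size in bytes to its slot in the sse_load/sse_store
     tables, exactly as sse_store_index does.  */
  static int
  size_to_index (int bytes)
  {
    switch (bytes)
      {
      case 4:  return 0;
      case 8:  return 1;
      case 16: return 2;
      case 32: return 3;
      case 64: return 4;
      default: return -1;
      }
  }

  int
  main (void)
  {
    /* Made-up per-size load costs standing in for ix86_cost->sse_load.  */
    static const int sse_load[5] = { 4, 4, 6, 8, 12 };
    int idx = size_to_index (32);

    printf ("cost = %d\n", idx < 0 ? 100 : sse_load[idx]);
    return 0;
  }
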
- */ -static inline int -inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in) -{ - int cost; - if (FLOAT_CLASS_P (regclass)) - { - int index; - switch (mode) - { - case E_SFmode: - index = 0; - break; - case E_DFmode: - index = 1; - break; - case E_XFmode: - index = 2; - break; - default: - return 100; - } - if (in == 2) - return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]); - return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index]; - } - if (SSE_CLASS_P (regclass)) - { - int index = sse_store_index (mode); - if (index == -1) - return 100; - if (in == 2) - return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]); - return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index]; - } - if (MMX_CLASS_P (regclass)) - { - int index; - switch (GET_MODE_SIZE (mode)) - { - case 4: - index = 0; - break; - case 8: - index = 1; - break; - default: - return 100; - } - if (in == 2) - return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]); - return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index]; - } - switch (GET_MODE_SIZE (mode)) - { - case 1: - if (Q_CLASS_P (regclass) || TARGET_64BIT) - { - if (!in) - return ix86_cost->int_store[0]; - if (TARGET_PARTIAL_REG_DEPENDENCY - && optimize_function_for_speed_p (cfun)) - cost = ix86_cost->movzbl_load; - else - cost = ix86_cost->int_load[0]; - if (in == 2) - return MAX (cost, ix86_cost->int_store[0]); - return cost; - } - else - { - if (in == 2) - return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4); - if (in) - return ix86_cost->movzbl_load; - else - return ix86_cost->int_store[0] + 4; - } - break; - case 2: - if (in == 2) - return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]); - return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1]; - default: - if (in == 2) - cost = MAX (ix86_cost->int_load[2], ix86_cost->int_store[2]); - else if (in) - cost = ix86_cost->int_load[2]; - else - cost = ix86_cost->int_store[2]; - /* Multiply with the number of GPR moves needed. */ - return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD); - } -} - -static int -ix86_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in) -{ - return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0); -} - - -/* Return the cost of moving data from a register in class CLASS1 to - one in class CLASS2. - - It is not required that the cost always equal 2 when FROM is the same as TO; - on some machines it is expensive to move between registers if they are not - general registers. */ - -static int -ix86_register_move_cost (machine_mode mode, reg_class_t class1_i, - reg_class_t class2_i) -{ - enum reg_class class1 = (enum reg_class) class1_i; - enum reg_class class2 = (enum reg_class) class2_i; - - /* In case we require secondary memory, compute cost of the store followed - by load. In order to avoid bad register allocation choices, we need - for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */ - - if (inline_secondary_memory_needed (mode, class1, class2, false)) - { - int cost = 1; - - cost += inline_memory_move_cost (mode, class1, 2); - cost += inline_memory_move_cost (mode, class2, 2); - - /* In case of copying from general_purpose_register we may emit multiple - stores followed by single load causing memory size mismatch stall. - Count this as arbitrarily high cost of 20. 
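
When a move has to bounce through memory, the register move cost above is priced as a store plus a load with extra penalties layered on top.  A worked sketch with hypothetical numbers, separate from the patch.

  #include <stdbool.h>
  #include <stdio.h>

  /* Hypothetical memory move costs for the two classes involved.  */
  #define MEM_COST_CLASS1 6     /* e.g. spilling from an SSE class */
  #define MEM_COST_CLASS2 4     /* e.g. reloading into a GPR class */

  static int
  move_cost_via_memory (bool size_mismatch_stall, bool fp_mmx_overlap)
  {
    int cost = 1;               /* base cost, as in the code above */
    cost += MEM_COST_CLASS1;    /* store the value from class1     */
    cost += MEM_COST_CLASS2;    /* load it back into class2        */
    if (size_mismatch_stall)
      cost += 20;               /* several stores, one wide load   */
    if (fp_mmx_overlap)
      cost += 20;               /* x87/MMX register overlap        */
    return cost;
  }

  int
  main (void)
  {
    printf ("plain: %d  with stall: %d\n",
            move_cost_via_memory (false, false),
            move_cost_via_memory (true, false));
    return 0;
  }
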
*/ - if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD - && TARGET_MEMORY_MISMATCH_STALL - && targetm.class_max_nregs (class1, mode) - > targetm.class_max_nregs (class2, mode)) - cost += 20; - - /* In the case of FP/MMX moves, the registers actually overlap, and we - have to switch modes in order to treat them differently. */ - if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2)) - || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1))) - cost += 20; - - return cost; - } - - /* Moves between SSE/MMX and integer unit are expensive. */ - if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2) - || SSE_CLASS_P (class1) != SSE_CLASS_P (class2)) - - /* ??? By keeping returned value relatively high, we limit the number - of moves between integer and MMX/SSE registers for all targets. - Additionally, high value prevents problem with x86_modes_tieable_p(), - where integer modes in MMX/SSE registers are not tieable - because of missing QImode and HImode moves to, from or between - MMX/SSE registers. */ - return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2) - ? ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer); - - if (MAYBE_FLOAT_CLASS_P (class1)) - return ix86_cost->fp_move; - if (MAYBE_SSE_CLASS_P (class1)) - { - if (GET_MODE_BITSIZE (mode) <= 128) - return ix86_cost->xmm_move; - if (GET_MODE_BITSIZE (mode) <= 256) - return ix86_cost->ymm_move; - return ix86_cost->zmm_move; - } - if (MAYBE_MMX_CLASS_P (class1)) - return ix86_cost->mmx_move; - return 2; -} - -/* Implement TARGET_HARD_REGNO_NREGS. This is ordinarily the length in - words of a value of mode MODE but can be less for certain modes in - special long registers. - - Actually there are no two word move instructions for consecutive - registers. And only registers 0-3 may have mov byte instructions - applied to them. */ - -static unsigned int -ix86_hard_regno_nregs (unsigned int regno, machine_mode mode) -{ - if (GENERAL_REGNO_P (regno)) - { - if (mode == XFmode) - return TARGET_64BIT ? 2 : 3; - if (mode == XCmode) - return TARGET_64BIT ? 4 : 6; - return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD); - } - if (COMPLEX_MODE_P (mode)) - return 2; - if (mode == V64SFmode || mode == V64SImode) - return 4; - return 1; -} - -/* Implement TARGET_HARD_REGNO_MODE_OK. */ - -static bool -ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode) -{ - /* Flags and only flags can only hold CCmode values. */ - if (CC_REGNO_P (regno)) - return GET_MODE_CLASS (mode) == MODE_CC; - if (GET_MODE_CLASS (mode) == MODE_CC - || GET_MODE_CLASS (mode) == MODE_RANDOM - || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT) - return false; - if (STACK_REGNO_P (regno)) - return VALID_FP_MODE_P (mode); - if (MASK_REGNO_P (regno)) - return (VALID_MASK_REG_MODE (mode) - || (TARGET_AVX512BW - && VALID_MASK_AVX512BW_MODE (mode))); - if (SSE_REGNO_P (regno)) - { - /* We implement the move patterns for all vector modes into and - out of SSE registers, even when no operation instructions - are available. */ - - /* For AVX-512 we allow, regardless of regno: - - XI mode - - any of 512-bit wide vector mode - - any scalar mode. */ - if (TARGET_AVX512F - && (mode == XImode - || VALID_AVX512F_REG_MODE (mode) - || VALID_AVX512F_SCALAR_MODE (mode))) - return true; - - /* For AVX-5124FMAPS or AVX-5124VNNIW - allow V64SF and V64SI modes for special regnos. */ - if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW) - && (mode == V64SFmode || mode == V64SImode) - && MOD4_SSE_REGNO_P (regno)) - return true; - - /* TODO check for QI/HI scalars. 
*/ - /* AVX512VL allows sse regs16+ for 128/256 bit modes. */ - if (TARGET_AVX512VL - && (mode == OImode - || mode == TImode - || VALID_AVX256_REG_MODE (mode) - || VALID_AVX512VL_128_REG_MODE (mode))) - return true; - - /* xmm16-xmm31 are only available for AVX-512. */ - if (EXT_REX_SSE_REGNO_P (regno)) - return false; - - /* OImode and AVX modes are available only when AVX is enabled. */ - return ((TARGET_AVX - && VALID_AVX256_REG_OR_OI_MODE (mode)) - || VALID_SSE_REG_MODE (mode) - || VALID_SSE2_REG_MODE (mode) - || VALID_MMX_REG_MODE (mode) - || VALID_MMX_REG_MODE_3DNOW (mode)); - } - if (MMX_REGNO_P (regno)) - { - /* We implement the move patterns for 3DNOW modes even in MMX mode, - so if the register is available at all, then we can move data of - the given mode into or out of it. */ - return (VALID_MMX_REG_MODE (mode) - || VALID_MMX_REG_MODE_3DNOW (mode)); - } - - if (mode == QImode) - { - /* Take care for QImode values - they can be in non-QI regs, - but then they do cause partial register stalls. */ - if (ANY_QI_REGNO_P (regno)) - return true; - if (!TARGET_PARTIAL_REG_STALL) - return true; - /* LRA checks if the hard register is OK for the given mode. - QImode values can live in non-QI regs, so we allow all - registers here. */ - if (lra_in_progress) - return true; - return !can_create_pseudo_p (); - } - /* We handle both integer and floats in the general purpose registers. */ - else if (VALID_INT_MODE_P (mode)) - return true; - else if (VALID_FP_MODE_P (mode)) - return true; - else if (VALID_DFP_MODE_P (mode)) - return true; - /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go - on to use that value in smaller contexts, this can easily force a - pseudo to be allocated to GENERAL_REGS. Since this is no worse than - supporting DImode, allow it. */ - else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode)) - return true; - - return false; -} - -/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that - saves SSE registers across calls is Win64 (thus no need to check the - current ABI here), and with AVX enabled Win64 only guarantees that - the low 16 bytes are saved. */ - -static bool -ix86_hard_regno_call_part_clobbered (rtx_insn *insn ATTRIBUTE_UNUSED, - unsigned int regno, machine_mode mode) -{ - return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16; -} - -/* A subroutine of ix86_modes_tieable_p. Return true if MODE is a - tieable integer mode. */ - -static bool -ix86_tieable_integer_mode_p (machine_mode mode) -{ - switch (mode) - { - case E_HImode: - case E_SImode: - return true; - - case E_QImode: - return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL; - - case E_DImode: - return TARGET_64BIT; - - default: - return false; - } -} - -/* Implement TARGET_MODES_TIEABLE_P. - - Return true if MODE1 is accessible in a register that can hold MODE2 - without copying. That is, all register classes that can hold MODE2 - can also hold MODE1. */ - -static bool -ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2) -{ - if (mode1 == mode2) - return true; - - if (ix86_tieable_integer_mode_p (mode1) - && ix86_tieable_integer_mode_p (mode2)) - return true; - - /* MODE2 being XFmode implies fp stack or general regs, which means we - can tie any smaller floating point modes to it. Note that we do not - tie this with TFmode. */ - if (mode2 == XFmode) - return mode1 == SFmode || mode1 == DFmode; - - /* MODE2 being DFmode implies fp stack, general or sse regs, which means - that we can tie it with SFmode. 
*/ - if (mode2 == DFmode) - return mode1 == SFmode; - - /* If MODE2 is only appropriate for an SSE register, then tie with - any other mode acceptable to SSE registers. */ - if (GET_MODE_SIZE (mode2) == 64 - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) - return (GET_MODE_SIZE (mode1) == 64 - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); - if (GET_MODE_SIZE (mode2) == 32 - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) - return (GET_MODE_SIZE (mode1) == 32 - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); - if (GET_MODE_SIZE (mode2) == 16 - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) - return (GET_MODE_SIZE (mode1) == 16 - && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); - - /* If MODE2 is appropriate for an MMX register, then tie - with any other mode acceptable to MMX registers. */ - if (GET_MODE_SIZE (mode2) == 8 - && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2)) - return (GET_MODE_SIZE (mode1) == 8 - && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1)); - - return false; -} - -/* Return the cost of moving between two registers of mode MODE. */ - -static int -ix86_set_reg_reg_cost (machine_mode mode) -{ - unsigned int units = UNITS_PER_WORD; - - switch (GET_MODE_CLASS (mode)) - { - default: - break; - - case MODE_CC: - units = GET_MODE_SIZE (CCmode); - break; - - case MODE_FLOAT: - if ((TARGET_SSE && mode == TFmode) - || (TARGET_80387 && mode == XFmode) - || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode) - || ((TARGET_80387 || TARGET_SSE) && mode == SFmode)) - units = GET_MODE_SIZE (mode); - break; - - case MODE_COMPLEX_FLOAT: - if ((TARGET_SSE && mode == TCmode) - || (TARGET_80387 && mode == XCmode) - || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode) - || ((TARGET_80387 || TARGET_SSE) && mode == SCmode)) - units = GET_MODE_SIZE (mode); - break; - - case MODE_VECTOR_INT: - case MODE_VECTOR_FLOAT: - if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode)) - || (TARGET_AVX && VALID_AVX256_REG_MODE (mode)) - || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode)) - || (TARGET_SSE && VALID_SSE_REG_MODE (mode)) - || (TARGET_MMX && VALID_MMX_REG_MODE (mode))) - units = GET_MODE_SIZE (mode); - } - - /* Return the cost of moving between two registers of mode MODE, - assuming that the move will be in pieces of at most UNITS bytes. */ - return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units)); -} - -/* Return cost of vector operation in MODE given that scalar version has - COST. */ - -static int -ix86_vec_cost (machine_mode mode, int cost) -{ - if (!VECTOR_MODE_P (mode)) - return cost; - - if (GET_MODE_BITSIZE (mode) == 128 - && TARGET_SSE_SPLIT_REGS) - return cost * 2; - if (GET_MODE_BITSIZE (mode) > 128 - && TARGET_AVX128_OPTIMAL) - return cost * GET_MODE_BITSIZE (mode) / 128; - return cost; -} - -/* Return cost of multiplication in MODE. */ - -static int -ix86_multiplication_cost (const struct processor_costs *cost, - enum machine_mode mode) -{ - machine_mode inner_mode = mode; - if (VECTOR_MODE_P (mode)) - inner_mode = GET_MODE_INNER (mode); - - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - return inner_mode == DFmode ? cost->mulsd : cost->mulss; - else if (X87_FLOAT_MODE_P (mode)) - return cost->fmul; - else if (FLOAT_MODE_P (mode)) - return ix86_vec_cost (mode, - inner_mode == DFmode ? cost->mulsd : cost->mulss); - else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) - { - /* vpmullq is used in this case. No emulation is needed. */ - if (TARGET_AVX512DQ) - return ix86_vec_cost (mode, cost->mulss); - - /* V*QImode is emulated with 7-13 insns. 
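
The scaling that ix86_vec_cost applies is easy to reproduce in isolation.  A sketch, separate from the patch; the boolean parameters stand in for TARGET_SSE_SPLIT_REGS and TARGET_AVX128_OPTIMAL.

  #include <stdbool.h>
  #include <stdio.h>

  /* Scale a scalar operation cost to a BITSIZE-bit vector, following
     the two special cases handled by ix86_vec_cost: tunings that split
     128-bit operations in two, and tunings that prefer 128-bit AVX and
     therefore charge wider operations per 128-bit chunk.  */
  static int
  vec_cost (int bitsize, int scalar_cost, bool sse_split_regs,
            bool avx128_optimal)
  {
    if (bitsize == 128 && sse_split_regs)
      return scalar_cost * 2;
    if (bitsize > 128 && avx128_optimal)
      return scalar_cost * bitsize / 128;
    return scalar_cost;
  }

  int
  main (void)
  {
    printf ("256-bit op on an AVX128-preferring tuning: %d\n",
            vec_cost (256, 5, false, true));    /* 10 */
    return 0;
  }
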
*/ - if (mode == V16QImode || mode == V32QImode) - { - int extra = 11; - if (TARGET_XOP && mode == V16QImode) - extra = 5; - else if (TARGET_SSSE3) - extra = 6; - return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * extra); - } - /* V*DImode is emulated with 5-8 insns. */ - else if (mode == V2DImode || mode == V4DImode) - { - if (TARGET_XOP && mode == V2DImode) - return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 3); - else - return ix86_vec_cost (mode, cost->mulss * 3 + cost->sse_op * 5); - } - /* Without sse4.1, we don't have PMULLD; it's emulated with 7 - insns, including two PMULUDQ. */ - else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX)) - return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5); - else - return ix86_vec_cost (mode, cost->mulss); - } - else - return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7); -} - -/* Return cost of multiplication in MODE. */ - -static int -ix86_division_cost (const struct processor_costs *cost, - enum machine_mode mode) -{ - machine_mode inner_mode = mode; - if (VECTOR_MODE_P (mode)) - inner_mode = GET_MODE_INNER (mode); - - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - return inner_mode == DFmode ? cost->divsd : cost->divss; - else if (X87_FLOAT_MODE_P (mode)) - return cost->fdiv; - else if (FLOAT_MODE_P (mode)) - return ix86_vec_cost (mode, - inner_mode == DFmode ? cost->divsd : cost->divss); - else - return cost->divide[MODE_INDEX (mode)]; -} - -/* Return cost of shift in MODE. - If CONSTANT_OP1 is true, the op1 value is known and set in OP1_VAL. - AND_IN_OP1 specify in op1 is result of and and SHIFT_AND_TRUNCATE - if op1 is a result of subreg. - - SKIP_OP0/1 is set to true if cost of OP0/1 should be ignored. */ - -static int -ix86_shift_rotate_cost (const struct processor_costs *cost, - enum machine_mode mode, bool constant_op1, - HOST_WIDE_INT op1_val, - bool speed, - bool and_in_op1, - bool shift_and_truncate, - bool *skip_op0, bool *skip_op1) -{ - if (skip_op0) - *skip_op0 = *skip_op1 = false; - if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) - { - /* V*QImode is emulated with 1-11 insns. */ - if (mode == V16QImode || mode == V32QImode) - { - int count = 11; - if (TARGET_XOP && mode == V16QImode) - { - /* For XOP we use vpshab, which requires a broadcast of the - value to the variable shift insn. For constants this - means a V16Q const in mem; even when we can perform the - shift with one insn set the cost to prefer paddb. */ - if (constant_op1) - { - if (skip_op1) - *skip_op1 = true; - return ix86_vec_cost (mode, - cost->sse_op - + (speed - ? 2 - : COSTS_N_BYTES - (GET_MODE_UNIT_SIZE (mode)))); - } - count = 3; - } - else if (TARGET_SSSE3) - count = 7; - return ix86_vec_cost (mode, cost->sse_op * count); - } - else - return ix86_vec_cost (mode, cost->sse_op); - } - if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) - { - if (constant_op1) - { - if (op1_val > 32) - return cost->shift_const + COSTS_N_INSNS (2); - else - return cost->shift_const * 2; - } - else - { - if (and_in_op1) - return cost->shift_var * 2; - else - return cost->shift_var * 6 + COSTS_N_INSNS (2); - } - } - else - { - if (constant_op1) - return cost->shift_const; - else if (shift_and_truncate) - { - if (skip_op0) - *skip_op0 = *skip_op1 = true; - /* Return the cost after shift-and truncation. */ - return cost->shift_var; - } - else - return cost->shift_var; - } - return cost->shift_const; -} - -/* Compute a (partial) cost for rtx X. 
Return true if the complete - cost has been computed, and false if subexpressions should be - scanned. In either case, *TOTAL contains the cost result. */ - -static bool -ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, - int *total, bool speed) -{ - rtx mask; - enum rtx_code code = GET_CODE (x); - enum rtx_code outer_code = (enum rtx_code) outer_code_i; - const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost; - int src_cost; - - switch (code) - { - case SET: - if (register_operand (SET_DEST (x), VOIDmode) - && register_operand (SET_SRC (x), VOIDmode)) - { - *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x))); - return true; - } - - if (register_operand (SET_SRC (x), VOIDmode)) - /* Avoid potentially incorrect high cost from rtx_costs - for non-tieable SUBREGs. */ - src_cost = 0; - else - { - src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed); - - if (CONSTANT_P (SET_SRC (x))) - /* Constant costs assume a base value of COSTS_N_INSNS (1) and add - a small value, possibly zero for cheap constants. */ - src_cost += COSTS_N_INSNS (1); - } - - *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed); - return true; - - case CONST_INT: - case CONST: - case LABEL_REF: - case SYMBOL_REF: - if (x86_64_immediate_operand (x, VOIDmode)) - *total = 0; - else - *total = 1; - return true; - - case CONST_DOUBLE: - if (IS_STACK_MODE (mode)) - switch (standard_80387_constant_p (x)) - { - case -1: - case 0: - break; - case 1: /* 0.0 */ - *total = 1; - return true; - default: /* Other constants */ - *total = 2; - return true; - } - /* FALLTHRU */ - - case CONST_VECTOR: - switch (standard_sse_constant_p (x, mode)) - { - case 0: - break; - case 1: /* 0: xor eliminates false dependency */ - *total = 0; - return true; - default: /* -1: cmp contains false dependency */ - *total = 1; - return true; - } - /* FALLTHRU */ - - case CONST_WIDE_INT: - /* Fall back to (MEM (SYMBOL_REF)), since that's where - it'll probably end up. Add a penalty for size. */ - *total = (COSTS_N_INSNS (1) - + (!TARGET_64BIT && flag_pic) - + (GET_MODE_SIZE (mode) <= 4 - ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2)); - return true; - - case ZERO_EXTEND: - /* The zero extensions is often completely free on x86_64, so make - it as cheap as possible. */ - if (TARGET_64BIT && mode == DImode - && GET_MODE (XEXP (x, 0)) == SImode) - *total = 1; - else if (TARGET_ZERO_EXTEND_WITH_AND) - *total = cost->add; - else - *total = cost->movzx; - return false; - - case SIGN_EXTEND: - *total = cost->movsx; - return false; - - case ASHIFT: - if (SCALAR_INT_MODE_P (mode) - && GET_MODE_SIZE (mode) < UNITS_PER_WORD - && CONST_INT_P (XEXP (x, 1))) - { - HOST_WIDE_INT value = INTVAL (XEXP (x, 1)); - if (value == 1) - { - *total = cost->add; - return false; - } - if ((value == 2 || value == 3) - && cost->lea <= cost->shift_const) - { - *total = cost->lea; - return false; - } - } - /* FALLTHRU */ - - case ROTATE: - case ASHIFTRT: - case LSHIFTRT: - case ROTATERT: - bool skip_op0, skip_op1; - *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)), - CONST_INT_P (XEXP (x, 1)) - ? 
INTVAL (XEXP (x, 1)) : -1, - speed, - GET_CODE (XEXP (x, 1)) == AND, - SUBREG_P (XEXP (x, 1)) - && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND, - &skip_op0, &skip_op1); - if (skip_op0 || skip_op1) - { - if (!skip_op0) - *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); - if (!skip_op1) - *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed); - return true; - } - return false; - - case FMA: - { - rtx sub; - - gcc_assert (FLOAT_MODE_P (mode)); - gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F); - - *total = ix86_vec_cost (mode, - GET_MODE_INNER (mode) == SFmode - ? cost->fmass : cost->fmasd); - *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed); - - /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */ - sub = XEXP (x, 0); - if (GET_CODE (sub) == NEG) - sub = XEXP (sub, 0); - *total += rtx_cost (sub, mode, FMA, 0, speed); - - sub = XEXP (x, 2); - if (GET_CODE (sub) == NEG) - sub = XEXP (sub, 0); - *total += rtx_cost (sub, mode, FMA, 2, speed); - return true; - } - - case MULT: - if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode)) - { - rtx op0 = XEXP (x, 0); - rtx op1 = XEXP (x, 1); - int nbits; - if (CONST_INT_P (XEXP (x, 1))) - { - unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1)); - for (nbits = 0; value != 0; value &= value - 1) - nbits++; - } - else - /* This is arbitrary. */ - nbits = 7; - - /* Compute costs correctly for widening multiplication. */ - if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND) - && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2 - == GET_MODE_SIZE (mode)) - { - int is_mulwiden = 0; - machine_mode inner_mode = GET_MODE (op0); - - if (GET_CODE (op0) == GET_CODE (op1)) - is_mulwiden = 1, op1 = XEXP (op1, 0); - else if (CONST_INT_P (op1)) - { - if (GET_CODE (op0) == SIGN_EXTEND) - is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode) - == INTVAL (op1); - else - is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode)); - } - - if (is_mulwiden) - op0 = XEXP (op0, 0), mode = GET_MODE (op0); - } - - *total = (cost->mult_init[MODE_INDEX (mode)] - + nbits * cost->mult_bit - + rtx_cost (op0, mode, outer_code, opno, speed) - + rtx_cost (op1, mode, outer_code, opno, speed)); - - return true; - } - *total = ix86_multiplication_cost (cost, mode); - return false; - - case DIV: - case UDIV: - case MOD: - case UMOD: - *total = ix86_division_cost (cost, mode); - return false; - - case PLUS: - if (GET_MODE_CLASS (mode) == MODE_INT - && GET_MODE_SIZE (mode) <= UNITS_PER_WORD) - { - if (GET_CODE (XEXP (x, 0)) == PLUS - && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT - && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1)) - && CONSTANT_P (XEXP (x, 1))) - { - HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1)); - if (val == 2 || val == 4 || val == 8) - { - *total = cost->lea; - *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode, - outer_code, opno, speed); - *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode, - outer_code, opno, speed); - *total += rtx_cost (XEXP (x, 1), mode, - outer_code, opno, speed); - return true; - } - } - else if (GET_CODE (XEXP (x, 0)) == MULT - && CONST_INT_P (XEXP (XEXP (x, 0), 1))) - { - HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1)); - if (val == 2 || val == 4 || val == 8) - { - *total = cost->lea; - *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, - outer_code, opno, speed); - *total += rtx_cost (XEXP (x, 1), mode, - outer_code, opno, speed); - return true; - } - } - else if (GET_CODE (XEXP (x, 0)) == PLUS) - { - /* Add with carry, ignore the cost of adding a carry flag. 
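
For a multiply by a constant, the MULT case above prices the operation by the number of set bits in the multiplier, counted with the classic value &= value - 1 loop.  A standalone sketch, separate from the patch; mult_init and mult_bit below are hypothetical values.

  #include <stdio.h>

  /* Price a multiply by a constant as a base cost plus a per-set-bit
     cost, counting bits the same way the MULT case above does.  */
  static int
  const_mult_cost (unsigned long long multiplier, int mult_init,
                   int mult_bit)
  {
    int nbits = 0;
    for (; multiplier != 0; multiplier &= multiplier - 1)
      nbits++;
    return mult_init + nbits * mult_bit;
  }

  int
  main (void)
  {
    /* x * 10 (two set bits) vs. x * 255 (eight set bits).  */
    printf ("%d vs. %d\n",
            const_mult_cost (10, 4, 1), const_mult_cost (255, 4, 1));
    return 0;
  }
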
*/ - if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode)) - *total = cost->add; - else - { - *total = cost->lea; - *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, - outer_code, opno, speed); - } - - *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode, - outer_code, opno, speed); - *total += rtx_cost (XEXP (x, 1), mode, - outer_code, opno, speed); - return true; - } - } - /* FALLTHRU */ - - case MINUS: - /* Subtract with borrow, ignore the cost of subtracting a carry flag. */ - if (GET_MODE_CLASS (mode) == MODE_INT - && GET_MODE_SIZE (mode) <= UNITS_PER_WORD - && GET_CODE (XEXP (x, 0)) == MINUS - && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode)) - { - *total = cost->add; - *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, - outer_code, opno, speed); - *total += rtx_cost (XEXP (x, 1), mode, - outer_code, opno, speed); - return true; - } - - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - { - *total = cost->addss; - return false; - } - else if (X87_FLOAT_MODE_P (mode)) - { - *total = cost->fadd; - return false; - } - else if (FLOAT_MODE_P (mode)) - { - *total = ix86_vec_cost (mode, cost->addss); - return false; - } - /* FALLTHRU */ - - case AND: - case IOR: - case XOR: - if (GET_MODE_CLASS (mode) == MODE_INT - && GET_MODE_SIZE (mode) > UNITS_PER_WORD) - { - *total = (cost->add * 2 - + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed) - << (GET_MODE (XEXP (x, 0)) != DImode)) - + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed) - << (GET_MODE (XEXP (x, 1)) != DImode))); - return true; - } - /* FALLTHRU */ - - case NEG: - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - { - *total = cost->sse_op; - return false; - } - else if (X87_FLOAT_MODE_P (mode)) - { - *total = cost->fchs; - return false; - } - else if (FLOAT_MODE_P (mode)) - { - *total = ix86_vec_cost (mode, cost->sse_op); - return false; - } - /* FALLTHRU */ - - case NOT: - if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) - *total = ix86_vec_cost (mode, cost->sse_op); - else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) - *total = cost->add * 2; - else - *total = cost->add; - return false; - - case COMPARE: - if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT - && XEXP (XEXP (x, 0), 1) == const1_rtx - && CONST_INT_P (XEXP (XEXP (x, 0), 2)) - && XEXP (x, 1) == const0_rtx) - { - /* This kind of construct is implemented using test[bwl]. - Treat it as if we had an AND. */ - mode = GET_MODE (XEXP (XEXP (x, 0), 0)); - *total = (cost->add - + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code, - opno, speed) - + rtx_cost (const1_rtx, mode, outer_code, opno, speed)); - return true; - } - - /* The embedded comparison operand is completely free. */ - if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0))) - && XEXP (x, 1) == const0_rtx) - *total = 0; - - return false; - - case FLOAT_EXTEND: - if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)) - *total = 0; - else - *total = ix86_vec_cost (mode, cost->addss); - return false; - - case FLOAT_TRUNCATE: - if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)) - *total = cost->fadd; - else - *total = ix86_vec_cost (mode, cost->addss); - return false; - - case ABS: - /* SSE requires memory load for the constant operand. It may make - sense to account for this. Of course the constant operand may or - may not be reused. 
*/ - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - *total = cost->sse_op; - else if (X87_FLOAT_MODE_P (mode)) - *total = cost->fabs; - else if (FLOAT_MODE_P (mode)) - *total = ix86_vec_cost (mode, cost->sse_op); - return false; - - case SQRT: - if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) - *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd; - else if (X87_FLOAT_MODE_P (mode)) - *total = cost->fsqrt; - else if (FLOAT_MODE_P (mode)) - *total = ix86_vec_cost (mode, - mode == SFmode ? cost->sqrtss : cost->sqrtsd); - return false; - - case UNSPEC: - if (XINT (x, 1) == UNSPEC_TP) - *total = 0; - return false; - - case VEC_SELECT: - case VEC_CONCAT: - case VEC_DUPLICATE: - /* ??? Assume all of these vector manipulation patterns are - recognizable. In which case they all pretty much have the - same cost. */ - *total = cost->sse_op; - return true; - case VEC_MERGE: - mask = XEXP (x, 2); - /* This is masked instruction, assume the same cost, - as nonmasked variant. */ - if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask))) - *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed); - else - *total = cost->sse_op; - return true; - - default: - return false; - } -} - -#if TARGET_MACHO - -static int current_machopic_label_num; - -/* Given a symbol name and its associated stub, write out the - definition of the stub. */ - -void -machopic_output_stub (FILE *file, const char *symb, const char *stub) -{ - unsigned int length; - char *binder_name, *symbol_name, lazy_ptr_name[32]; - int label = ++current_machopic_label_num; - - /* For 64-bit we shouldn't get here. */ - gcc_assert (!TARGET_64BIT); - - /* Lose our funky encoding stuff so it doesn't contaminate the stub. */ - symb = targetm.strip_name_encoding (symb); - - length = strlen (stub); - binder_name = XALLOCAVEC (char, length + 32); - GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length); - - length = strlen (symb); - symbol_name = XALLOCAVEC (char, length + 32); - GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length); - - sprintf (lazy_ptr_name, "L%d$lz", label); - - if (MACHOPIC_ATT_STUB) - switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]); - else if (MACHOPIC_PURE) - switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]); - else - switch_to_section (darwin_sections[machopic_symbol_stub_section]); - - fprintf (file, "%s:\n", stub); - fprintf (file, "\t.indirect_symbol %s\n", symbol_name); - - if (MACHOPIC_ATT_STUB) - { - fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n"); - } - else if (MACHOPIC_PURE) - { - /* PIC stub. */ - /* 25-byte PIC stub using "CALL get_pc_thunk". */ - rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */); - output_set_got (tmp, NULL_RTX); /* "CALL ___.get_pc_thunk.cx". */ - fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n", - label, lazy_ptr_name, label); - fprintf (file, "\tjmp\t*%%ecx\n"); - } - else - fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name); - - /* The AT&T-style ("self-modifying") stub is not lazily bound, thus - it needs no stub-binding-helper. */ - if (MACHOPIC_ATT_STUB) - return; - - fprintf (file, "%s:\n", binder_name); - - if (MACHOPIC_PURE) - { - fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name); - fprintf (file, "\tpushl\t%%ecx\n"); - } - else - fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name); - - fputs ("\tjmp\tdyld_stub_binding_helper\n", file); - - /* N.B. 
Keep the correspondence of these - 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the - old-pic/new-pic/non-pic stubs; altering this will break - compatibility with existing dylibs. */ - if (MACHOPIC_PURE) - { - /* 25-byte PIC stub using "CALL get_pc_thunk". */ - switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]); - } - else - /* 16-byte -mdynamic-no-pic stub. */ - switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]); - - fprintf (file, "%s:\n", lazy_ptr_name); - fprintf (file, "\t.indirect_symbol %s\n", symbol_name); - fprintf (file, ASM_LONG "%s\n", binder_name); -} -#endif /* TARGET_MACHO */ - -/* Order the registers for register allocator. */ - -void -x86_order_regs_for_local_alloc (void) -{ - int pos = 0; - int i; - - /* First allocate the local general purpose registers. */ - for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) - if (GENERAL_REGNO_P (i) && call_used_regs[i]) - reg_alloc_order [pos++] = i; - - /* Global general purpose registers. */ - for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) - if (GENERAL_REGNO_P (i) && !call_used_regs[i]) - reg_alloc_order [pos++] = i; - - /* x87 registers come first in case we are doing FP math - using them. */ - if (!TARGET_SSE_MATH) - for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++) - reg_alloc_order [pos++] = i; - - /* SSE registers. */ - for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++) - reg_alloc_order [pos++] = i; - for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++) - reg_alloc_order [pos++] = i; - - /* Extended REX SSE registers. */ - for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++) - reg_alloc_order [pos++] = i; - - /* Mask register. */ - for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++) - reg_alloc_order [pos++] = i; - - /* x87 registers. */ - if (TARGET_SSE_MATH) - for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++) - reg_alloc_order [pos++] = i; - - for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++) - reg_alloc_order [pos++] = i; - - /* Initialize the rest of array as we do not allocate some registers - at all. */ - while (pos < FIRST_PSEUDO_REGISTER) - reg_alloc_order [pos++] = 0; -} - -/* Handle a "callee_pop_aggregate_return" attribute; arguments as - in struct attribute_spec handler. */ -static tree -ix86_handle_callee_pop_aggregate_return (tree *node, tree name, tree args, int, - bool *no_add_attrs) -{ - if (TREE_CODE (*node) != FUNCTION_TYPE - && TREE_CODE (*node) != METHOD_TYPE - && TREE_CODE (*node) != FIELD_DECL - && TREE_CODE (*node) != TYPE_DECL) - { - warning (OPT_Wattributes, "%qE attribute only applies to functions", - name); - *no_add_attrs = true; - return NULL_TREE; - } - if (TARGET_64BIT) - { - warning (OPT_Wattributes, "%qE attribute only available for 32-bit", - name); - *no_add_attrs = true; - return NULL_TREE; - } - if (is_attribute_p ("callee_pop_aggregate_return", name)) - { - tree cst; - - cst = TREE_VALUE (args); - if (TREE_CODE (cst) != INTEGER_CST) - { - warning (OPT_Wattributes, - "%qE attribute requires an integer constant argument", - name); - *no_add_attrs = true; - } - else if (compare_tree_int (cst, 0) != 0 - && compare_tree_int (cst, 1) != 0) - { - warning (OPT_Wattributes, - "argument to %qE attribute is neither zero, nor one", - name); - *no_add_attrs = true; - } - - return NULL_TREE; - } - - return NULL_TREE; -} - -/* Handle a "ms_abi" or "sysv" attribute; arguments as in - struct attribute_spec.handler. 
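
The handler that follows backs the ms_abi and sysv_abi function attributes, which select the calling convention per function on x86-64 and, as it enforces, cannot be combined.  A short usage sketch, separate from the patch.

  /* x86-64 only: per-function calling-convention selection.  */
  __attribute__((ms_abi)) int
  win64_style_add (int a, int b)
  {
    return a + b;
  }

  __attribute__((sysv_abi)) int
  sysv_style_add (int a, int b)
  {
    return a + b;
  }

  int
  main (void)
  {
    /* GCC emits the right argument-register mapping at each call site.  */
    return win64_style_add (1, 2) + sysv_style_add (3, 4) == 10 ? 0 : 1;
  }

  /* Putting both attributes on one declaration is rejected by the
     handler: "ms_abi and sysv_abi attributes are not compatible".  */
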
*/ -static tree -ix86_handle_abi_attribute (tree *node, tree name, tree, int, - bool *no_add_attrs) -{ - if (TREE_CODE (*node) != FUNCTION_TYPE - && TREE_CODE (*node) != METHOD_TYPE - && TREE_CODE (*node) != FIELD_DECL - && TREE_CODE (*node) != TYPE_DECL) - { - warning (OPT_Wattributes, "%qE attribute only applies to functions", - name); - *no_add_attrs = true; - return NULL_TREE; - } - - /* Can combine regparm with all attributes but fastcall. */ - if (is_attribute_p ("ms_abi", name)) - { - if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node))) - { - error ("ms_abi and sysv_abi attributes are not compatible"); - } - - return NULL_TREE; - } - else if (is_attribute_p ("sysv_abi", name)) - { - if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node))) - { - error ("ms_abi and sysv_abi attributes are not compatible"); - } - - return NULL_TREE; - } - - return NULL_TREE; -} - -/* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in - struct attribute_spec.handler. */ -static tree -ix86_handle_struct_attribute (tree *node, tree name, tree, int, - bool *no_add_attrs) -{ - tree *type = NULL; - if (DECL_P (*node)) - { - if (TREE_CODE (*node) == TYPE_DECL) - type = &TREE_TYPE (*node); - } - else - type = node; - - if (!(type && RECORD_OR_UNION_TYPE_P (*type))) - { - warning (OPT_Wattributes, "%qE attribute ignored", - name); - *no_add_attrs = true; - } - - else if ((is_attribute_p ("ms_struct", name) - && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type))) - || ((is_attribute_p ("gcc_struct", name) - && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type))))) - { - warning (OPT_Wattributes, "%qE incompatible attribute ignored", - name); - *no_add_attrs = true; - } - - return NULL_TREE; -} - -static tree -ix86_handle_fndecl_attribute (tree *node, tree name, tree args, int, - bool *no_add_attrs) -{ - if (TREE_CODE (*node) != FUNCTION_DECL) - { - warning (OPT_Wattributes, "%qE attribute only applies to functions", - name); - *no_add_attrs = true; - } - - if (is_attribute_p ("indirect_branch", name)) - { - tree cst = TREE_VALUE (args); - if (TREE_CODE (cst) != STRING_CST) - { - warning (OPT_Wattributes, - "%qE attribute requires a string constant argument", - name); - *no_add_attrs = true; - } - else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0 - && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0 - && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0 - && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0) - { - warning (OPT_Wattributes, - "argument to %qE attribute is not " - "(keep|thunk|thunk-inline|thunk-extern)", name); - *no_add_attrs = true; - } - } - - if (is_attribute_p ("function_return", name)) - { - tree cst = TREE_VALUE (args); - if (TREE_CODE (cst) != STRING_CST) - { - warning (OPT_Wattributes, - "%qE attribute requires a string constant argument", - name); - *no_add_attrs = true; - } - else if (strcmp (TREE_STRING_POINTER (cst), "keep") != 0 - && strcmp (TREE_STRING_POINTER (cst), "thunk") != 0 - && strcmp (TREE_STRING_POINTER (cst), "thunk-inline") != 0 - && strcmp (TREE_STRING_POINTER (cst), "thunk-extern") != 0) - { - warning (OPT_Wattributes, - "argument to %qE attribute is not " - "(keep|thunk|thunk-inline|thunk-extern)", name); - *no_add_attrs = true; - } - } - - return NULL_TREE; -} - -static tree -ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree, - int, bool *) -{ - return NULL_TREE; -} - -static tree -ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *) -{ - /* DECL_RESULT and DECL_ARGUMENTS do not 
exist there yet, - but the function type contains args and return type data. */ - tree func_type = *node; - tree return_type = TREE_TYPE (func_type); - - int nargs = 0; - tree current_arg_type = TYPE_ARG_TYPES (func_type); - while (current_arg_type - && ! VOID_TYPE_P (TREE_VALUE (current_arg_type))) - { - if (nargs == 0) - { - if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type))) - error ("interrupt service routine should have a pointer " - "as the first argument"); - } - else if (nargs == 1) - { - if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE - || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode) - error ("interrupt service routine should have %qs " - "as the second argument", - TARGET_64BIT - ? (TARGET_X32 ? "unsigned long long int" - : "unsigned long int") - : "unsigned int"); - } - nargs++; - current_arg_type = TREE_CHAIN (current_arg_type); - } - if (!nargs || nargs > 2) - error ("interrupt service routine can only have a pointer argument " - "and an optional integer argument"); - if (! VOID_TYPE_P (return_type)) - error ("interrupt service routine can%'t have non-void return value"); - - return NULL_TREE; -} - -static bool -ix86_ms_bitfield_layout_p (const_tree record_type) -{ - return ((TARGET_MS_BITFIELD_LAYOUT - && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type))) - || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type))); -} - -/* Returns an expression indicating where the this parameter is - located on entry to the FUNCTION. */ - -static rtx -x86_this_parameter (tree function) -{ - tree type = TREE_TYPE (function); - bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0; - int nregs; - - if (TARGET_64BIT) - { - const int *parm_regs; - - if (ix86_function_type_abi (type) == MS_ABI) - parm_regs = x86_64_ms_abi_int_parameter_registers; - else - parm_regs = x86_64_int_parameter_registers; - return gen_rtx_REG (Pmode, parm_regs[aggr]); - } - - nregs = ix86_function_regparm (type, function); - - if (nregs > 0 && !stdarg_p (type)) - { - int regno; - unsigned int ccvt = ix86_get_callcvt (type); - - if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) - regno = aggr ? DX_REG : CX_REG; - else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) - { - regno = CX_REG; - if (aggr) - return gen_rtx_MEM (SImode, - plus_constant (Pmode, stack_pointer_rtx, 4)); - } - else - { - regno = AX_REG; - if (aggr) - { - regno = DX_REG; - if (nregs == 1) - return gen_rtx_MEM (SImode, - plus_constant (Pmode, - stack_pointer_rtx, 4)); - } - } - return gen_rtx_REG (SImode, regno); - } - - return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx, - aggr ? 8 : 4)); -} - -/* Determine whether x86_output_mi_thunk can succeed. */ - -static bool -x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset, - const_tree function) -{ - /* 64-bit can handle anything. */ - if (TARGET_64BIT) - return true; - - /* For 32-bit, everything's fine if we have one free register. */ - if (ix86_function_regparm (TREE_TYPE (function), function) < 3) - return true; - - /* Need a free register for vcall_offset. */ - if (vcall_offset) - return false; - - /* Need a free register for GOT references. */ - if (flag_pic && !targetm.binds_local_p (function)) - return false; - - /* Otherwise ok. */ - return true; -} - -/* Output the assembler code for a thunk function. THUNK_DECL is the - declaration for the thunk function itself, FUNCTION is the decl for - the target function. DELTA is an immediate constant offset to be - added to THIS. 
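
On the user side, ix86_handle_interrupt_attribute above corresponds to interrupt service routines written like the sketch below: a pointer to an opaque frame, an optional word-sized error code, and a void return.  The sketch is separate from the patch and assumes a 64-bit, non-x32 target, matching the "unsigned long int" wording of the diagnostic.

  /* Interrupt handlers as the checks above expect them.  The frame type
     only needs to be declared, not defined.  */
  struct interrupt_frame;

  __attribute__((interrupt)) void
  isr_without_error_code (struct interrupt_frame *frame)
  {
    (void) frame;
  }

  __attribute__((interrupt)) void
  isr_with_error_code (struct interrupt_frame *frame, unsigned long int error)
  {
    (void) frame;
    (void) error;
  }
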
If VCALL_OFFSET is nonzero, the word at - *(*this + vcall_offset) should be added to THIS. */ - -static void -x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta, - HOST_WIDE_INT vcall_offset, tree function) -{ - rtx this_param = x86_this_parameter (function); - rtx this_reg, tmp, fnaddr; - unsigned int tmp_regno; - rtx_insn *insn; - - if (TARGET_64BIT) - tmp_regno = R10_REG; - else - { - unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function)); - if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) - tmp_regno = AX_REG; - else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) - tmp_regno = DX_REG; - else - tmp_regno = CX_REG; - } - - emit_note (NOTE_INSN_PROLOGUE_END); - - /* CET is enabled, insert EB instruction. */ - if ((flag_cf_protection & CF_BRANCH)) - emit_insn (gen_nop_endbr ()); - - /* If VCALL_OFFSET, we'll need THIS in a register. Might as well - pull it in now and let DELTA benefit. */ - if (REG_P (this_param)) - this_reg = this_param; - else if (vcall_offset) - { - /* Put the this parameter into %eax. */ - this_reg = gen_rtx_REG (Pmode, AX_REG); - emit_move_insn (this_reg, this_param); - } - else - this_reg = NULL_RTX; - - /* Adjust the this parameter by a fixed constant. */ - if (delta) - { - rtx delta_rtx = GEN_INT (delta); - rtx delta_dst = this_reg ? this_reg : this_param; - - if (TARGET_64BIT) - { - if (!x86_64_general_operand (delta_rtx, Pmode)) - { - tmp = gen_rtx_REG (Pmode, tmp_regno); - emit_move_insn (tmp, delta_rtx); - delta_rtx = tmp; - } - } - - ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx); - } - - /* Adjust the this parameter by a value stored in the vtable. */ - if (vcall_offset) - { - rtx vcall_addr, vcall_mem, this_mem; - - tmp = gen_rtx_REG (Pmode, tmp_regno); - - this_mem = gen_rtx_MEM (ptr_mode, this_reg); - if (Pmode != ptr_mode) - this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem); - emit_move_insn (tmp, this_mem); - - /* Adjust the this parameter. */ - vcall_addr = plus_constant (Pmode, tmp, vcall_offset); - if (TARGET_64BIT - && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true)) - { - rtx tmp2 = gen_rtx_REG (Pmode, R11_REG); - emit_move_insn (tmp2, GEN_INT (vcall_offset)); - vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2); - } - - vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr); - if (Pmode != ptr_mode) - emit_insn (gen_addsi_1_zext (this_reg, - gen_rtx_REG (ptr_mode, - REGNO (this_reg)), - vcall_mem)); - else - ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem); - } - - /* If necessary, drop THIS back to its stack slot. 
*/ - if (this_reg && this_reg != this_param) - emit_move_insn (this_param, this_reg); - - fnaddr = XEXP (DECL_RTL (function), 0); - if (TARGET_64BIT) - { - if (!flag_pic || targetm.binds_local_p (function) - || TARGET_PECOFF) - ; - else - { - tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL); - tmp = gen_rtx_CONST (Pmode, tmp); - fnaddr = gen_const_mem (Pmode, tmp); - } - } - else - { - if (!flag_pic || targetm.binds_local_p (function)) - ; -#if TARGET_MACHO - else if (TARGET_MACHO) - { - fnaddr = machopic_indirect_call_target (DECL_RTL (function)); - fnaddr = XEXP (fnaddr, 0); - } -#endif /* TARGET_MACHO */ - else - { - tmp = gen_rtx_REG (Pmode, CX_REG); - output_set_got (tmp, NULL_RTX); - - fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT); - fnaddr = gen_rtx_CONST (Pmode, fnaddr); - fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr); - fnaddr = gen_const_mem (Pmode, fnaddr); - } - } - - /* Our sibling call patterns do not allow memories, because we have no - predicate that can distinguish between frame and non-frame memory. - For our purposes here, we can get away with (ab)using a jump pattern, - because we're going to do no optimization. */ - if (MEM_P (fnaddr)) - { - if (sibcall_insn_operand (fnaddr, word_mode)) - { - fnaddr = XEXP (DECL_RTL (function), 0); - tmp = gen_rtx_MEM (QImode, fnaddr); - tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx); - tmp = emit_call_insn (tmp); - SIBLING_CALL_P (tmp) = 1; - } - else - emit_jump_insn (gen_indirect_jump (fnaddr)); - } - else - { - if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr)) - { - // CM_LARGE_PIC always uses pseudo PIC register which is - // uninitialized. Since FUNCTION is local and calling it - // doesn't go through PLT, we use scratch register %r11 as - // PIC register and initialize it here. - pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG); - ix86_init_large_pic_reg (tmp_regno); - fnaddr = legitimize_pic_address (fnaddr, - gen_rtx_REG (Pmode, tmp_regno)); - } - - if (!sibcall_insn_operand (fnaddr, word_mode)) - { - tmp = gen_rtx_REG (word_mode, tmp_regno); - if (GET_MODE (fnaddr) != word_mode) - fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr); - emit_move_insn (tmp, fnaddr); - fnaddr = tmp; - } - - tmp = gen_rtx_MEM (QImode, fnaddr); - tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx); - tmp = emit_call_insn (tmp); - SIBLING_CALL_P (tmp) = 1; - } - emit_barrier (); - - /* Emit just enough of rest_of_compilation to get the insns emitted. - Note that use_thunk calls assemble_start_function et al. 
*/ - insn = get_insns (); - shorten_branches (insn); - final_start_function (insn, file, 1); - final (insn, file, 1); - final_end_function (); -} - -static void -x86_file_start (void) -{ - default_file_start (); - if (TARGET_16BIT) - fputs ("\t.code16gcc\n", asm_out_file); -#if TARGET_MACHO - darwin_file_start (); -#endif - if (X86_FILE_START_VERSION_DIRECTIVE) - fputs ("\t.version\t\"01.01\"\n", asm_out_file); - if (X86_FILE_START_FLTUSED) - fputs ("\t.global\t__fltused\n", asm_out_file); - if (ix86_asm_dialect == ASM_INTEL) - fputs ("\t.intel_syntax noprefix\n", asm_out_file); -} - -int -x86_field_alignment (tree type, int computed) -{ - machine_mode mode; - - if (TARGET_64BIT || TARGET_ALIGN_DOUBLE) - return computed; - if (TARGET_IAMCU) - return iamcu_alignment (type, computed); - mode = TYPE_MODE (strip_array_types (type)); - if (mode == DFmode || mode == DCmode - || GET_MODE_CLASS (mode) == MODE_INT - || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT) - return MIN (32, computed); - return computed; -} - -/* Print call to TARGET to FILE. */ - -static void -x86_print_call_or_nop (FILE *file, const char *target) -{ - if (flag_nop_mcount || !strcmp (target, "nop")) - /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */ - fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n"); - else - fprintf (file, "1:\tcall\t%s\n", target); -} - -static bool -current_fentry_name (const char **name) -{ - tree attr = lookup_attribute ("fentry_name", - DECL_ATTRIBUTES (current_function_decl)); - if (!attr) - return false; - *name = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr))); - return true; -} - -static bool -current_fentry_section (const char **name) -{ - tree attr = lookup_attribute ("fentry_section", - DECL_ATTRIBUTES (current_function_decl)); - if (!attr) - return false; - *name = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr))); - return true; -} - -/* Output assembler code to FILE to increment profiler label # LABELNO - for profiling a function entry. */ -void -x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) -{ - if (cfun->machine->endbr_queued_at_entrance) - fprintf (file, "\t%s\n", TARGET_64BIT ? "endbr64" : "endbr32"); - - const char *mcount_name = MCOUNT_NAME; - - if (current_fentry_name (&mcount_name)) - ; - else if (fentry_name) - mcount_name = fentry_name; - else if (flag_fentry) - mcount_name = MCOUNT_NAME_BEFORE_PROLOGUE; - - if (TARGET_64BIT) - { -#ifndef NO_PROFILE_COUNTERS - fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno); -#endif - - if (!TARGET_PECOFF && flag_pic) - fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name); - else - x86_print_call_or_nop (file, mcount_name); - } - else if (flag_pic) - { -#ifndef NO_PROFILE_COUNTERS - fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n", - LPREFIX, labelno); -#endif - fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name); - } - else - { -#ifndef NO_PROFILE_COUNTERS - fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n", - LPREFIX, labelno); -#endif - x86_print_call_or_nop (file, mcount_name); - } - - if (flag_record_mcount - || lookup_attribute ("fentry_section", - DECL_ATTRIBUTES (current_function_decl))) - { - const char *sname = "__mcount_loc"; - - if (current_fentry_section (&sname)) - ; - else if (fentry_section) - sname = fentry_section; - - fprintf (file, "\t.section %s, \"a\",@progbits\n", sname); - fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? 
"quad" : "long"); - fprintf (file, "\t.previous\n"); - } -} - -/* We don't have exact information about the insn sizes, but we may assume - quite safely that we are informed about all 1 byte insns and memory - address sizes. This is enough to eliminate unnecessary padding in - 99% of cases. */ - -int -ix86_min_insn_size (rtx_insn *insn) -{ - int l = 0, len; - - if (!INSN_P (insn) || !active_insn_p (insn)) - return 0; - - /* Discard alignments we've emit and jump instructions. */ - if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE - && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN) - return 0; - - /* Important case - calls are always 5 bytes. - It is common to have many calls in the row. */ - if (CALL_P (insn) - && symbolic_reference_mentioned_p (PATTERN (insn)) - && !SIBLING_CALL_P (insn)) - return 5; - len = get_attr_length (insn); - if (len <= 1) - return 1; - - /* For normal instructions we rely on get_attr_length being exact, - with a few exceptions. */ - if (!JUMP_P (insn)) - { - enum attr_type type = get_attr_type (insn); - - switch (type) - { - case TYPE_MULTI: - if (GET_CODE (PATTERN (insn)) == ASM_INPUT - || asm_noperands (PATTERN (insn)) >= 0) - return 0; - break; - case TYPE_OTHER: - case TYPE_FCMP: - break; - default: - /* Otherwise trust get_attr_length. */ - return len; - } - - l = get_attr_length_address (insn); - if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn))) - l = 4; - } - if (l) - return 1+l; - else - return 2; -} - -#ifdef ASM_OUTPUT_MAX_SKIP_PAD - -/* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte - window. */ - -static void -ix86_avoid_jump_mispredicts (void) -{ - rtx_insn *insn, *start = get_insns (); - int nbytes = 0, njumps = 0; - bool isjump = false; - - /* Look for all minimal intervals of instructions containing 4 jumps. - The intervals are bounded by START and INSN. NBYTES is the total - size of instructions in the interval including INSN and not including - START. When the NBYTES is smaller than 16 bytes, it is possible - that the end of START and INSN ends up in the same 16byte page. - - The smallest offset in the page INSN can start is the case where START - ends on the offset 0. Offset of INSN is then NBYTES - sizeof (INSN). - We add p2align to 16byte window with maxskip 15 - NBYTES + sizeof (INSN). - - Don't consider asm goto as jump, while it can contain a jump, it doesn't - have to, control transfer to label(s) can be performed through other - means, and also we estimate minimum length of all asm stmts as 0. */ - for (insn = start; insn; insn = NEXT_INSN (insn)) - { - int min_size; - - if (LABEL_P (insn)) - { - align_flags alignment = label_to_alignment (insn); - int align = alignment.levels[0].log; - int max_skip = alignment.levels[0].maxskip; - - if (max_skip > 15) - max_skip = 15; - /* If align > 3, only up to 16 - max_skip - 1 bytes can be - already in the current 16 byte page, because otherwise - ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer - bytes to reach 16 byte boundary. 
*/ - if (align <= 0 - || (align <= 3 && max_skip != (1 << align) - 1)) - max_skip = 0; - if (dump_file) - fprintf (dump_file, "Label %i with max_skip %i\n", - INSN_UID (insn), max_skip); - if (max_skip) - { - while (nbytes + max_skip >= 16) - { - start = NEXT_INSN (start); - if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0) - || CALL_P (start)) - njumps--, isjump = true; - else - isjump = false; - nbytes -= ix86_min_insn_size (start); - } - } - continue; - } - - min_size = ix86_min_insn_size (insn); - nbytes += min_size; - if (dump_file) - fprintf (dump_file, "Insn %i estimated to %i bytes\n", - INSN_UID (insn), min_size); - if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0) - || CALL_P (insn)) - njumps++; - else - continue; - - while (njumps > 3) - { - start = NEXT_INSN (start); - if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0) - || CALL_P (start)) - njumps--, isjump = true; - else - isjump = false; - nbytes -= ix86_min_insn_size (start); - } - gcc_assert (njumps >= 0); - if (dump_file) - fprintf (dump_file, "Interval %i to %i has %i bytes\n", - INSN_UID (start), INSN_UID (insn), nbytes); - - if (njumps == 3 && isjump && nbytes < 16) - { - int padsize = 15 - nbytes + ix86_min_insn_size (insn); - - if (dump_file) - fprintf (dump_file, "Padding insn %i by %i bytes!\n", - INSN_UID (insn), padsize); - emit_insn_before (gen_pad (GEN_INT (padsize)), insn); - } - } -} -#endif - -/* AMD Athlon works faster - when RET is not destination of conditional jump or directly preceded - by other jump instruction. We avoid the penalty by inserting NOP just - before the RET instructions in such cases. */ -static void -ix86_pad_returns (void) -{ - edge e; - edge_iterator ei; - - FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) - { - basic_block bb = e->src; - rtx_insn *ret = BB_END (bb); - rtx_insn *prev; - bool replace = false; - - if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret)) - || optimize_bb_for_size_p (bb)) - continue; - for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev)) - if (active_insn_p (prev) || LABEL_P (prev)) - break; - if (prev && LABEL_P (prev)) - { - edge e; - edge_iterator ei; - - FOR_EACH_EDGE (e, ei, bb->preds) - if (EDGE_FREQUENCY (e) && e->src->index >= 0 - && !(e->flags & EDGE_FALLTHRU)) - { - replace = true; - break; - } - } - if (!replace) - { - prev = prev_active_insn (ret); - if (prev - && ((JUMP_P (prev) && any_condjump_p (prev)) - || CALL_P (prev))) - replace = true; - /* Empty functions get branch mispredict even when - the jump destination is not visible to us. */ - if (!prev && !optimize_function_for_size_p (cfun)) - replace = true; - } - if (replace) - { - emit_jump_insn_before (gen_simple_return_internal_long (), ret); - delete_insn (ret); - } - } -} - -/* Count the minimum number of instructions in BB. Return 4 if the - number of instructions >= 4. */ - -static int -ix86_count_insn_bb (basic_block bb) -{ - rtx_insn *insn; - int insn_count = 0; - - /* Count number of instructions in this block. Return 4 if the number - of instructions >= 4. */ - FOR_BB_INSNS (bb, insn) - { - /* Only happen in exit blocks. */ - if (JUMP_P (insn) - && ANY_RETURN_P (PATTERN (insn))) - break; - - if (NONDEBUG_INSN_P (insn) - && GET_CODE (PATTERN (insn)) != USE - && GET_CODE (PATTERN (insn)) != CLOBBER) - { - insn_count++; - if (insn_count >= 4) - return insn_count; - } - } - - return insn_count; -} - - -/* Count the minimum number of instructions in code path in BB. - Return 4 if the number of instructions >= 4. 
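   Roughly: if BB itself is reached directly from the entry block, only BB's own instructions are counted; otherwise, for every predecessor of BB that is reached directly from entry, that predecessor's count is computed, the minimum over such predecessors is kept, and BB's own count is added on top.  Each per-block count is cut off once it reaches 4, since the caller only cares whether the total stays below 4.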
*/ - -static int -ix86_count_insn (basic_block bb) -{ - edge e; - edge_iterator ei; - int min_prev_count; - - /* Only bother counting instructions along paths with no - more than 2 basic blocks between entry and exit. Given - that BB has an edge to exit, determine if a predecessor - of BB has an edge from entry. If so, compute the number - of instructions in the predecessor block. If there - happen to be multiple such blocks, compute the minimum. */ - min_prev_count = 4; - FOR_EACH_EDGE (e, ei, bb->preds) - { - edge prev_e; - edge_iterator prev_ei; - - if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun)) - { - min_prev_count = 0; - break; - } - FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds) - { - if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun)) - { - int count = ix86_count_insn_bb (e->src); - if (count < min_prev_count) - min_prev_count = count; - break; - } - } - } - - if (min_prev_count < 4) - min_prev_count += ix86_count_insn_bb (bb); - - return min_prev_count; -} - -/* Pad short function to 4 instructions. */ - -static void -ix86_pad_short_function (void) -{ - edge e; - edge_iterator ei; - - FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) - { - rtx_insn *ret = BB_END (e->src); - if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret))) - { - int insn_count = ix86_count_insn (e->src); - - /* Pad short function. */ - if (insn_count < 4) - { - rtx_insn *insn = ret; - - /* Find epilogue. */ - while (insn - && (!NOTE_P (insn) - || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG)) - insn = PREV_INSN (insn); - - if (!insn) - insn = ret; - - /* Two NOPs count as one instruction. */ - insn_count = 2 * (4 - insn_count); - emit_insn_before (gen_nops (GEN_INT (insn_count)), insn); - } - } - } -} - -/* Fix up a Windows system unwinder issue. If an EH region falls through into - the epilogue, the Windows system unwinder will apply epilogue logic and - produce incorrect offsets. This can be avoided by adding a nop between - the last insn that can throw and the first insn of the epilogue. */ - -static void -ix86_seh_fixup_eh_fallthru (void) -{ - edge e; - edge_iterator ei; - - FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) - { - rtx_insn *insn, *next; - - /* Find the beginning of the epilogue. */ - for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn)) - if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG) - break; - if (insn == NULL) - continue; - - /* We only care about preceding insns that can throw. */ - insn = prev_active_insn (insn); - if (insn == NULL || !can_throw_internal (insn)) - continue; - - /* Do not separate calls from their debug information. */ - for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next)) - if (NOTE_P (next) && NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION) - insn = next; - else - break; - - emit_insn_after (gen_nops (const1_rtx), insn); - } -} - -/* Implement machine specific optimizations. We implement padding of returns - for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */ -static void -ix86_reorg (void) -{ - /* We are freeing block_for_insn in the toplev to keep compatibility - with old MDEP_REORGS that are not CFG based. Recompute it now. 
*/ - compute_bb_for_insn (); - - if (TARGET_SEH && current_function_has_exception_handlers ()) - ix86_seh_fixup_eh_fallthru (); - - if (optimize && optimize_function_for_speed_p (cfun)) - { - if (TARGET_PAD_SHORT_FUNCTION) - ix86_pad_short_function (); - else if (TARGET_PAD_RETURNS) - ix86_pad_returns (); -#ifdef ASM_OUTPUT_MAX_SKIP_PAD - if (TARGET_FOUR_JUMP_LIMIT) - ix86_avoid_jump_mispredicts (); -#endif - } -} - -/* Return nonzero when QImode register that must be represented via REX prefix - is used. */ -bool -x86_extended_QIreg_mentioned_p (rtx_insn *insn) -{ - int i; - extract_insn_cached (insn); - for (i = 0; i < recog_data.n_operands; i++) - if (GENERAL_REG_P (recog_data.operand[i]) - && !QI_REGNO_P (REGNO (recog_data.operand[i]))) - return true; - return false; -} - -/* Return true when INSN mentions register that must be encoded using REX - prefix. */ -bool -x86_extended_reg_mentioned_p (rtx insn) -{ - subrtx_iterator::array_type array; - FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST) - { - const_rtx x = *iter; - if (REG_P (x) - && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x)))) - return true; - } - return false; -} - -/* If profitable, negate (without causing overflow) integer constant - of mode MODE at location LOC. Return true in this case. */ -bool -x86_maybe_negate_const_int (rtx *loc, machine_mode mode) -{ - HOST_WIDE_INT val; - - if (!CONST_INT_P (*loc)) - return false; - - switch (mode) - { - case E_DImode: - /* DImode x86_64 constants must fit in 32 bits. */ - gcc_assert (x86_64_immediate_operand (*loc, mode)); - - mode = SImode; - break; - - case E_SImode: - case E_HImode: - case E_QImode: - break; - - default: - gcc_unreachable (); - } - - /* Avoid overflows. */ - if (mode_signbit_p (mode, *loc)) - return false; - - val = INTVAL (*loc); - - /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'. - Exceptions: -128 encodes smaller than 128, so swap sign and op. */ - if ((val < 0 && val != -128) - || val == 128) - { - *loc = GEN_INT (-val); - return true; - } - - return false; -} - -/* Generate an unsigned DImode/SImode to FP conversion. This is the same code - optabs would emit if we didn't have TFmode patterns. 
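   For illustration only, with an SImode input and a DFmode result the sequence emitted below computes the same value as this scalar fragment (the names are not taken from the sources):

     if ((int) x >= 0)
       result = (double) (int) x;
     else
       {
         unsigned int half = (x >> 1) | (x & 1);
         result = (double) (int) half;
         result = result + result;
       }

   Keeping the shifted-out low bit ORed into HALF is what makes the doubled result round the same way a direct unsigned conversion would.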
*/ - -void -x86_emit_floatuns (rtx operands[2]) -{ - rtx_code_label *neglab, *donelab; - rtx i0, i1, f0, in, out; - machine_mode mode, inmode; - - inmode = GET_MODE (operands[1]); - gcc_assert (inmode == SImode || inmode == DImode); - - out = operands[0]; - in = force_reg (inmode, operands[1]); - mode = GET_MODE (out); - neglab = gen_label_rtx (); - donelab = gen_label_rtx (); - f0 = gen_reg_rtx (mode); - - emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab); - - expand_float (out, in, 0); - - emit_jump_insn (gen_jump (donelab)); - emit_barrier (); - - emit_label (neglab); - - i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL, - 1, OPTAB_DIRECT); - i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL, - 1, OPTAB_DIRECT); - i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT); - - expand_float (f0, i0, 0); - - emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0))); - - emit_label (donelab); -} - -static bool canonicalize_perm (struct expand_vec_perm_d *d); -static bool expand_vec_perm_1 (struct expand_vec_perm_d *d); -static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d); -static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool); - -/* Get a vector mode of the same size as the original but with elements - twice as wide. This is only guaranteed to apply to integral vectors. */ - -static inline machine_mode -get_mode_wider_vector (machine_mode o) -{ - /* ??? Rely on the ordering that genmodes.c gives to vectors. */ - machine_mode n = GET_MODE_WIDER_MODE (o).require (); - gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2); - gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n)); - return n; -} - -/* A subroutine of ix86_expand_vector_init_duplicate. Tries to - fill target with val via vec_duplicate. */ - -static bool -ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val) -{ - bool ok; - rtx_insn *insn; - rtx dup; - - /* First attempt to recognize VAL as-is. */ - dup = gen_vec_duplicate (mode, val); - insn = emit_insn (gen_rtx_SET (target, dup)); - if (recog_memoized (insn) < 0) - { - rtx_insn *seq; - machine_mode innermode = GET_MODE_INNER (mode); - rtx reg; - - /* If that fails, force VAL into a register. */ - - start_sequence (); - reg = force_reg (innermode, val); - if (GET_MODE (reg) != innermode) - reg = gen_lowpart (innermode, reg); - SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg); - seq = get_insns (); - end_sequence (); - if (seq) - emit_insn_before (seq, insn); - - ok = recog_memoized (insn) >= 0; - gcc_assert (ok); - } - return true; -} - -/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector - with all elements equal to VAR. Return true if successful. 
*/ - -static bool -ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, - rtx target, rtx val) -{ - bool ok; - - switch (mode) - { - case E_V2SImode: - case E_V2SFmode: - if (!mmx_ok) - return false; - /* FALLTHRU */ - - case E_V4DFmode: - case E_V4DImode: - case E_V8SFmode: - case E_V8SImode: - case E_V2DFmode: - case E_V2DImode: - case E_V4SFmode: - case E_V4SImode: - case E_V16SImode: - case E_V8DImode: - case E_V16SFmode: - case E_V8DFmode: - return ix86_vector_duplicate_value (mode, target, val); - - case E_V4HImode: - if (!mmx_ok) - return false; - if (TARGET_SSE || TARGET_3DNOW_A) - { - rtx x; - - val = gen_lowpart (SImode, val); - x = gen_rtx_TRUNCATE (HImode, val); - x = gen_rtx_VEC_DUPLICATE (mode, x); - emit_insn (gen_rtx_SET (target, x)); - return true; - } - goto widen; - - case E_V8QImode: - if (!mmx_ok) - return false; - goto widen; - - case E_V8HImode: - if (TARGET_AVX2) - return ix86_vector_duplicate_value (mode, target, val); - - if (TARGET_SSE2) - { - struct expand_vec_perm_d dperm; - rtx tmp1, tmp2; - - permute: - memset (&dperm, 0, sizeof (dperm)); - dperm.target = target; - dperm.vmode = mode; - dperm.nelt = GET_MODE_NUNITS (mode); - dperm.op0 = dperm.op1 = gen_reg_rtx (mode); - dperm.one_operand_p = true; - - /* Extend to SImode using a paradoxical SUBREG. */ - tmp1 = gen_reg_rtx (SImode); - emit_move_insn (tmp1, gen_lowpart (SImode, val)); - - /* Insert the SImode value as low element of a V4SImode vector. */ - tmp2 = gen_reg_rtx (V4SImode); - emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1)); - emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2)); - - ok = (expand_vec_perm_1 (&dperm) - || expand_vec_perm_broadcast_1 (&dperm)); - gcc_assert (ok); - return ok; - } - goto widen; - - case E_V16QImode: - if (TARGET_AVX2) - return ix86_vector_duplicate_value (mode, target, val); - - if (TARGET_SSE2) - goto permute; - goto widen; - - widen: - /* Replicate the value once into the next wider mode and recurse. */ - { - machine_mode smode, wsmode, wvmode; - rtx x; - - smode = GET_MODE_INNER (mode); - wvmode = get_mode_wider_vector (mode); - wsmode = GET_MODE_INNER (wvmode); - - val = convert_modes (wsmode, smode, val, true); - x = expand_simple_binop (wsmode, ASHIFT, val, - GEN_INT (GET_MODE_BITSIZE (smode)), - NULL_RTX, 1, OPTAB_LIB_WIDEN); - val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN); - - x = gen_reg_rtx (wvmode); - ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val); - gcc_assert (ok); - emit_move_insn (target, gen_lowpart (GET_MODE (target), x)); - return ok; - } - - case E_V16HImode: - case E_V32QImode: - if (TARGET_AVX2) - return ix86_vector_duplicate_value (mode, target, val); - else - { - machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode); - rtx x = gen_reg_rtx (hvmode); - - ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val); - gcc_assert (ok); - - x = gen_rtx_VEC_CONCAT (mode, x, x); - emit_insn (gen_rtx_SET (target, x)); - } - return true; - - case E_V64QImode: - case E_V32HImode: - if (TARGET_AVX512BW) - return ix86_vector_duplicate_value (mode, target, val); - else - { - machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode); - rtx x = gen_reg_rtx (hvmode); - - ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val); - gcc_assert (ok); - - x = gen_rtx_VEC_CONCAT (mode, x, x); - emit_insn (gen_rtx_SET (target, x)); - } - return true; - - default: - return false; - } -} - -/* A subroutine of ix86_expand_vector_init. 
Store into TARGET a vector - whose ONE_VAR element is VAR, and other elements are zero. Return true - if successful. */ - -static bool -ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, - rtx target, rtx var, int one_var) -{ - machine_mode vsimode; - rtx new_target; - rtx x, tmp; - bool use_vector_set = false; - rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL; - - switch (mode) - { - case E_V2DImode: - /* For SSE4.1, we normally use vector set. But if the second - element is zero and inter-unit moves are OK, we use movq - instead. */ - use_vector_set = (TARGET_64BIT && TARGET_SSE4_1 - && !(TARGET_INTER_UNIT_MOVES_TO_VEC - && one_var == 0)); - break; - case E_V16QImode: - case E_V4SImode: - case E_V4SFmode: - use_vector_set = TARGET_SSE4_1; - break; - case E_V8HImode: - use_vector_set = TARGET_SSE2; - break; - case E_V4HImode: - use_vector_set = TARGET_SSE || TARGET_3DNOW_A; - break; - case E_V32QImode: - case E_V16HImode: - use_vector_set = TARGET_AVX; - break; - case E_V8SImode: - use_vector_set = TARGET_AVX; - gen_vec_set_0 = gen_vec_setv8si_0; - break; - case E_V8SFmode: - use_vector_set = TARGET_AVX; - gen_vec_set_0 = gen_vec_setv8sf_0; - break; - case E_V4DFmode: - use_vector_set = TARGET_AVX; - gen_vec_set_0 = gen_vec_setv4df_0; - break; - case E_V4DImode: - /* Use ix86_expand_vector_set in 64bit mode only. */ - use_vector_set = TARGET_AVX && TARGET_64BIT; - gen_vec_set_0 = gen_vec_setv4di_0; - break; - case E_V16SImode: - use_vector_set = TARGET_AVX512F && one_var == 0; - gen_vec_set_0 = gen_vec_setv16si_0; - break; - case E_V16SFmode: - use_vector_set = TARGET_AVX512F && one_var == 0; - gen_vec_set_0 = gen_vec_setv16sf_0; - break; - case E_V8DFmode: - use_vector_set = TARGET_AVX512F && one_var == 0; - gen_vec_set_0 = gen_vec_setv8df_0; - break; - case E_V8DImode: - /* Use ix86_expand_vector_set in 64bit mode only. */ - use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0; - gen_vec_set_0 = gen_vec_setv8di_0; - break; - default: - break; - } - - if (use_vector_set) - { - if (gen_vec_set_0 && one_var == 0) - { - var = force_reg (GET_MODE_INNER (mode), var); - emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var)); - return true; - } - emit_insn (gen_rtx_SET (target, CONST0_RTX (mode))); - var = force_reg (GET_MODE_INNER (mode), var); - ix86_expand_vector_set (mmx_ok, target, var, one_var); - return true; - } - - switch (mode) - { - case E_V2SFmode: - case E_V2SImode: - if (!mmx_ok) - return false; - /* FALLTHRU */ - - case E_V2DFmode: - case E_V2DImode: - if (one_var != 0) - return false; - var = force_reg (GET_MODE_INNER (mode), var); - x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode))); - emit_insn (gen_rtx_SET (target, x)); - return true; - - case E_V4SFmode: - case E_V4SImode: - if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER) - new_target = gen_reg_rtx (mode); - else - new_target = target; - var = force_reg (GET_MODE_INNER (mode), var); - x = gen_rtx_VEC_DUPLICATE (mode, var); - x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx); - emit_insn (gen_rtx_SET (new_target, x)); - if (one_var != 0) - { - /* We need to shuffle the value to the correct position, so - create a new pseudo to store the intermediate result. */ - - /* With SSE2, we can use the integer shuffle insns. */ - if (mode != V4SFmode && TARGET_SSE2) - { - emit_insn (gen_sse2_pshufd_1 (new_target, new_target, - const1_rtx, - GEN_INT (one_var == 1 ? 0 : 1), - GEN_INT (one_var == 2 ? 0 : 1), - GEN_INT (one_var == 3 ? 
0 : 1))); - if (target != new_target) - emit_move_insn (target, new_target); - return true; - } - - /* Otherwise convert the intermediate result to V4SFmode and - use the SSE1 shuffle instructions. */ - if (mode != V4SFmode) - { - tmp = gen_reg_rtx (V4SFmode); - emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target)); - } - else - tmp = new_target; - - emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp, - const1_rtx, - GEN_INT (one_var == 1 ? 0 : 1), - GEN_INT (one_var == 2 ? 0+4 : 1+4), - GEN_INT (one_var == 3 ? 0+4 : 1+4))); - - if (mode != V4SFmode) - emit_move_insn (target, gen_lowpart (V4SImode, tmp)); - else if (tmp != target) - emit_move_insn (target, tmp); - } - else if (target != new_target) - emit_move_insn (target, new_target); - return true; - - case E_V8HImode: - case E_V16QImode: - vsimode = V4SImode; - goto widen; - case E_V4HImode: - case E_V8QImode: - if (!mmx_ok) - return false; - vsimode = V2SImode; - goto widen; - widen: - if (one_var != 0) - return false; - - /* Zero extend the variable element to SImode and recurse. */ - var = convert_modes (SImode, GET_MODE_INNER (mode), var, true); - - x = gen_reg_rtx (vsimode); - if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x, - var, one_var)) - gcc_unreachable (); - - emit_move_insn (target, gen_lowpart (mode, x)); - return true; - - default: - return false; - } -} - -/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector - consisting of the values in VALS. It is known that all elements - except ONE_VAR are constants. Return true if successful. */ - -static bool -ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode, - rtx target, rtx vals, int one_var) -{ - rtx var = XVECEXP (vals, 0, one_var); - machine_mode wmode; - rtx const_vec, x; - - const_vec = copy_rtx (vals); - XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode)); - const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0)); - - switch (mode) - { - case E_V2DFmode: - case E_V2DImode: - case E_V2SFmode: - case E_V2SImode: - /* For the two element vectors, it's just as easy to use - the general case. */ - return false; - - case E_V4DImode: - /* Use ix86_expand_vector_set in 64bit mode only. */ - if (!TARGET_64BIT) - return false; - /* FALLTHRU */ - case E_V4DFmode: - case E_V8SFmode: - case E_V8SImode: - case E_V16HImode: - case E_V32QImode: - case E_V4SFmode: - case E_V4SImode: - case E_V8HImode: - case E_V4HImode: - break; - - case E_V16QImode: - if (TARGET_SSE4_1) - break; - wmode = V8HImode; - goto widen; - case E_V8QImode: - wmode = V4HImode; - goto widen; - widen: - /* There's no way to set one QImode entry easily. Combine - the variable value with its adjacent constant value, and - promote to an HImode set. 
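   For instance, if ONE_VAR is 5 in a V16QImode build, the variable byte is paired with the constant byte at index 4: the combined HImode value is roughly (var << 8) | (c4 & 0xff), and it is stored as element 2 (ONE_VAR >> 1) of the V8HImode view of the constant vector.  For an even ONE_VAR the roles are swapped and the constant neighbour supplies the high byte.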
*/ - x = XVECEXP (vals, 0, one_var ^ 1); - if (one_var & 1) - { - var = convert_modes (HImode, QImode, var, true); - var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8), - NULL_RTX, 1, OPTAB_LIB_WIDEN); - x = GEN_INT (INTVAL (x) & 0xff); - } - else - { - var = convert_modes (HImode, QImode, var, true); - x = gen_int_mode (UINTVAL (x) << 8, HImode); - } - if (x != const0_rtx) - var = expand_simple_binop (HImode, IOR, var, x, var, - 1, OPTAB_LIB_WIDEN); - - x = gen_reg_rtx (wmode); - emit_move_insn (x, gen_lowpart (wmode, const_vec)); - ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1); - - emit_move_insn (target, gen_lowpart (mode, x)); - return true; - - default: - return false; - } - - emit_move_insn (target, const_vec); - ix86_expand_vector_set (mmx_ok, target, var, one_var); - return true; -} - -/* A subroutine of ix86_expand_vector_init_general. Use vector - concatenate to handle the most general case: all values variable, - and none identical. */ - -static void -ix86_expand_vector_init_concat (machine_mode mode, - rtx target, rtx *ops, int n) -{ - machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode; - rtx first[16], second[8], third[4]; - rtvec v; - int i, j; - - switch (n) - { - case 2: - switch (mode) - { - case E_V16SImode: - cmode = V8SImode; - break; - case E_V16SFmode: - cmode = V8SFmode; - break; - case E_V8DImode: - cmode = V4DImode; - break; - case E_V8DFmode: - cmode = V4DFmode; - break; - case E_V8SImode: - cmode = V4SImode; - break; - case E_V8SFmode: - cmode = V4SFmode; - break; - case E_V4DImode: - cmode = V2DImode; - break; - case E_V4DFmode: - cmode = V2DFmode; - break; - case E_V4SImode: - cmode = V2SImode; - break; - case E_V4SFmode: - cmode = V2SFmode; - break; - case E_V2DImode: - cmode = DImode; - break; - case E_V2SImode: - cmode = SImode; - break; - case E_V2DFmode: - cmode = DFmode; - break; - case E_V2SFmode: - cmode = SFmode; - break; - default: - gcc_unreachable (); - } - - if (!register_operand (ops[1], cmode)) - ops[1] = force_reg (cmode, ops[1]); - if (!register_operand (ops[0], cmode)) - ops[0] = force_reg (cmode, ops[0]); - emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0], - ops[1]))); - break; - - case 4: - switch (mode) - { - case E_V4DImode: - cmode = V2DImode; - break; - case E_V4DFmode: - cmode = V2DFmode; - break; - case E_V4SImode: - cmode = V2SImode; - break; - case E_V4SFmode: - cmode = V2SFmode; - break; - default: - gcc_unreachable (); - } - goto half; - - case 8: - switch (mode) - { - case E_V8DImode: - cmode = V2DImode; - hmode = V4DImode; - break; - case E_V8DFmode: - cmode = V2DFmode; - hmode = V4DFmode; - break; - case E_V8SImode: - cmode = V2SImode; - hmode = V4SImode; - break; - case E_V8SFmode: - cmode = V2SFmode; - hmode = V4SFmode; - break; - default: - gcc_unreachable (); - } - goto half; - - case 16: - switch (mode) - { - case E_V16SImode: - cmode = V2SImode; - hmode = V4SImode; - gmode = V8SImode; - break; - case E_V16SFmode: - cmode = V2SFmode; - hmode = V4SFmode; - gmode = V8SFmode; - break; - default: - gcc_unreachable (); - } - goto half; - -half: - /* FIXME: We process inputs backward to help RA. PR 36222. 
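   As a rough example, a V4DFmode build from ops[0..3] first forms first[1] from (ops[2], ops[3]), then first[0] from (ops[0], ops[1]), and finally concatenates first[0] and first[1] into TARGET; walking the operands from the end is the backward processing the FIXME refers to.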
*/ - i = n - 1; - j = (n >> 1) - 1; - for (; i > 0; i -= 2, j--) - { - first[j] = gen_reg_rtx (cmode); - v = gen_rtvec (2, ops[i - 1], ops[i]); - ix86_expand_vector_init (false, first[j], - gen_rtx_PARALLEL (cmode, v)); - } - - n >>= 1; - if (n > 4) - { - gcc_assert (hmode != VOIDmode); - gcc_assert (gmode != VOIDmode); - for (i = j = 0; i < n; i += 2, j++) - { - second[j] = gen_reg_rtx (hmode); - ix86_expand_vector_init_concat (hmode, second [j], - &first [i], 2); - } - n >>= 1; - for (i = j = 0; i < n; i += 2, j++) - { - third[j] = gen_reg_rtx (gmode); - ix86_expand_vector_init_concat (gmode, third[j], - &second[i], 2); - } - n >>= 1; - ix86_expand_vector_init_concat (mode, target, third, n); - } - else if (n > 2) - { - gcc_assert (hmode != VOIDmode); - for (i = j = 0; i < n; i += 2, j++) - { - second[j] = gen_reg_rtx (hmode); - ix86_expand_vector_init_concat (hmode, second [j], - &first [i], 2); - } - n >>= 1; - ix86_expand_vector_init_concat (mode, target, second, n); - } - else - ix86_expand_vector_init_concat (mode, target, first, n); - break; - - default: - gcc_unreachable (); - } -} - -/* A subroutine of ix86_expand_vector_init_general. Use vector - interleave to handle the most general case: all values variable, - and none identical. */ - -static void -ix86_expand_vector_init_interleave (machine_mode mode, - rtx target, rtx *ops, int n) -{ - machine_mode first_imode, second_imode, third_imode, inner_mode; - int i, j; - rtx op0, op1; - rtx (*gen_load_even) (rtx, rtx, rtx); - rtx (*gen_interleave_first_low) (rtx, rtx, rtx); - rtx (*gen_interleave_second_low) (rtx, rtx, rtx); - - switch (mode) - { - case E_V8HImode: - gen_load_even = gen_vec_setv8hi; - gen_interleave_first_low = gen_vec_interleave_lowv4si; - gen_interleave_second_low = gen_vec_interleave_lowv2di; - inner_mode = HImode; - first_imode = V4SImode; - second_imode = V2DImode; - third_imode = VOIDmode; - break; - case E_V16QImode: - gen_load_even = gen_vec_setv16qi; - gen_interleave_first_low = gen_vec_interleave_lowv8hi; - gen_interleave_second_low = gen_vec_interleave_lowv4si; - inner_mode = QImode; - first_imode = V8HImode; - second_imode = V4SImode; - third_imode = V2DImode; - break; - default: - gcc_unreachable (); - } - - for (i = 0; i < n; i++) - { - /* Extend the odd elment to SImode using a paradoxical SUBREG. */ - op0 = gen_reg_rtx (SImode); - emit_move_insn (op0, gen_lowpart (SImode, ops [i + i])); - - /* Insert the SImode value as low element of V4SImode vector. */ - op1 = gen_reg_rtx (V4SImode); - op0 = gen_rtx_VEC_MERGE (V4SImode, - gen_rtx_VEC_DUPLICATE (V4SImode, - op0), - CONST0_RTX (V4SImode), - const1_rtx); - emit_insn (gen_rtx_SET (op1, op0)); - - /* Cast the V4SImode vector back to a vector in orignal mode. */ - op0 = gen_reg_rtx (mode); - emit_move_insn (op0, gen_lowpart (mode, op1)); - - /* Load even elements into the second position. */ - emit_insn (gen_load_even (op0, - force_reg (inner_mode, - ops [i + i + 1]), - const1_rtx)); - - /* Cast vector to FIRST_IMODE vector. */ - ops[i] = gen_reg_rtx (first_imode); - emit_move_insn (ops[i], gen_lowpart (first_imode, op0)); - } - - /* Interleave low FIRST_IMODE vectors. */ - for (i = j = 0; i < n; i += 2, j++) - { - op0 = gen_reg_rtx (first_imode); - emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1])); - - /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */ - ops[j] = gen_reg_rtx (second_imode); - emit_move_insn (ops[j], gen_lowpart (second_imode, op0)); - } - - /* Interleave low SECOND_IMODE vectors. 
*/ - switch (second_imode) - { - case E_V4SImode: - for (i = j = 0; i < n / 2; i += 2, j++) - { - op0 = gen_reg_rtx (second_imode); - emit_insn (gen_interleave_second_low (op0, ops[i], - ops[i + 1])); - - /* Cast the SECOND_IMODE vector to the THIRD_IMODE - vector. */ - ops[j] = gen_reg_rtx (third_imode); - emit_move_insn (ops[j], gen_lowpart (third_imode, op0)); - } - second_imode = V2DImode; - gen_interleave_second_low = gen_vec_interleave_lowv2di; - /* FALLTHRU */ - - case E_V2DImode: - op0 = gen_reg_rtx (second_imode); - emit_insn (gen_interleave_second_low (op0, ops[0], - ops[1])); - - /* Cast the SECOND_IMODE vector back to a vector on original - mode. */ - emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0))); - break; - - default: - gcc_unreachable (); - } -} - -/* A subroutine of ix86_expand_vector_init. Handle the most general case: - all values variable, and none identical. */ - -static void -ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode, - rtx target, rtx vals) -{ - rtx ops[64], op0, op1, op2, op3, op4, op5; - machine_mode half_mode = VOIDmode; - machine_mode quarter_mode = VOIDmode; - int n, i; - - switch (mode) - { - case E_V2SFmode: - case E_V2SImode: - if (!mmx_ok && !TARGET_SSE) - break; - /* FALLTHRU */ - - case E_V16SImode: - case E_V16SFmode: - case E_V8DFmode: - case E_V8DImode: - case E_V8SFmode: - case E_V8SImode: - case E_V4DFmode: - case E_V4DImode: - case E_V4SFmode: - case E_V4SImode: - case E_V2DFmode: - case E_V2DImode: - n = GET_MODE_NUNITS (mode); - for (i = 0; i < n; i++) - ops[i] = XVECEXP (vals, 0, i); - ix86_expand_vector_init_concat (mode, target, ops, n); - return; - - case E_V2TImode: - for (i = 0; i < 2; i++) - ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i)); - op0 = gen_reg_rtx (V4DImode); - ix86_expand_vector_init_concat (V4DImode, op0, ops, 2); - emit_move_insn (target, gen_lowpart (GET_MODE (target), op0)); - return; - - case E_V4TImode: - for (i = 0; i < 4; i++) - ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i)); - ops[4] = gen_reg_rtx (V4DImode); - ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2); - ops[5] = gen_reg_rtx (V4DImode); - ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2); - op0 = gen_reg_rtx (V8DImode); - ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2); - emit_move_insn (target, gen_lowpart (GET_MODE (target), op0)); - return; - - case E_V32QImode: - half_mode = V16QImode; - goto half; - - case E_V16HImode: - half_mode = V8HImode; - goto half; - -half: - n = GET_MODE_NUNITS (mode); - for (i = 0; i < n; i++) - ops[i] = XVECEXP (vals, 0, i); - op0 = gen_reg_rtx (half_mode); - op1 = gen_reg_rtx (half_mode); - ix86_expand_vector_init_interleave (half_mode, op0, ops, - n >> 2); - ix86_expand_vector_init_interleave (half_mode, op1, - &ops [n >> 1], n >> 2); - emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1))); - return; - - case E_V64QImode: - quarter_mode = V16QImode; - half_mode = V32QImode; - goto quarter; - - case E_V32HImode: - quarter_mode = V8HImode; - half_mode = V16HImode; - goto quarter; - -quarter: - n = GET_MODE_NUNITS (mode); - for (i = 0; i < n; i++) - ops[i] = XVECEXP (vals, 0, i); - op0 = gen_reg_rtx (quarter_mode); - op1 = gen_reg_rtx (quarter_mode); - op2 = gen_reg_rtx (quarter_mode); - op3 = gen_reg_rtx (quarter_mode); - op4 = gen_reg_rtx (half_mode); - op5 = gen_reg_rtx (half_mode); - ix86_expand_vector_init_interleave (quarter_mode, op0, ops, - n >> 3); - ix86_expand_vector_init_interleave (quarter_mode, op1, - &ops [n >> 2], n >> 3); - 
ix86_expand_vector_init_interleave (quarter_mode, op2, - &ops [n >> 1], n >> 3); - ix86_expand_vector_init_interleave (quarter_mode, op3, - &ops [(n >> 1) | (n >> 2)], n >> 3); - emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1))); - emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3))); - emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5))); - return; - - case E_V16QImode: - if (!TARGET_SSE4_1) - break; - /* FALLTHRU */ - - case E_V8HImode: - if (!TARGET_SSE2) - break; - - /* Don't use ix86_expand_vector_init_interleave if we can't - move from GPR to SSE register directly. */ - if (!TARGET_INTER_UNIT_MOVES_TO_VEC) - break; - - n = GET_MODE_NUNITS (mode); - for (i = 0; i < n; i++) - ops[i] = XVECEXP (vals, 0, i); - ix86_expand_vector_init_interleave (mode, target, ops, n >> 1); - return; - - case E_V4HImode: - case E_V8QImode: - break; - - default: - gcc_unreachable (); - } - - { - int i, j, n_elts, n_words, n_elt_per_word; - machine_mode inner_mode; - rtx words[4], shift; - - inner_mode = GET_MODE_INNER (mode); - n_elts = GET_MODE_NUNITS (mode); - n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD; - n_elt_per_word = n_elts / n_words; - shift = GEN_INT (GET_MODE_BITSIZE (inner_mode)); - - for (i = 0; i < n_words; ++i) - { - rtx word = NULL_RTX; - - for (j = 0; j < n_elt_per_word; ++j) - { - rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1); - elt = convert_modes (word_mode, inner_mode, elt, true); - - if (j == 0) - word = elt; - else - { - word = expand_simple_binop (word_mode, ASHIFT, word, shift, - word, 1, OPTAB_LIB_WIDEN); - word = expand_simple_binop (word_mode, IOR, word, elt, - word, 1, OPTAB_LIB_WIDEN); - } - } - - words[i] = word; - } - - if (n_words == 1) - emit_move_insn (target, gen_lowpart (mode, words[0])); - else if (n_words == 2) - { - rtx tmp = gen_reg_rtx (mode); - emit_clobber (tmp); - emit_move_insn (gen_lowpart (word_mode, tmp), words[0]); - emit_move_insn (gen_highpart (word_mode, tmp), words[1]); - emit_move_insn (target, tmp); - } - else if (n_words == 4) - { - rtx tmp = gen_reg_rtx (V4SImode); - gcc_assert (word_mode == SImode); - vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words)); - ix86_expand_vector_init_general (false, V4SImode, tmp, vals); - emit_move_insn (target, gen_lowpart (mode, tmp)); - } - else - gcc_unreachable (); - } -} - -/* Initialize vector TARGET via VALS. Suppress the use of MMX - instructions unless MMX_OK is true. */ - -void -ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals) -{ - machine_mode mode = GET_MODE (target); - machine_mode inner_mode = GET_MODE_INNER (mode); - int n_elts = GET_MODE_NUNITS (mode); - int n_var = 0, one_var = -1; - bool all_same = true, all_const_zero = true; - int i; - rtx x; - - /* Handle first initialization from vector elts. 
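   This covers the case where VALS holds two half-width vectors instead of N_ELTS scalars: e.g. a V8SImode TARGET given two V4SImode operands is built with a single concatenation, and QImode/HImode halves are first re-viewed as SImode vectors so the same concatenation path can be used.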
*/ - if (n_elts != XVECLEN (vals, 0)) - { - rtx subtarget = target; - x = XVECEXP (vals, 0, 0); - gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode); - if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts) - { - rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) }; - if (inner_mode == QImode || inner_mode == HImode) - { - unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode); - mode = mode_for_vector (SImode, n_bits / 4).require (); - inner_mode = mode_for_vector (SImode, n_bits / 8).require (); - ops[0] = gen_lowpart (inner_mode, ops[0]); - ops[1] = gen_lowpart (inner_mode, ops[1]); - subtarget = gen_reg_rtx (mode); - } - ix86_expand_vector_init_concat (mode, subtarget, ops, 2); - if (subtarget != target) - emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget)); - return; - } - gcc_unreachable (); - } - - for (i = 0; i < n_elts; ++i) - { - x = XVECEXP (vals, 0, i); - if (!(CONST_SCALAR_INT_P (x) - || CONST_DOUBLE_P (x) - || CONST_FIXED_P (x))) - n_var++, one_var = i; - else if (x != CONST0_RTX (inner_mode)) - all_const_zero = false; - if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0))) - all_same = false; - } - - /* Constants are best loaded from the constant pool. */ - if (n_var == 0) - { - emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0))); - return; - } - - /* If all values are identical, broadcast the value. */ - if (all_same - && ix86_expand_vector_init_duplicate (mmx_ok, mode, target, - XVECEXP (vals, 0, 0))) - return; - - /* Values where only one field is non-constant are best loaded from - the pool and overwritten via move later. */ - if (n_var == 1) - { - if (all_const_zero - && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target, - XVECEXP (vals, 0, one_var), - one_var)) - return; - - if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var)) - return; - } - - ix86_expand_vector_init_general (mmx_ok, mode, target, vals); -} - -void -ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) -{ - machine_mode mode = GET_MODE (target); - machine_mode inner_mode = GET_MODE_INNER (mode); - machine_mode half_mode; - bool use_vec_merge = false; - rtx tmp; - static rtx (*gen_extract[6][2]) (rtx, rtx) - = { - { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi }, - { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi }, - { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si }, - { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di }, - { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf }, - { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df } - }; - static rtx (*gen_insert[6][2]) (rtx, rtx, rtx) - = { - { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi }, - { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi }, - { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si }, - { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di }, - { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf }, - { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df } - }; - int i, j, n; - machine_mode mmode = VOIDmode; - rtx (*gen_blendm) (rtx, rtx, rtx, rtx); - - switch (mode) - { - case E_V2SFmode: - case E_V2SImode: - if (mmx_ok) - { - tmp = gen_reg_rtx (GET_MODE_INNER (mode)); - ix86_expand_vector_extract (true, tmp, target, 1 - elt); - if (elt == 0) - tmp = gen_rtx_VEC_CONCAT (mode, val, tmp); - else - tmp = gen_rtx_VEC_CONCAT (mode, tmp, val); - emit_insn (gen_rtx_SET (target, tmp)); - return; - } - break; - - case E_V2DImode: - use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT; - if (use_vec_merge) - break; - - tmp = gen_reg_rtx (GET_MODE_INNER (mode)); - ix86_expand_vector_extract (false, 
tmp, target, 1 - elt); - if (elt == 0) - tmp = gen_rtx_VEC_CONCAT (mode, val, tmp); - else - tmp = gen_rtx_VEC_CONCAT (mode, tmp, val); - emit_insn (gen_rtx_SET (target, tmp)); - return; - - case E_V2DFmode: - { - rtx op0, op1; - - /* For the two element vectors, we implement a VEC_CONCAT with - the extraction of the other element. */ - - tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt))); - tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp); - - if (elt == 0) - op0 = val, op1 = tmp; - else - op0 = tmp, op1 = val; - - tmp = gen_rtx_VEC_CONCAT (mode, op0, op1); - emit_insn (gen_rtx_SET (target, tmp)); - } - return; - - case E_V4SFmode: - use_vec_merge = TARGET_SSE4_1; - if (use_vec_merge) - break; - - switch (elt) - { - case 0: - use_vec_merge = true; - break; - - case 1: - /* tmp = target = A B C D */ - tmp = copy_to_reg (target); - /* target = A A B B */ - emit_insn (gen_vec_interleave_lowv4sf (target, target, target)); - /* target = X A B B */ - ix86_expand_vector_set (false, target, val, 0); - /* target = A X C D */ - emit_insn (gen_sse_shufps_v4sf (target, target, tmp, - const1_rtx, const0_rtx, - GEN_INT (2+4), GEN_INT (3+4))); - return; - - case 2: - /* tmp = target = A B C D */ - tmp = copy_to_reg (target); - /* tmp = X B C D */ - ix86_expand_vector_set (false, tmp, val, 0); - /* target = A B X D */ - emit_insn (gen_sse_shufps_v4sf (target, target, tmp, - const0_rtx, const1_rtx, - GEN_INT (0+4), GEN_INT (3+4))); - return; - - case 3: - /* tmp = target = A B C D */ - tmp = copy_to_reg (target); - /* tmp = X B C D */ - ix86_expand_vector_set (false, tmp, val, 0); - /* target = A B X D */ - emit_insn (gen_sse_shufps_v4sf (target, target, tmp, - const0_rtx, const1_rtx, - GEN_INT (2+4), GEN_INT (0+4))); - return; - - default: - gcc_unreachable (); - } - break; - - case E_V4SImode: - use_vec_merge = TARGET_SSE4_1; - if (use_vec_merge) - break; - - /* Element 0 handled by vec_merge below. */ - if (elt == 0) - { - use_vec_merge = true; - break; - } - - if (TARGET_SSE2) - { - /* With SSE2, use integer shuffles to swap element 0 and ELT, - store into element 0, then shuffle them back. */ - - rtx order[4]; - - order[0] = GEN_INT (elt); - order[1] = const1_rtx; - order[2] = const2_rtx; - order[3] = GEN_INT (3); - order[elt] = const0_rtx; - - emit_insn (gen_sse2_pshufd_1 (target, target, order[0], - order[1], order[2], order[3])); - - ix86_expand_vector_set (false, target, val, 0); - - emit_insn (gen_sse2_pshufd_1 (target, target, order[0], - order[1], order[2], order[3])); - } - else - { - /* For SSE1, we have to reuse the V4SF code. */ - rtx t = gen_reg_rtx (V4SFmode); - emit_move_insn (t, gen_lowpart (V4SFmode, target)); - ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt); - emit_move_insn (target, gen_lowpart (mode, t)); - } - return; - - case E_V8HImode: - use_vec_merge = TARGET_SSE2; - break; - case E_V4HImode: - use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A); - break; - - case E_V16QImode: - use_vec_merge = TARGET_SSE4_1; - break; - - case E_V8QImode: - break; - - case E_V32QImode: - half_mode = V16QImode; - j = 0; - n = 16; - goto half; - - case E_V16HImode: - half_mode = V8HImode; - j = 1; - n = 8; - goto half; - - case E_V8SImode: - half_mode = V4SImode; - j = 2; - n = 4; - goto half; - - case E_V4DImode: - half_mode = V2DImode; - j = 3; - n = 2; - goto half; - - case E_V8SFmode: - half_mode = V4SFmode; - j = 4; - n = 4; - goto half; - - case E_V4DFmode: - half_mode = V2DFmode; - j = 5; - n = 2; - goto half; - -half: - /* Compute offset. 
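   For example, setting element 5 of a V8SImode TARGET gives n == 4, so i == 1 selects the high 128-bit half and the index within that half becomes 1; the half is extracted, updated recursively, and then inserted back.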
*/ - i = elt / n; - elt %= n; - - gcc_assert (i <= 1); - - /* Extract the half. */ - tmp = gen_reg_rtx (half_mode); - emit_insn (gen_extract[j][i] (tmp, target)); - - /* Put val in tmp at elt. */ - ix86_expand_vector_set (false, tmp, val, elt); - - /* Put it back. */ - emit_insn (gen_insert[j][i] (target, target, tmp)); - return; - - case E_V8DFmode: - if (TARGET_AVX512F) - { - mmode = QImode; - gen_blendm = gen_avx512f_blendmv8df; - } - break; - - case E_V8DImode: - if (TARGET_AVX512F) - { - mmode = QImode; - gen_blendm = gen_avx512f_blendmv8di; - } - break; - - case E_V16SFmode: - if (TARGET_AVX512F) - { - mmode = HImode; - gen_blendm = gen_avx512f_blendmv16sf; - } - break; - - case E_V16SImode: - if (TARGET_AVX512F) - { - mmode = HImode; - gen_blendm = gen_avx512f_blendmv16si; - } - break; - - case E_V32HImode: - if (TARGET_AVX512BW) - { - mmode = SImode; - gen_blendm = gen_avx512bw_blendmv32hi; - } - else if (TARGET_AVX512F) - { - half_mode = E_V8HImode; - n = 8; - goto quarter; - } - break; - - case E_V64QImode: - if (TARGET_AVX512BW) - { - mmode = DImode; - gen_blendm = gen_avx512bw_blendmv64qi; - } - else if (TARGET_AVX512F) - { - half_mode = E_V16QImode; - n = 16; - goto quarter; - } - break; - -quarter: - /* Compute offset. */ - i = elt / n; - elt %= n; - - gcc_assert (i <= 3); - - { - /* Extract the quarter. */ - tmp = gen_reg_rtx (V4SImode); - rtx tmp2 = gen_lowpart (V16SImode, target); - rtx mask = gen_reg_rtx (QImode); - - emit_move_insn (mask, constm1_rtx); - emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i), - tmp, mask)); - - tmp2 = gen_reg_rtx (half_mode); - emit_move_insn (tmp2, gen_lowpart (half_mode, tmp)); - tmp = tmp2; - - /* Put val in tmp at elt. */ - ix86_expand_vector_set (false, tmp, val, elt); - - /* Put it back. */ - tmp2 = gen_reg_rtx (V16SImode); - rtx tmp3 = gen_lowpart (V16SImode, target); - mask = gen_reg_rtx (HImode); - emit_move_insn (mask, constm1_rtx); - tmp = gen_lowpart (V4SImode, tmp); - emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i), - tmp3, mask)); - emit_move_insn (target, gen_lowpart (mode, tmp2)); - } - return; - - default: - break; - } - - if (mmode != VOIDmode) - { - tmp = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val))); - /* The avx512*_blendm expanders have different operand order - from VEC_MERGE. In VEC_MERGE, the first input operand is used for - elements where the mask is set and second input operand otherwise, - in {sse,avx}*_*blend* the first input operand is used for elements - where the mask is clear and second input operand otherwise. 
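   Concretely, in the call just below TARGET is passed as the mask-clear operand and the broadcast TMP as the mask-set operand, with the immediate 1 << ELT as the mask, so only lane ELT receives the new value.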
*/ - emit_insn (gen_blendm (target, target, tmp, - force_reg (mmode, - gen_int_mode (HOST_WIDE_INT_1U << elt, - mmode)))); - } - else if (use_vec_merge) - { - tmp = gen_rtx_VEC_DUPLICATE (mode, val); - tmp = gen_rtx_VEC_MERGE (mode, tmp, target, - GEN_INT (HOST_WIDE_INT_1U << elt)); - emit_insn (gen_rtx_SET (target, tmp)); - } - else - { - rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode)); - - emit_move_insn (mem, target); - - tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode)); - emit_move_insn (tmp, val); - - emit_move_insn (target, mem); - } -} - -void -ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt) -{ - machine_mode mode = GET_MODE (vec); - machine_mode inner_mode = GET_MODE_INNER (mode); - bool use_vec_extr = false; - rtx tmp; - - switch (mode) - { - case E_V2SImode: - case E_V2SFmode: - if (!mmx_ok) - break; - /* FALLTHRU */ - - case E_V2DFmode: - case E_V2DImode: - case E_V2TImode: - case E_V4TImode: - use_vec_extr = true; - break; - - case E_V4SFmode: - use_vec_extr = TARGET_SSE4_1; - if (use_vec_extr) - break; - - switch (elt) - { - case 0: - tmp = vec; - break; - - case 1: - case 3: - tmp = gen_reg_rtx (mode); - emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec, - GEN_INT (elt), GEN_INT (elt), - GEN_INT (elt+4), GEN_INT (elt+4))); - break; - - case 2: - tmp = gen_reg_rtx (mode); - emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec)); - break; - - default: - gcc_unreachable (); - } - vec = tmp; - use_vec_extr = true; - elt = 0; - break; - - case E_V4SImode: - use_vec_extr = TARGET_SSE4_1; - if (use_vec_extr) - break; - - if (TARGET_SSE2) - { - switch (elt) - { - case 0: - tmp = vec; - break; - - case 1: - case 3: - tmp = gen_reg_rtx (mode); - emit_insn (gen_sse2_pshufd_1 (tmp, vec, - GEN_INT (elt), GEN_INT (elt), - GEN_INT (elt), GEN_INT (elt))); - break; - - case 2: - tmp = gen_reg_rtx (mode); - emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec)); - break; - - default: - gcc_unreachable (); - } - vec = tmp; - use_vec_extr = true; - elt = 0; - } - else - { - /* For SSE1, we have to reuse the V4SF code. 
*/ - ix86_expand_vector_extract (false, gen_lowpart (SFmode, target), - gen_lowpart (V4SFmode, vec), elt); - return; - } - break; - - case E_V8HImode: - use_vec_extr = TARGET_SSE2; - break; - case E_V4HImode: - use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A); - break; - - case E_V16QImode: - use_vec_extr = TARGET_SSE4_1; - break; - - case E_V8SFmode: - if (TARGET_AVX) - { - tmp = gen_reg_rtx (V4SFmode); - if (elt < 4) - emit_insn (gen_vec_extract_lo_v8sf (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v8sf (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 3); - return; - } - break; - - case E_V4DFmode: - if (TARGET_AVX) - { - tmp = gen_reg_rtx (V2DFmode); - if (elt < 2) - emit_insn (gen_vec_extract_lo_v4df (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v4df (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 1); - return; - } - break; - - case E_V32QImode: - if (TARGET_AVX) - { - tmp = gen_reg_rtx (V16QImode); - if (elt < 16) - emit_insn (gen_vec_extract_lo_v32qi (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v32qi (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 15); - return; - } - break; - - case E_V16HImode: - if (TARGET_AVX) - { - tmp = gen_reg_rtx (V8HImode); - if (elt < 8) - emit_insn (gen_vec_extract_lo_v16hi (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v16hi (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 7); - return; - } - break; - - case E_V8SImode: - if (TARGET_AVX) - { - tmp = gen_reg_rtx (V4SImode); - if (elt < 4) - emit_insn (gen_vec_extract_lo_v8si (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v8si (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 3); - return; - } - break; - - case E_V4DImode: - if (TARGET_AVX) - { - tmp = gen_reg_rtx (V2DImode); - if (elt < 2) - emit_insn (gen_vec_extract_lo_v4di (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v4di (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 1); - return; - } - break; - - case E_V32HImode: - if (TARGET_AVX512BW) - { - tmp = gen_reg_rtx (V16HImode); - if (elt < 16) - emit_insn (gen_vec_extract_lo_v32hi (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v32hi (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 15); - return; - } - break; - - case E_V64QImode: - if (TARGET_AVX512BW) - { - tmp = gen_reg_rtx (V32QImode); - if (elt < 32) - emit_insn (gen_vec_extract_lo_v64qi (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v64qi (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 31); - return; - } - break; - - case E_V16SFmode: - tmp = gen_reg_rtx (V8SFmode); - if (elt < 8) - emit_insn (gen_vec_extract_lo_v16sf (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v16sf (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 7); - return; - - case E_V8DFmode: - tmp = gen_reg_rtx (V4DFmode); - if (elt < 4) - emit_insn (gen_vec_extract_lo_v8df (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v8df (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 3); - return; - - case E_V16SImode: - tmp = gen_reg_rtx (V8SImode); - if (elt < 8) - emit_insn (gen_vec_extract_lo_v16si (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v16si (tmp, vec)); - ix86_expand_vector_extract (false, target, tmp, elt & 7); - return; - - case E_V8DImode: - tmp = gen_reg_rtx (V4DImode); - if (elt < 4) - emit_insn (gen_vec_extract_lo_v8di (tmp, vec)); - else - emit_insn (gen_vec_extract_hi_v8di (tmp, vec)); - 
ix86_expand_vector_extract (false, target, tmp, elt & 3); - return; - - case E_V8QImode: - /* ??? Could extract the appropriate HImode element and shift. */ - default: - break; - } - - if (use_vec_extr) - { - tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt))); - tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp); - - /* Let the rtl optimizers know about the zero extension performed. */ - if (inner_mode == QImode || inner_mode == HImode) - { - tmp = gen_rtx_ZERO_EXTEND (SImode, tmp); - target = gen_lowpart (SImode, target); - } - - emit_insn (gen_rtx_SET (target, tmp)); - } - else - { - rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode)); - - emit_move_insn (mem, vec); - - tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode)); - emit_move_insn (target, tmp); - } -} - -/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC - to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode. - The upper bits of DEST are undefined, though they shouldn't cause - exceptions (some bits from src or all zeros are ok). */ - -static void -emit_reduc_half (rtx dest, rtx src, int i) -{ - rtx tem, d = dest; - switch (GET_MODE (src)) - { - case E_V4SFmode: - if (i == 128) - tem = gen_sse_movhlps (dest, src, src); - else - tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx, - GEN_INT (1 + 4), GEN_INT (1 + 4)); - break; - case E_V2DFmode: - tem = gen_vec_interleave_highv2df (dest, src, src); - break; - case E_V16QImode: - case E_V8HImode: - case E_V4SImode: - case E_V2DImode: - d = gen_reg_rtx (V1TImode); - tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src), - GEN_INT (i / 2)); - break; - case E_V8SFmode: - if (i == 256) - tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx); - else - tem = gen_avx_shufps256 (dest, src, src, - GEN_INT (i == 128 ? 2 + (3 << 2) : 1)); - break; - case E_V4DFmode: - if (i == 256) - tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx); - else - tem = gen_avx_shufpd256 (dest, src, src, const1_rtx); - break; - case E_V32QImode: - case E_V16HImode: - case E_V8SImode: - case E_V4DImode: - if (i == 256) - { - if (GET_MODE (dest) != V4DImode) - d = gen_reg_rtx (V4DImode); - tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src), - gen_lowpart (V4DImode, src), - const1_rtx); - } - else - { - d = gen_reg_rtx (V2TImode); - tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src), - GEN_INT (i / 2)); - } - break; - case E_V64QImode: - case E_V32HImode: - case E_V16SImode: - case E_V16SFmode: - case E_V8DImode: - case E_V8DFmode: - if (i > 128) - tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest), - gen_lowpart (V16SImode, src), - gen_lowpart (V16SImode, src), - GEN_INT (0x4 + (i == 512 ? 4 : 0)), - GEN_INT (0x5 + (i == 512 ? 4 : 0)), - GEN_INT (0x6 + (i == 512 ? 4 : 0)), - GEN_INT (0x7 + (i == 512 ? 4 : 0)), - GEN_INT (0xC), GEN_INT (0xD), - GEN_INT (0xE), GEN_INT (0xF), - GEN_INT (0x10), GEN_INT (0x11), - GEN_INT (0x12), GEN_INT (0x13), - GEN_INT (0x14), GEN_INT (0x15), - GEN_INT (0x16), GEN_INT (0x17)); - else - tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest), - gen_lowpart (V16SImode, src), - GEN_INT (i == 128 ? 0x2 : 0x1), - GEN_INT (0x3), - GEN_INT (0x3), - GEN_INT (0x3), - GEN_INT (i == 128 ? 0x6 : 0x5), - GEN_INT (0x7), - GEN_INT (0x7), - GEN_INT (0x7), - GEN_INT (i == 128 ? 0xA : 0x9), - GEN_INT (0xB), - GEN_INT (0xB), - GEN_INT (0xB), - GEN_INT (i == 128 ? 
0xE : 0xD), - GEN_INT (0xF), - GEN_INT (0xF), - GEN_INT (0xF)); - break; - default: - gcc_unreachable (); - } - emit_insn (tem); - if (d != dest) - emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d)); -} - -/* Expand a vector reduction. FN is the binary pattern to reduce; - DEST is the destination; IN is the input vector. */ - -void -ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in) -{ - rtx half, dst, vec = in; - machine_mode mode = GET_MODE (in); - int i; - - /* SSE4 has a special instruction for V8HImode UMIN reduction. */ - if (TARGET_SSE4_1 - && mode == V8HImode - && fn == gen_uminv8hi3) - { - emit_insn (gen_sse4_1_phminposuw (dest, in)); - return; - } - - for (i = GET_MODE_BITSIZE (mode); - i > GET_MODE_UNIT_BITSIZE (mode); - i >>= 1) - { - half = gen_reg_rtx (mode); - emit_reduc_half (half, vec, i); - if (i == GET_MODE_UNIT_BITSIZE (mode) * 2) - dst = dest; - else - dst = gen_reg_rtx (mode); - emit_insn (fn (dst, half, vec)); - vec = dst; - } -} - -/* Target hook for scalar_mode_supported_p. */ -static bool -ix86_scalar_mode_supported_p (scalar_mode mode) -{ - if (DECIMAL_FLOAT_MODE_P (mode)) - return default_decimal_float_supported_p (); - else if (mode == TFmode) - return true; - else - return default_scalar_mode_supported_p (mode); -} - -/* Implements target hook vector_mode_supported_p. */ -static bool -ix86_vector_mode_supported_p (machine_mode mode) -{ - if (TARGET_SSE && VALID_SSE_REG_MODE (mode)) - return true; - if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode)) - return true; - if (TARGET_AVX && VALID_AVX256_REG_MODE (mode)) - return true; - if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode)) - return true; - if (TARGET_MMX && VALID_MMX_REG_MODE (mode)) - return true; - if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode)) - return true; - return false; -} - -/* Target hook for c_mode_for_suffix. */ -static machine_mode -ix86_c_mode_for_suffix (char suffix) -{ - if (suffix == 'q') - return TFmode; - if (suffix == 'w') - return XFmode; - - return VOIDmode; -} - -/* Worker function for TARGET_MD_ASM_ADJUST. - - We implement asm flag outputs, and maintain source compatibility - with the old cc0-based compiler. 
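   For reference, a flag-output constraint as handled below looks roughly like this in user code (the function and variable names are purely illustrative); the "=@ccc" output is the 'c' case, mapped to CCCmode with an EQ test:

     static inline unsigned int
     add_with_carry (unsigned int a, unsigned int b, int *carry)
     {
       unsigned int sum;
       asm ("addl %3, %0"
            : "=r" (sum), "=@ccc" (*carry)
            : "0" (a), "r" (b));
       return sum;
     }

   With the 'n' prefix (e.g. "=@ccnc") the parsed condition is reversed before use.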
*/ - -static rtx_insn * -ix86_md_asm_adjust (vec &outputs, vec &/*inputs*/, - vec &constraints, - vec &clobbers, HARD_REG_SET &clobbered_regs) -{ - bool saw_asm_flag = false; - - start_sequence (); - for (unsigned i = 0, n = outputs.length (); i < n; ++i) - { - const char *con = constraints[i]; - if (strncmp (con, "=@cc", 4) != 0) - continue; - con += 4; - if (strchr (con, ',') != NULL) - { - error ("alternatives not allowed in asm flag output"); - continue; - } - - bool invert = false; - if (con[0] == 'n') - invert = true, con++; - - machine_mode mode = CCmode; - rtx_code code = UNKNOWN; - - switch (con[0]) - { - case 'a': - if (con[1] == 0) - mode = CCAmode, code = EQ; - else if (con[1] == 'e' && con[2] == 0) - mode = CCCmode, code = NE; - break; - case 'b': - if (con[1] == 0) - mode = CCCmode, code = EQ; - else if (con[1] == 'e' && con[2] == 0) - mode = CCAmode, code = NE; - break; - case 'c': - if (con[1] == 0) - mode = CCCmode, code = EQ; - break; - case 'e': - if (con[1] == 0) - mode = CCZmode, code = EQ; - break; - case 'g': - if (con[1] == 0) - mode = CCGCmode, code = GT; - else if (con[1] == 'e' && con[2] == 0) - mode = CCGCmode, code = GE; - break; - case 'l': - if (con[1] == 0) - mode = CCGCmode, code = LT; - else if (con[1] == 'e' && con[2] == 0) - mode = CCGCmode, code = LE; - break; - case 'o': - if (con[1] == 0) - mode = CCOmode, code = EQ; - break; - case 'p': - if (con[1] == 0) - mode = CCPmode, code = EQ; - break; - case 's': - if (con[1] == 0) - mode = CCSmode, code = EQ; - break; - case 'z': - if (con[1] == 0) - mode = CCZmode, code = EQ; - break; - } - if (code == UNKNOWN) - { - error ("unknown asm flag output %qs", constraints[i]); - continue; - } - if (invert) - code = reverse_condition (code); - - rtx dest = outputs[i]; - if (!saw_asm_flag) - { - /* This is the first asm flag output. Here we put the flags - register in as the real output and adjust the condition to - allow it. */ - constraints[i] = "=Bf"; - outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG); - saw_asm_flag = true; - } - else - { - /* We don't need the flags register as output twice. */ - constraints[i] = "=X"; - outputs[i] = gen_rtx_SCRATCH (SImode); - } - - rtx x = gen_rtx_REG (mode, FLAGS_REG); - x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx); - - machine_mode dest_mode = GET_MODE (dest); - if (!SCALAR_INT_MODE_P (dest_mode)) - { - error ("invalid type for asm flag output"); - continue; - } - - if (dest_mode == DImode && !TARGET_64BIT) - dest_mode = SImode; - - if (dest_mode != QImode) - { - rtx destqi = gen_reg_rtx (QImode); - emit_insn (gen_rtx_SET (destqi, x)); - - if (TARGET_ZERO_EXTEND_WITH_AND - && optimize_function_for_speed_p (cfun)) - { - x = force_reg (dest_mode, const0_rtx); - - emit_insn (gen_movstrictqi - (gen_lowpart (QImode, x), destqi)); - } - else - x = gen_rtx_ZERO_EXTEND (dest_mode, destqi); - } - - if (dest_mode != GET_MODE (dest)) - { - rtx tmp = gen_reg_rtx (SImode); - - emit_insn (gen_rtx_SET (tmp, x)); - emit_insn (gen_zero_extendsidi2 (dest, tmp)); - } - else - emit_insn (gen_rtx_SET (dest, x)); - } - rtx_insn *seq = get_insns (); - end_sequence (); - - if (saw_asm_flag) - return seq; - else - { - /* If we had no asm flag outputs, clobber the flags. */ - clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG)); - SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG); - return NULL; - } -} - -/* Implements target vector targetm.asm.encode_section_info. 
*/ - -static void ATTRIBUTE_UNUSED -ix86_encode_section_info (tree decl, rtx rtl, int first) -{ - default_encode_section_info (decl, rtl, first); - - if (ix86_in_large_data_p (decl)) - SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR; -} - -/* Worker function for REVERSE_CONDITION. */ - -enum rtx_code -ix86_reverse_condition (enum rtx_code code, machine_mode mode) -{ - return (mode == CCFPmode - ? reverse_condition_maybe_unordered (code) - : reverse_condition (code)); -} - -/* Output code to perform an x87 FP register move, from OPERANDS[1] - to OPERANDS[0]. */ - -const char * -output_387_reg_move (rtx_insn *insn, rtx *operands) -{ - if (REG_P (operands[0])) - { - if (REG_P (operands[1]) - && find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) - { - if (REGNO (operands[0]) == FIRST_STACK_REG) - return output_387_ffreep (operands, 0); - return "fstp\t%y0"; - } - if (STACK_TOP_P (operands[0])) - return "fld%Z1\t%y1"; - return "fst\t%y0"; - } - else if (MEM_P (operands[0])) - { - gcc_assert (REG_P (operands[1])); - if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) - return "fstp%Z0\t%y0"; - else - { - /* There is no non-popping store to memory for XFmode. - So if we need one, follow the store with a load. */ - if (GET_MODE (operands[0]) == XFmode) - return "fstp%Z0\t%y0\n\tfld%Z0\t%y0"; - else - return "fst%Z0\t%y0"; - } - } - else - gcc_unreachable(); -} - -/* Output code to perform a conditional jump to LABEL, if C2 flag in - FP status register is set. */ - -void -ix86_emit_fp_unordered_jump (rtx label) -{ - rtx reg = gen_reg_rtx (HImode); - rtx_insn *insn; - rtx temp; - - emit_insn (gen_x86_fnstsw_1 (reg)); - - if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ())) - { - emit_insn (gen_x86_sahf_1 (reg)); - - temp = gen_rtx_REG (CCmode, FLAGS_REG); - temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx); - } - else - { - emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04))); - - temp = gen_rtx_REG (CCNOmode, FLAGS_REG); - temp = gen_rtx_NE (VOIDmode, temp, const0_rtx); - } - - temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp, - gen_rtx_LABEL_REF (VOIDmode, label), - pc_rtx); - insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp)); - predict_jump (REG_BR_PROB_BASE * 10 / 100); - JUMP_LABEL (insn) = label; -} - -/* Output code to perform an sinh XFmode calculation. 
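   For reference, a sketch of the identity the sequence below relies on,
   writing e1 = expm1 (|x|), so that e^|x| = e1 + 1 and e^-|x| = 1 / (e1 + 1):

     sinh (|x|) = (e^|x| - e^-|x|) / 2 = 0.5 * (e1 + e1 / (e1 + 1))

   which is the e2 = e1 / (e1 + 1.0) + e1 term computed below, scaled by 0.5
   and given the sign of the input via the fxam sign-bit test.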
*/ - -void ix86_emit_i387_sinh (rtx op0, rtx op1) -{ - rtx e1 = gen_reg_rtx (XFmode); - rtx e2 = gen_reg_rtx (XFmode); - rtx scratch = gen_reg_rtx (HImode); - rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); - rtx half = const_double_from_real_value (dconsthalf, XFmode); - rtx cst1, tmp; - rtx_code_label *jump_label = gen_label_rtx (); - rtx_insn *insn; - - /* scratch = fxam (op1) */ - emit_insn (gen_fxamxf2_i387 (scratch, op1)); - - /* e1 = expm1 (|op1|) */ - emit_insn (gen_absxf2 (e2, op1)); - emit_insn (gen_expm1xf2 (e1, e2)); - - /* e2 = e1 / (e1 + 1.0) + e1 */ - cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); - emit_insn (gen_addxf3 (e2, e1, cst1)); - emit_insn (gen_divxf3 (e2, e1, e2)); - emit_insn (gen_addxf3 (e2, e2, e1)); - - /* flags = signbit (op1) */ - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); - - /* if (flags) then e2 = -e2 */ - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, - gen_rtx_EQ (VOIDmode, flags, const0_rtx), - gen_rtx_LABEL_REF (VOIDmode, jump_label), - pc_rtx); - insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - predict_jump (REG_BR_PROB_BASE * 50 / 100); - JUMP_LABEL (insn) = jump_label; - - emit_insn (gen_negxf2 (e2, e2)); - - emit_label (jump_label); - LABEL_NUSES (jump_label) = 1; - - /* op0 = 0.5 * e2 */ - half = force_reg (XFmode, half); - emit_insn (gen_mulxf3 (op0, e2, half)); -} - -/* Output code to perform an cosh XFmode calculation. */ - -void ix86_emit_i387_cosh (rtx op0, rtx op1) -{ - rtx e1 = gen_reg_rtx (XFmode); - rtx e2 = gen_reg_rtx (XFmode); - rtx half = const_double_from_real_value (dconsthalf, XFmode); - rtx cst1; - - /* e1 = exp (op1) */ - emit_insn (gen_expxf2 (e1, op1)); - - /* e2 = e1 + 1.0 / e1 */ - cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); - emit_insn (gen_divxf3 (e2, cst1, e1)); - emit_insn (gen_addxf3 (e2, e1, e2)); - - /* op0 = 0.5 * e2 */ - half = force_reg (XFmode, half); - emit_insn (gen_mulxf3 (op0, e2, half)); -} - -/* Output code to perform an tanh XFmode calculation. */ - -void ix86_emit_i387_tanh (rtx op0, rtx op1) -{ - rtx e1 = gen_reg_rtx (XFmode); - rtx e2 = gen_reg_rtx (XFmode); - rtx scratch = gen_reg_rtx (HImode); - rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); - rtx cst2, tmp; - rtx_code_label *jump_label = gen_label_rtx (); - rtx_insn *insn; - - /* scratch = fxam (op1) */ - emit_insn (gen_fxamxf2_i387 (scratch, op1)); - - /* e1 = expm1 (-|2 * op1|) */ - emit_insn (gen_addxf3 (e2, op1, op1)); - emit_insn (gen_absxf2 (e2, e2)); - emit_insn (gen_negxf2 (e2, e2)); - emit_insn (gen_expm1xf2 (e1, e2)); - - /* e2 = e1 / (e1 + 2.0) */ - cst2 = force_reg (XFmode, CONST2_RTX (XFmode)); - emit_insn (gen_addxf3 (e2, e1, cst2)); - emit_insn (gen_divxf3 (e2, e1, e2)); - - /* flags = signbit (op1) */ - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); - - /* if (!flags) then e2 = -e2 */ - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, - gen_rtx_NE (VOIDmode, flags, const0_rtx), - gen_rtx_LABEL_REF (VOIDmode, jump_label), - pc_rtx); - insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - predict_jump (REG_BR_PROB_BASE * 50 / 100); - JUMP_LABEL (insn) = jump_label; - - emit_insn (gen_negxf2 (e2, e2)); - - emit_label (jump_label); - LABEL_NUSES (jump_label) = 1; - - emit_move_insn (op0, e2); -} - -/* Output code to perform an asinh XFmode calculation. 
*/ - -void ix86_emit_i387_asinh (rtx op0, rtx op1) -{ - rtx e1 = gen_reg_rtx (XFmode); - rtx e2 = gen_reg_rtx (XFmode); - rtx scratch = gen_reg_rtx (HImode); - rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); - rtx cst1, tmp; - rtx_code_label *jump_label = gen_label_rtx (); - rtx_insn *insn; - - /* e2 = sqrt (op1^2 + 1.0) + 1.0 */ - emit_insn (gen_mulxf3 (e1, op1, op1)); - cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); - emit_insn (gen_addxf3 (e2, e1, cst1)); - emit_insn (gen_sqrtxf2 (e2, e2)); - emit_insn (gen_addxf3 (e2, e2, cst1)); - - /* e1 = e1 / e2 */ - emit_insn (gen_divxf3 (e1, e1, e2)); - - /* scratch = fxam (op1) */ - emit_insn (gen_fxamxf2_i387 (scratch, op1)); - - /* e1 = e1 + |op1| */ - emit_insn (gen_absxf2 (e2, op1)); - emit_insn (gen_addxf3 (e1, e1, e2)); - - /* e2 = log1p (e1) */ - ix86_emit_i387_log1p (e2, e1); - - /* flags = signbit (op1) */ - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); - - /* if (flags) then e2 = -e2 */ - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, - gen_rtx_EQ (VOIDmode, flags, const0_rtx), - gen_rtx_LABEL_REF (VOIDmode, jump_label), - pc_rtx); - insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - predict_jump (REG_BR_PROB_BASE * 50 / 100); - JUMP_LABEL (insn) = jump_label; - - emit_insn (gen_negxf2 (e2, e2)); - - emit_label (jump_label); - LABEL_NUSES (jump_label) = 1; - - emit_move_insn (op0, e2); -} - -/* Output code to perform an acosh XFmode calculation. */ - -void ix86_emit_i387_acosh (rtx op0, rtx op1) -{ - rtx e1 = gen_reg_rtx (XFmode); - rtx e2 = gen_reg_rtx (XFmode); - rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); - - /* e2 = sqrt (op1 + 1.0) */ - emit_insn (gen_addxf3 (e2, op1, cst1)); - emit_insn (gen_sqrtxf2 (e2, e2)); - - /* e1 = sqrt (op1 - 1.0) */ - emit_insn (gen_subxf3 (e1, op1, cst1)); - emit_insn (gen_sqrtxf2 (e1, e1)); - - /* e1 = e1 * e2 */ - emit_insn (gen_mulxf3 (e1, e1, e2)); - - /* e1 = e1 + op1 */ - emit_insn (gen_addxf3 (e1, e1, op1)); - - /* op0 = log (e1) */ - emit_insn (gen_logxf2 (op0, e1)); -} - -/* Output code to perform an atanh XFmode calculation. */ - -void ix86_emit_i387_atanh (rtx op0, rtx op1) -{ - rtx e1 = gen_reg_rtx (XFmode); - rtx e2 = gen_reg_rtx (XFmode); - rtx scratch = gen_reg_rtx (HImode); - rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); - rtx half = const_double_from_real_value (dconsthalf, XFmode); - rtx cst1, tmp; - rtx_code_label *jump_label = gen_label_rtx (); - rtx_insn *insn; - - /* scratch = fxam (op1) */ - emit_insn (gen_fxamxf2_i387 (scratch, op1)); - - /* e2 = |op1| */ - emit_insn (gen_absxf2 (e2, op1)); - - /* e1 = -(e2 + e2) / (e2 + 1.0) */ - cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); - emit_insn (gen_addxf3 (e1, e2, cst1)); - emit_insn (gen_addxf3 (e2, e2, e2)); - emit_insn (gen_negxf2 (e2, e2)); - emit_insn (gen_divxf3 (e1, e2, e1)); - - /* e2 = log1p (e1) */ - ix86_emit_i387_log1p (e2, e1); - - /* flags = signbit (op1) */ - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); - - /* if (!flags) then e2 = -e2 */ - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, - gen_rtx_NE (VOIDmode, flags, const0_rtx), - gen_rtx_LABEL_REF (VOIDmode, jump_label), - pc_rtx); - insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - predict_jump (REG_BR_PROB_BASE * 50 / 100); - JUMP_LABEL (insn) = jump_label; - - emit_insn (gen_negxf2 (e2, e2)); - - emit_label (jump_label); - LABEL_NUSES (jump_label) = 1; - - /* op0 = 0.5 * e2 */ - half = force_reg (XFmode, half); - emit_insn (gen_mulxf3 (op0, e2, half)); -} - -/* Output code to perform a log1p XFmode calculation. 
*/ - -void ix86_emit_i387_log1p (rtx op0, rtx op1) -{ - rtx_code_label *label1 = gen_label_rtx (); - rtx_code_label *label2 = gen_label_rtx (); - - rtx tmp = gen_reg_rtx (XFmode); - rtx res = gen_reg_rtx (XFmode); - rtx cst, cstln2, cst1; - rtx_insn *insn; - - cst = const_double_from_real_value - (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode); - cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */ - - emit_insn (gen_absxf2 (tmp, op1)); - - cst = force_reg (XFmode, cst); - ix86_expand_branch (GE, tmp, cst, label1); - predict_jump (REG_BR_PROB_BASE * 10 / 100); - insn = get_last_insn (); - JUMP_LABEL (insn) = label1; - - emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2)); - emit_jump (label2); - - emit_label (label1); - LABEL_NUSES (label1) = 1; - - cst1 = force_reg (XFmode, CONST1_RTX (XFmode)); - emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1))); - emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2)); - - emit_label (label2); - LABEL_NUSES (label2) = 1; - - emit_move_insn (op0, res); -} - -/* Emit code for round calculation. */ -void ix86_emit_i387_round (rtx op0, rtx op1) -{ - machine_mode inmode = GET_MODE (op1); - machine_mode outmode = GET_MODE (op0); - rtx e1 = gen_reg_rtx (XFmode); - rtx e2 = gen_reg_rtx (XFmode); - rtx scratch = gen_reg_rtx (HImode); - rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG); - rtx half = const_double_from_real_value (dconsthalf, XFmode); - rtx res = gen_reg_rtx (outmode); - rtx_code_label *jump_label = gen_label_rtx (); - rtx (*floor_insn) (rtx, rtx); - rtx (*neg_insn) (rtx, rtx); - rtx_insn *insn; - rtx tmp; - - switch (inmode) - { - case E_SFmode: - case E_DFmode: - tmp = gen_reg_rtx (XFmode); - - emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1))); - op1 = tmp; - break; - case E_XFmode: - break; - default: - gcc_unreachable (); - } - - switch (outmode) - { - case E_SFmode: - floor_insn = gen_frndintxf2_floor; - neg_insn = gen_negsf2; - break; - case E_DFmode: - floor_insn = gen_frndintxf2_floor; - neg_insn = gen_negdf2; - break; - case E_XFmode: - floor_insn = gen_frndintxf2_floor; - neg_insn = gen_negxf2; - break; - case E_HImode: - floor_insn = gen_lfloorxfhi2; - neg_insn = gen_neghi2; - break; - case E_SImode: - floor_insn = gen_lfloorxfsi2; - neg_insn = gen_negsi2; - break; - case E_DImode: - floor_insn = gen_lfloorxfdi2; - neg_insn = gen_negdi2; - break; - default: - gcc_unreachable (); - } - - /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */ - - /* scratch = fxam(op1) */ - emit_insn (gen_fxamxf2_i387 (scratch, op1)); - - /* e1 = fabs(op1) */ - emit_insn (gen_absxf2 (e1, op1)); - - /* e2 = e1 + 0.5 */ - half = force_reg (XFmode, half); - emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half))); - - /* res = floor(e2) */ - switch (outmode) - { - case E_SFmode: - case E_DFmode: - { - tmp = gen_reg_rtx (XFmode); - - emit_insn (floor_insn (tmp, e2)); - emit_insn (gen_rtx_SET (res, - gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp), - UNSPEC_TRUNC_NOOP))); - } - break; - default: - emit_insn (floor_insn (res, e2)); - } - - /* flags = signbit(a) */ - emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02))); - - /* if (flags) then res = -res */ - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, - gen_rtx_EQ (VOIDmode, flags, const0_rtx), - gen_rtx_LABEL_REF (VOIDmode, jump_label), - pc_rtx); - insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - predict_jump (REG_BR_PROB_BASE * 50 / 100); - JUMP_LABEL (insn) = jump_label; - - emit_insn (neg_insn (res, res)); - - emit_label (jump_label); - 
LABEL_NUSES (jump_label) = 1; - - emit_move_insn (op0, res); -} - -/* Output code to perform a Newton-Rhapson approximation of a single precision - floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */ - -void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode) -{ - rtx x0, x1, e0, e1; - - x0 = gen_reg_rtx (mode); - e0 = gen_reg_rtx (mode); - e1 = gen_reg_rtx (mode); - x1 = gen_reg_rtx (mode); - - /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */ - - b = force_reg (mode, b); - - /* x0 = rcp(b) estimate */ - if (mode == V16SFmode || mode == V8DFmode) - { - if (TARGET_AVX512ER) - { - emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), - UNSPEC_RCP28))); - /* res = a * x0 */ - emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0))); - return; - } - else - emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), - UNSPEC_RCP14))); - } - else - emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), - UNSPEC_RCP))); - - /* e0 = x0 * b */ - emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b))); - - /* e0 = x0 * e0 */ - emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0))); - - /* e1 = x0 + x0 */ - emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0))); - - /* x1 = e1 - e0 */ - emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0))); - - /* res = a * x1 */ - emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1))); -} - -/* Output code to perform a Newton-Rhapson approximation of a - single precision floating point [reciprocal] square root. */ - -void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip) -{ - rtx x0, e0, e1, e2, e3, mthree, mhalf; - REAL_VALUE_TYPE r; - int unspec; - - x0 = gen_reg_rtx (mode); - e0 = gen_reg_rtx (mode); - e1 = gen_reg_rtx (mode); - e2 = gen_reg_rtx (mode); - e3 = gen_reg_rtx (mode); - - if (TARGET_AVX512ER && mode == V16SFmode) - { - if (recip) - /* res = rsqrt28(a) estimate */ - emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), - UNSPEC_RSQRT28))); - else - { - /* x0 = rsqrt28(a) estimate */ - emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), - UNSPEC_RSQRT28))); - /* res = rcp28(x0) estimate */ - emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0), - UNSPEC_RCP28))); - } - return; - } - - real_from_integer (&r, VOIDmode, -3, SIGNED); - mthree = const_double_from_real_value (r, SFmode); - - real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL); - mhalf = const_double_from_real_value (r, SFmode); - unspec = UNSPEC_RSQRT; - - if (VECTOR_MODE_P (mode)) - { - mthree = ix86_build_const_vector (mode, true, mthree); - mhalf = ix86_build_const_vector (mode, true, mhalf); - /* There is no 512-bit rsqrt. There is however rsqrt14. */ - if (GET_MODE_SIZE (mode) == 64) - unspec = UNSPEC_RSQRT14; - } - - /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) - rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */ - - a = force_reg (mode, a); - - /* x0 = rsqrt(a) estimate */ - emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), - unspec))); - - /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */ - if (!recip) - { - rtx zero = force_reg (mode, CONST0_RTX(mode)); - rtx mask; - - /* Handle masked compare. */ - if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64) - { - mask = gen_reg_rtx (HImode); - /* Imm value 0x4 corresponds to not-equal comparison. 
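   For reference, both software sequences in this block are a single
   Newton-Raphson refinement step around the hardware estimate x0:

     reciprocal:  x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0 * x0)
     rsqrt:       x1 = 0.5 * x0 * (3 - a * x0 * x0)
     sqrt:        a * rsqrt (a), refined the same way with e0 = a * x0

   which is how the e0/e1/e2/e3 temporaries in ix86_emit_swdivsf and
   ix86_emit_swsqrtsf pair up with the expressions in the comments.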
*/ - emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4))); - emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask)); - } - else - { - mask = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a))); - emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask))); + default: + break; } } - /* e0 = x0 * a */ - emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a))); - /* e1 = e0 * x0 */ - emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0))); - - /* e2 = e1 - 3. */ - mthree = force_reg (mode, mthree); - emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree))); +#ifdef SUBTARGET_FOLD_BUILTIN + return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore); +#endif - mhalf = force_reg (mode, mhalf); - if (recip) - /* e3 = -.5 * x0 */ - emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf))); - else - /* e3 = -.5 * e0 */ - emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf))); - /* ret = e2 * e3 */ - emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3))); + return NULL_TREE; } -#ifdef TARGET_SOLARIS -/* Solaris implementation of TARGET_ASM_NAMED_SECTION. */ +/* Fold a MD builtin (use ix86_fold_builtin for folding into + constant) in GIMPLE. */ -static void -i386_solaris_elf_named_section (const char *name, unsigned int flags, - tree decl) +bool +ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi) { - /* With Binutils 2.15, the "@unwind" marker must be specified on - every occurrence of the ".eh_frame" section, not just the first - one. */ - if (TARGET_64BIT - && strcmp (name, ".eh_frame") == 0) - { - fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name, - flags & SECTION_WRITE ? "aw" : "a"); - return; - } + gimple *stmt = gsi_stmt (*gsi); + tree fndecl = gimple_call_fndecl (stmt); + gcc_checking_assert (fndecl && fndecl_built_in_p (fndecl, BUILT_IN_MD)); + int n_args = gimple_call_num_args (stmt); + enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl); + tree decl = NULL_TREE; + tree arg0, arg1; + enum rtx_code rcode; + unsigned HOST_WIDE_INT count; + bool is_vshift; -#ifndef USE_GAS - if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE) + switch (fn_code) { - solaris_elf_asm_comdat_section (name, flags, decl); - return; - } + case IX86_BUILTIN_TZCNT32: + decl = builtin_decl_implicit (BUILT_IN_CTZ); + goto fold_tzcnt_lzcnt; - /* Solaris/x86 as uses the same syntax for the SHF_EXCLUDE flags as the - SPARC assembler. One cannot mix single-letter flags and #exclude, so - only emit the latter here. */ - if (flags & SECTION_EXCLUDE) - { - fprintf (asm_out_file, "\t.section\t%s,#exclude\n", name); - return; - } -#endif + case IX86_BUILTIN_TZCNT64: + decl = builtin_decl_implicit (BUILT_IN_CTZLL); + goto fold_tzcnt_lzcnt; - default_elf_asm_named_section (name, flags, decl); -} -#endif /* TARGET_SOLARIS */ + case IX86_BUILTIN_LZCNT32: + decl = builtin_decl_implicit (BUILT_IN_CLZ); + goto fold_tzcnt_lzcnt; -/* Return the mangling of TYPE if it is an extended fundamental type. */ + case IX86_BUILTIN_LZCNT64: + decl = builtin_decl_implicit (BUILT_IN_CLZLL); + goto fold_tzcnt_lzcnt; -static const char * -ix86_mangle_type (const_tree type) -{ - type = TYPE_MAIN_VARIANT (type); + fold_tzcnt_lzcnt: + gcc_assert (n_args == 1); + arg0 = gimple_call_arg (stmt, 0); + if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt)) + { + int prec = TYPE_PRECISION (TREE_TYPE (arg0)); + /* If arg0 is provably non-zero, optimize into generic + __builtin_c[tl]z{,ll} function the middle-end handles + better. 
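   A small example of when this fold fires, assuming the BMI intrinsic
   spelling from <immintrin.h>:

     #include <immintrin.h>

     unsigned
     f (unsigned x)
     {
       return _tzcnt_u32 (x | 1);
     }

   Since x | 1 can be proven non-zero, the call is rewritten into
   __builtin_ctz, which the middle-end optimizes more readily; for a
   possibly-zero argument the builtin is left untouched so the defined
   tzcnt (0) == 32 result is preserved.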
*/ + if (!expr_not_equal_to (arg0, wi::zero (prec))) + return false; - if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE - && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE) - return NULL; + location_t loc = gimple_location (stmt); + gimple *g = gimple_build_call (decl, 1, arg0); + gimple_set_location (g, loc); + tree lhs = make_ssa_name (integer_type_node); + gimple_call_set_lhs (g, lhs); + gsi_insert_before (gsi, g, GSI_SAME_STMT); + g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs); + gimple_set_location (g, loc); + gsi_replace (gsi, g, false); + return true; + } + break; - switch (TYPE_MODE (type)) - { - case E_TFmode: - /* __float128 is "g". */ - return "g"; - case E_XFmode: - /* "long double" or __float80 is "e". */ - return "e"; - default: - return NULL; - } -} + case IX86_BUILTIN_BZHI32: + case IX86_BUILTIN_BZHI64: + gcc_assert (n_args == 2); + arg1 = gimple_call_arg (stmt, 1); + if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt)) + { + unsigned int idx = tree_to_uhwi (arg1) & 0xff; + arg0 = gimple_call_arg (stmt, 0); + if (idx < TYPE_PRECISION (TREE_TYPE (arg0))) + break; + location_t loc = gimple_location (stmt); + gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0); + gimple_set_location (g, loc); + gsi_replace (gsi, g, false); + return true; + } + break; -static GTY(()) tree ix86_tls_stack_chk_guard_decl; + case IX86_BUILTIN_PDEP32: + case IX86_BUILTIN_PDEP64: + case IX86_BUILTIN_PEXT32: + case IX86_BUILTIN_PEXT64: + gcc_assert (n_args == 2); + arg1 = gimple_call_arg (stmt, 1); + if (integer_all_onesp (arg1) && gimple_call_lhs (stmt)) + { + location_t loc = gimple_location (stmt); + arg0 = gimple_call_arg (stmt, 0); + gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0); + gimple_set_location (g, loc); + gsi_replace (gsi, g, false); + return true; + } + break; -static tree -ix86_stack_protect_guard (void) -{ - if (TARGET_SSP_TLS_GUARD) - { - tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1); - int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg); - tree type = build_qualified_type (type_node, qual); - tree t; + case IX86_BUILTIN_PSLLD: + case IX86_BUILTIN_PSLLD128: + case IX86_BUILTIN_PSLLD128_MASK: + case IX86_BUILTIN_PSLLD256: + case IX86_BUILTIN_PSLLD256_MASK: + case IX86_BUILTIN_PSLLD512: + case IX86_BUILTIN_PSLLDI: + case IX86_BUILTIN_PSLLDI128: + case IX86_BUILTIN_PSLLDI128_MASK: + case IX86_BUILTIN_PSLLDI256: + case IX86_BUILTIN_PSLLDI256_MASK: + case IX86_BUILTIN_PSLLDI512: + case IX86_BUILTIN_PSLLQ: + case IX86_BUILTIN_PSLLQ128: + case IX86_BUILTIN_PSLLQ128_MASK: + case IX86_BUILTIN_PSLLQ256: + case IX86_BUILTIN_PSLLQ256_MASK: + case IX86_BUILTIN_PSLLQ512: + case IX86_BUILTIN_PSLLQI: + case IX86_BUILTIN_PSLLQI128: + case IX86_BUILTIN_PSLLQI128_MASK: + case IX86_BUILTIN_PSLLQI256: + case IX86_BUILTIN_PSLLQI256_MASK: + case IX86_BUILTIN_PSLLQI512: + case IX86_BUILTIN_PSLLW: + case IX86_BUILTIN_PSLLW128: + case IX86_BUILTIN_PSLLW128_MASK: + case IX86_BUILTIN_PSLLW256: + case IX86_BUILTIN_PSLLW256_MASK: + case IX86_BUILTIN_PSLLW512_MASK: + case IX86_BUILTIN_PSLLWI: + case IX86_BUILTIN_PSLLWI128: + case IX86_BUILTIN_PSLLWI128_MASK: + case IX86_BUILTIN_PSLLWI256: + case IX86_BUILTIN_PSLLWI256_MASK: + case IX86_BUILTIN_PSLLWI512_MASK: + rcode = ASHIFT; + is_vshift = false; + goto do_shift; + case IX86_BUILTIN_PSRAD: + case IX86_BUILTIN_PSRAD128: + case IX86_BUILTIN_PSRAD128_MASK: + case IX86_BUILTIN_PSRAD256: + case IX86_BUILTIN_PSRAD256_MASK: + case IX86_BUILTIN_PSRAD512: + 
case IX86_BUILTIN_PSRADI: + case IX86_BUILTIN_PSRADI128: + case IX86_BUILTIN_PSRADI128_MASK: + case IX86_BUILTIN_PSRADI256: + case IX86_BUILTIN_PSRADI256_MASK: + case IX86_BUILTIN_PSRADI512: + case IX86_BUILTIN_PSRAQ128_MASK: + case IX86_BUILTIN_PSRAQ256_MASK: + case IX86_BUILTIN_PSRAQ512: + case IX86_BUILTIN_PSRAQI128_MASK: + case IX86_BUILTIN_PSRAQI256_MASK: + case IX86_BUILTIN_PSRAQI512: + case IX86_BUILTIN_PSRAW: + case IX86_BUILTIN_PSRAW128: + case IX86_BUILTIN_PSRAW128_MASK: + case IX86_BUILTIN_PSRAW256: + case IX86_BUILTIN_PSRAW256_MASK: + case IX86_BUILTIN_PSRAW512: + case IX86_BUILTIN_PSRAWI: + case IX86_BUILTIN_PSRAWI128: + case IX86_BUILTIN_PSRAWI128_MASK: + case IX86_BUILTIN_PSRAWI256: + case IX86_BUILTIN_PSRAWI256_MASK: + case IX86_BUILTIN_PSRAWI512: + rcode = ASHIFTRT; + is_vshift = false; + goto do_shift; + case IX86_BUILTIN_PSRLD: + case IX86_BUILTIN_PSRLD128: + case IX86_BUILTIN_PSRLD128_MASK: + case IX86_BUILTIN_PSRLD256: + case IX86_BUILTIN_PSRLD256_MASK: + case IX86_BUILTIN_PSRLD512: + case IX86_BUILTIN_PSRLDI: + case IX86_BUILTIN_PSRLDI128: + case IX86_BUILTIN_PSRLDI128_MASK: + case IX86_BUILTIN_PSRLDI256: + case IX86_BUILTIN_PSRLDI256_MASK: + case IX86_BUILTIN_PSRLDI512: + case IX86_BUILTIN_PSRLQ: + case IX86_BUILTIN_PSRLQ128: + case IX86_BUILTIN_PSRLQ128_MASK: + case IX86_BUILTIN_PSRLQ256: + case IX86_BUILTIN_PSRLQ256_MASK: + case IX86_BUILTIN_PSRLQ512: + case IX86_BUILTIN_PSRLQI: + case IX86_BUILTIN_PSRLQI128: + case IX86_BUILTIN_PSRLQI128_MASK: + case IX86_BUILTIN_PSRLQI256: + case IX86_BUILTIN_PSRLQI256_MASK: + case IX86_BUILTIN_PSRLQI512: + case IX86_BUILTIN_PSRLW: + case IX86_BUILTIN_PSRLW128: + case IX86_BUILTIN_PSRLW128_MASK: + case IX86_BUILTIN_PSRLW256: + case IX86_BUILTIN_PSRLW256_MASK: + case IX86_BUILTIN_PSRLW512: + case IX86_BUILTIN_PSRLWI: + case IX86_BUILTIN_PSRLWI128: + case IX86_BUILTIN_PSRLWI128_MASK: + case IX86_BUILTIN_PSRLWI256: + case IX86_BUILTIN_PSRLWI256_MASK: + case IX86_BUILTIN_PSRLWI512: + rcode = LSHIFTRT; + is_vshift = false; + goto do_shift; + case IX86_BUILTIN_PSLLVV16HI: + case IX86_BUILTIN_PSLLVV16SI: + case IX86_BUILTIN_PSLLVV2DI: + case IX86_BUILTIN_PSLLVV2DI_MASK: + case IX86_BUILTIN_PSLLVV32HI: + case IX86_BUILTIN_PSLLVV4DI: + case IX86_BUILTIN_PSLLVV4DI_MASK: + case IX86_BUILTIN_PSLLVV4SI: + case IX86_BUILTIN_PSLLVV4SI_MASK: + case IX86_BUILTIN_PSLLVV8DI: + case IX86_BUILTIN_PSLLVV8HI: + case IX86_BUILTIN_PSLLVV8SI: + case IX86_BUILTIN_PSLLVV8SI_MASK: + rcode = ASHIFT; + is_vshift = true; + goto do_shift; + case IX86_BUILTIN_PSRAVQ128: + case IX86_BUILTIN_PSRAVQ256: + case IX86_BUILTIN_PSRAVV16HI: + case IX86_BUILTIN_PSRAVV16SI: + case IX86_BUILTIN_PSRAVV32HI: + case IX86_BUILTIN_PSRAVV4SI: + case IX86_BUILTIN_PSRAVV4SI_MASK: + case IX86_BUILTIN_PSRAVV8DI: + case IX86_BUILTIN_PSRAVV8HI: + case IX86_BUILTIN_PSRAVV8SI: + case IX86_BUILTIN_PSRAVV8SI_MASK: + rcode = ASHIFTRT; + is_vshift = true; + goto do_shift; + case IX86_BUILTIN_PSRLVV16HI: + case IX86_BUILTIN_PSRLVV16SI: + case IX86_BUILTIN_PSRLVV2DI: + case IX86_BUILTIN_PSRLVV2DI_MASK: + case IX86_BUILTIN_PSRLVV32HI: + case IX86_BUILTIN_PSRLVV4DI: + case IX86_BUILTIN_PSRLVV4DI_MASK: + case IX86_BUILTIN_PSRLVV4SI: + case IX86_BUILTIN_PSRLVV4SI_MASK: + case IX86_BUILTIN_PSRLVV8DI: + case IX86_BUILTIN_PSRLVV8HI: + case IX86_BUILTIN_PSRLVV8SI: + case IX86_BUILTIN_PSRLVV8SI_MASK: + rcode = LSHIFTRT; + is_vshift = true; + goto do_shift; - if (global_options_set.x_ix86_stack_protector_guard_symbol_str) + do_shift: + gcc_assert (n_args >= 2); + arg0 = gimple_call_arg (stmt, 0); + arg1 
= gimple_call_arg (stmt, 1); + if (n_args > 2) { - t = ix86_tls_stack_chk_guard_decl; - - if (t == NULL) - { - rtx x; - - t = build_decl - (UNKNOWN_LOCATION, VAR_DECL, - get_identifier (ix86_stack_protector_guard_symbol_str), - type); - TREE_STATIC (t) = 1; - TREE_PUBLIC (t) = 1; - DECL_EXTERNAL (t) = 1; - TREE_USED (t) = 1; - TREE_THIS_VOLATILE (t) = 1; - DECL_ARTIFICIAL (t) = 1; - DECL_IGNORED_P (t) = 1; - - /* Do not share RTL as the declaration is visible outside of - current function. */ - x = DECL_RTL (t); - RTX_FLAG (x, used) = 1; - - ix86_tls_stack_chk_guard_decl = t; - } + /* This is masked shift. Only optimize if the mask is all ones. */ + tree argl = gimple_call_arg (stmt, n_args - 1); + if (!tree_fits_uhwi_p (argl)) + break; + unsigned HOST_WIDE_INT mask = tree_to_uhwi (argl); + unsigned elems = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); + if ((mask | (HOST_WIDE_INT_M1U << elems)) != HOST_WIDE_INT_M1U) + break; } - else + if (is_vshift) { - tree asptrtype = build_pointer_type (type); - - t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset); - t = build2 (MEM_REF, asptrtype, t, - build_int_cst (asptrtype, 0)); - TREE_THIS_VOLATILE (t) = 1; + if (TREE_CODE (arg1) != VECTOR_CST) + break; + count = TYPE_PRECISION (TREE_TYPE (TREE_TYPE (arg0))); + if (integer_zerop (arg1)) + count = 0; + else if (rcode == ASHIFTRT) + break; + else + for (unsigned int i = 0; i < VECTOR_CST_NELTS (arg1); ++i) + { + tree elt = VECTOR_CST_ELT (arg1, i); + if (!wi::neg_p (wi::to_wide (elt)) + && wi::to_widest (elt) < count) + return false; + } } - - return t; - } - - return default_stack_protect_guard (); -} - -/* For 32-bit code we can save PIC register setup by using - __stack_chk_fail_local hidden function instead of calling - __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC - register, so it is better to call __stack_chk_fail directly. */ - -static tree ATTRIBUTE_UNUSED -ix86_stack_protect_fail (void) -{ - return TARGET_64BIT - ? default_external_stack_protect_fail () - : default_hidden_stack_protect_fail (); -} - -/* Select a format to encode pointers in exception handling data. CODE - is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is - true if the symbol may be affected by dynamic relocations. - - ??? All x86 object file formats are capable of representing this. - After all, the relocation needed is the same as for the call insn. - Whether or not a particular assembler allows us to enter such, I - guess we'll have to see. */ -int -asm_preferred_eh_data_format (int code, int global) -{ - if (flag_pic) - { - int type = DW_EH_PE_sdata8; - if (!TARGET_64BIT - || ix86_cmodel == CM_SMALL_PIC - || (ix86_cmodel == CM_MEDIUM_PIC && (global || code))) - type = DW_EH_PE_sdata4; - return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type; - } - if (ix86_cmodel == CM_SMALL - || (ix86_cmodel == CM_MEDIUM && code)) - return DW_EH_PE_udata4; - return DW_EH_PE_absptr; -} - -/* Expand copysign from SIGN to the positive value ABS_VALUE - storing in RESULT. If MASK is non-null, it shall be a mask to mask out - the sign-bit. 
*/ -static void -ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask) -{ - machine_mode mode = GET_MODE (sign); - rtx sgn = gen_reg_rtx (mode); - if (mask == NULL_RTX) - { - machine_mode vmode; - - if (mode == SFmode) - vmode = V4SFmode; - else if (mode == DFmode) - vmode = V2DFmode; else - vmode = mode; - - mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false); - if (!VECTOR_MODE_P (mode)) { - /* We need to generate a scalar mode mask in this case. */ - rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); - tmp = gen_rtx_VEC_SELECT (mode, mask, tmp); - mask = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (mask, tmp)); + arg1 = ix86_vector_shift_count (arg1); + if (!arg1) + break; + count = tree_to_uhwi (arg1); } - } - else - mask = gen_rtx_NOT (mode, mask); - emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign))); - emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn))); -} - -/* Expand fabs (OP0) and return a new rtx that holds the result. The - mask for masking out the sign-bit is stored in *SMASK, if that is - non-null. */ -static rtx -ix86_expand_sse_fabs (rtx op0, rtx *smask) -{ - machine_mode vmode, mode = GET_MODE (op0); - rtx xa, mask; + if (count == 0) + { + /* Just return the first argument for shift by 0. */ + location_t loc = gimple_location (stmt); + gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0); + gimple_set_location (g, loc); + gsi_replace (gsi, g, false); + return true; + } + if (rcode != ASHIFTRT + && count >= TYPE_PRECISION (TREE_TYPE (TREE_TYPE (arg0)))) + { + /* For shift counts equal or greater than precision, except for + arithmetic right shift the result is zero. */ + location_t loc = gimple_location (stmt); + gimple *g = gimple_build_assign (gimple_call_lhs (stmt), + build_zero_cst (TREE_TYPE (arg0))); + gimple_set_location (g, loc); + gsi_replace (gsi, g, false); + return true; + } + break; - xa = gen_reg_rtx (mode); - if (mode == SFmode) - vmode = V4SFmode; - else if (mode == DFmode) - vmode = V2DFmode; - else - vmode = mode; - mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true); - if (!VECTOR_MODE_P (mode)) - { - /* We need to generate a scalar mode mask in this case. */ - rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); - tmp = gen_rtx_VEC_SELECT (mode, mask, tmp); - mask = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (mask, tmp)); + default: + break; } - emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask))); - - if (smask) - *smask = mask; - - return xa; -} - -/* Expands a comparison of OP0 with OP1 using comparison code CODE, - swapping the operands if SWAP_OPERANDS is true. The expanded - code is a forward jump to a newly created label in case the - comparison is true. The generated label rtx is returned. 
*/ -static rtx_code_label * -ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1, - bool swap_operands) -{ - bool unordered_compare = ix86_unordered_fp_compare (code); - rtx_code_label *label; - rtx tmp, reg; - - if (swap_operands) - std::swap (op0, op1); - - label = gen_label_rtx (); - tmp = gen_rtx_COMPARE (CCFPmode, op0, op1); - if (unordered_compare) - tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP); - reg = gen_rtx_REG (CCFPmode, FLAGS_REG); - emit_insn (gen_rtx_SET (reg, tmp)); - tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx); - tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, - gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx); - tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp)); - JUMP_LABEL (tmp) = label; - - return label; -} - -/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1 - using comparison code CODE. Operands are swapped for the comparison if - SWAP_OPERANDS is true. Returns a rtx for the generated mask. */ -static rtx -ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1, - bool swap_operands) -{ - rtx (*insn)(rtx, rtx, rtx, rtx); - machine_mode mode = GET_MODE (op0); - rtx mask = gen_reg_rtx (mode); - - if (swap_operands) - std::swap (op0, op1); - - insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse; - - emit_insn (insn (mask, op0, op1, - gen_rtx_fmt_ee (code, mode, op0, op1))); - return mask; -} - -/* Generate and return a rtx of mode MODE for 2**n where n is the number - of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */ -static rtx -ix86_gen_TWO52 (machine_mode mode) -{ - REAL_VALUE_TYPE TWO52r; - rtx TWO52; - - real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23); - TWO52 = const_double_from_real_value (TWO52r, mode); - TWO52 = force_reg (mode, TWO52); - return TWO52; + return false; } -/* Expand SSE sequence for computing lround from OP1 storing - into OP0. */ -void -ix86_expand_lround (rtx op0, rtx op1) -{ - /* C code for the stuff we're doing below: - tmp = op1 + copysign (nextafter (0.5, 0.0), op1) - return (long)tmp; - */ - machine_mode mode = GET_MODE (op1); - const struct real_format *fmt; - REAL_VALUE_TYPE pred_half, half_minus_pred_half; - rtx adj; - - /* load nextafter (0.5, 0.0) */ - fmt = REAL_MODE_FORMAT (mode); - real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); - real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half); - - /* adj = copysign (0.5, op1) */ - adj = force_reg (mode, const_double_from_real_value (pred_half, mode)); - ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX); - - /* adj = op1 + adj */ - adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT); - - /* op0 = (imode)adj */ - expand_fix (op0, adj, 0); -} +/* Handler for an SVML-style interface to + a library with vectorized intrinsics. */ -/* Expand SSE2 sequence for computing lround from OPERAND1 storing - into OPERAND0. */ -void -ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor) +tree +ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in) { - /* C code for the stuff we're doing below (for do_floor): - xi = (long)op1; - xi -= (double)xi > op1 ? 
1 : 0; - return xi; - */ - machine_mode fmode = GET_MODE (op1); - machine_mode imode = GET_MODE (op0); - rtx ireg, freg, tmp; - rtx_code_label *label; - - /* reg = (long)op1 */ - ireg = gen_reg_rtx (imode); - expand_fix (ireg, op1, 0); - - /* freg = (double)reg */ - freg = gen_reg_rtx (fmode); - expand_float (freg, ireg, 0); - - /* ireg = (freg > op1) ? ireg - 1 : ireg */ - label = ix86_expand_sse_compare_and_jump (UNLE, - freg, op1, !do_floor); - tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS, - ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT); - emit_move_insn (ireg, tmp); - - emit_label (label); - LABEL_NUSES (label) = 1; + char name[20]; + tree fntype, new_fndecl, args; + unsigned arity; + const char *bname; + machine_mode el_mode, in_mode; + int n, in_n; - emit_move_insn (op0, ireg); -} + /* The SVML is suitable for unsafe math only. */ + if (!flag_unsafe_math_optimizations) + return NULL_TREE; -/* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */ -void -ix86_expand_rint (rtx operand0, rtx operand1) -{ - /* C code for the stuff we're doing below: - xa = fabs (operand1); - if (!isless (xa, 2**52)) - return operand1; - two52 = 2**52; - if (flag_rounding_math) - { - two52 = copysign (two52, operand1); - xa = operand1; - } - xa = xa + two52 - two52; - return copysign (xa, operand1); - */ - machine_mode mode = GET_MODE (operand0); - rtx res, xa, TWO52, two52, mask; - rtx_code_label *label; + el_mode = TYPE_MODE (TREE_TYPE (type_out)); + n = TYPE_VECTOR_SUBPARTS (type_out); + in_mode = TYPE_MODE (TREE_TYPE (type_in)); + in_n = TYPE_VECTOR_SUBPARTS (type_in); + if (el_mode != in_mode + || n != in_n) + return NULL_TREE; - res = gen_reg_rtx (mode); - emit_move_insn (res, operand1); + switch (fn) + { + CASE_CFN_EXP: + CASE_CFN_LOG: + CASE_CFN_LOG10: + CASE_CFN_POW: + CASE_CFN_TANH: + CASE_CFN_TAN: + CASE_CFN_ATAN: + CASE_CFN_ATAN2: + CASE_CFN_ATANH: + CASE_CFN_CBRT: + CASE_CFN_SINH: + CASE_CFN_SIN: + CASE_CFN_ASINH: + CASE_CFN_ASIN: + CASE_CFN_COSH: + CASE_CFN_COS: + CASE_CFN_ACOSH: + CASE_CFN_ACOS: + if ((el_mode != DFmode || n != 2) + && (el_mode != SFmode || n != 4)) + return NULL_TREE; + break; - /* xa = abs (operand1) */ - xa = ix86_expand_sse_fabs (res, &mask); + default: + return NULL_TREE; + } - /* if (!isless (xa, TWO52)) goto label; */ - TWO52 = ix86_gen_TWO52 (mode); - label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn); + bname = IDENTIFIER_POINTER (DECL_NAME (fndecl)); - two52 = TWO52; - if (flag_rounding_math) + if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF) + strcpy (name, "vmlsLn4"); + else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG) + strcpy (name, "vmldLn2"); + else if (n == 4) { - two52 = gen_reg_rtx (mode); - ix86_sse_copysign_to_positive (two52, TWO52, res, mask); - xa = res; + sprintf (name, "vmls%s", bname+10); + name[strlen (name)-1] = '4'; } + else + sprintf (name, "vmld%s2", bname+10); - xa = expand_simple_binop (mode, PLUS, xa, two52, NULL_RTX, 0, OPTAB_DIRECT); - xa = expand_simple_binop (mode, MINUS, xa, two52, xa, 0, OPTAB_DIRECT); + /* Convert to uppercase. 
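   This scheme yields names such as "vmlsSin4" for a 4-wide float sine and
   "vmldSin2" for a 2-wide double sine, following the prefix and suffix
   rules built up above.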
*/ + name[4] &= ~0x20; - ix86_sse_copysign_to_positive (res, xa, res, mask); + arity = 0; + for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args)) + arity++; - emit_label (label); - LABEL_NUSES (label) = 1; + if (arity == 1) + fntype = build_function_type_list (type_out, type_in, NULL); + else + fntype = build_function_type_list (type_out, type_in, type_in, NULL); + + /* Build a function declaration for the vectorized function. */ + new_fndecl = build_decl (BUILTINS_LOCATION, + FUNCTION_DECL, get_identifier (name), fntype); + TREE_PUBLIC (new_fndecl) = 1; + DECL_EXTERNAL (new_fndecl) = 1; + DECL_IS_NOVOPS (new_fndecl) = 1; + TREE_READONLY (new_fndecl) = 1; - emit_move_insn (operand0, res); + return new_fndecl; } -/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing - into OPERAND0. */ -void -ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor) -{ - /* C code for the stuff we expand below. - double xa = fabs (x), x2; - if (!isless (xa, TWO52)) - return x; - xa = xa + TWO52 - TWO52; - x2 = copysign (xa, x); - Compensate. Floor: - if (x2 > x) - x2 -= 1; - Compensate. Ceil: - if (x2 < x) - x2 += 1; - if (HONOR_SIGNED_ZEROS (mode)) - x2 = copysign (x2, x); - return x2; - */ - machine_mode mode = GET_MODE (operand0); - rtx xa, TWO52, tmp, one, res, mask; - rtx_code_label *label; +/* Handler for an ACML-style interface to + a library with vectorized intrinsics. */ - TWO52 = ix86_gen_TWO52 (mode); +tree +ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in) +{ + char name[20] = "__vr.._"; + tree fntype, new_fndecl, args; + unsigned arity; + const char *bname; + machine_mode el_mode, in_mode; + int n, in_n; - /* Temporary for holding the result, initialized to the input - operand to ease control flow. */ - res = gen_reg_rtx (mode); - emit_move_insn (res, operand1); + /* The ACML is 64bits only and suitable for unsafe math only as + it does not correctly support parts of IEEE with the required + precision such as denormals. */ + if (!TARGET_64BIT + || !flag_unsafe_math_optimizations) + return NULL_TREE; - /* xa = abs (operand1) */ - xa = ix86_expand_sse_fabs (res, &mask); + el_mode = TYPE_MODE (TREE_TYPE (type_out)); + n = TYPE_VECTOR_SUBPARTS (type_out); + in_mode = TYPE_MODE (TREE_TYPE (type_in)); + in_n = TYPE_VECTOR_SUBPARTS (type_in); + if (el_mode != in_mode + || n != in_n) + return NULL_TREE; - /* if (!isless (xa, TWO52)) goto label; */ - label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + switch (fn) + { + CASE_CFN_SIN: + CASE_CFN_COS: + CASE_CFN_EXP: + CASE_CFN_LOG: + CASE_CFN_LOG2: + CASE_CFN_LOG10: + if (el_mode == DFmode && n == 2) + { + name[4] = 'd'; + name[5] = '2'; + } + else if (el_mode == SFmode && n == 4) + { + name[4] = 's'; + name[5] = '4'; + } + else + return NULL_TREE; + break; - /* xa = xa + TWO52 - TWO52; */ - xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); - xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT); + default: + return NULL_TREE; + } - /* xa = copysign (xa, operand1) */ - ix86_sse_copysign_to_positive (xa, xa, res, mask); + tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn); + bname = IDENTIFIER_POINTER (DECL_NAME (fndecl)); + sprintf (name + 7, "%s", bname+10); - /* generate 1.0 */ - one = force_reg (mode, const_double_from_real_value (dconst1, mode)); + arity = 0; + for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args)) + arity++; - /* Compensate: xa = xa - (xa > operand1 ? 
1 : 0) */ - tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor); - emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); - tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS, - xa, tmp, NULL_RTX, 0, OPTAB_DIRECT); - if (!do_floor && HONOR_SIGNED_ZEROS (mode)) - ix86_sse_copysign_to_positive (tmp, tmp, res, mask); - emit_move_insn (res, tmp); + if (arity == 1) + fntype = build_function_type_list (type_out, type_in, NULL); + else + fntype = build_function_type_list (type_out, type_in, type_in, NULL); - emit_label (label); - LABEL_NUSES (label) = 1; + /* Build a function declaration for the vectorized function. */ + new_fndecl = build_decl (BUILTINS_LOCATION, + FUNCTION_DECL, get_identifier (name), fntype); + TREE_PUBLIC (new_fndecl) = 1; + DECL_EXTERNAL (new_fndecl) = 1; + DECL_IS_NOVOPS (new_fndecl) = 1; + TREE_READONLY (new_fndecl) = 1; - emit_move_insn (operand0, res); + return new_fndecl; } -/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing - into OPERAND0. */ -void -ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor) -{ - /* C code for the stuff we expand below. - double xa = fabs (x), x2; - if (!isless (xa, TWO52)) - return x; - x2 = (double)(long)x; - Compensate. Floor: - if (x2 > x) - x2 -= 1; - Compensate. Ceil: - if (x2 < x) - x2 += 1; - if (HONOR_SIGNED_ZEROS (mode)) - return copysign (x2, x); - return x2; - */ - machine_mode mode = GET_MODE (operand0); - rtx xa, xi, TWO52, tmp, one, res, mask; - rtx_code_label *label; +/* Returns a decl of a function that implements scatter store with + register type VECTYPE and index type INDEX_TYPE and SCALE. + Return NULL_TREE if it is not available. */ - TWO52 = ix86_gen_TWO52 (mode); +static tree +ix86_vectorize_builtin_scatter (const_tree vectype, + const_tree index_type, int scale) +{ + bool si; + enum ix86_builtins code; - /* Temporary for holding the result, initialized to the input - operand to ease control flow. */ - res = gen_reg_rtx (mode); - emit_move_insn (res, operand1); + if (!TARGET_AVX512F) + return NULL_TREE; - /* xa = abs (operand1) */ - xa = ix86_expand_sse_fabs (res, &mask); + if ((TREE_CODE (index_type) != INTEGER_TYPE + && !POINTER_TYPE_P (index_type)) + || (TYPE_MODE (index_type) != SImode + && TYPE_MODE (index_type) != DImode)) + return NULL_TREE; - /* if (!isless (xa, TWO52)) goto label; */ - label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + if (TYPE_PRECISION (index_type) > POINTER_SIZE) + return NULL_TREE; - /* xa = (double)(long)x */ - xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); - expand_fix (xi, res, 0); - expand_float (xa, xi, 0); + /* v*scatter* insn sign extends index to pointer mode. */ + if (TYPE_PRECISION (index_type) < POINTER_SIZE + && TYPE_UNSIGNED (index_type)) + return NULL_TREE; - /* generate 1.0 */ - one = force_reg (mode, const_double_from_real_value (dconst1, mode)); + /* Scale can be 1, 2, 4 or 8. */ + if (scale <= 0 + || scale > 8 + || (scale & (scale - 1)) != 0) + return NULL_TREE; - /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */ - tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor); - emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); - tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS, - xa, tmp, NULL_RTX, 0, OPTAB_DIRECT); - emit_move_insn (res, tmp); + si = TYPE_MODE (index_type) == SImode; + switch (TYPE_MODE (vectype)) + { + case E_V8DFmode: + code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF; + break; + case E_V8DImode: + code = si ? 
IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI; + break; + case E_V16SFmode: + code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF; + break; + case E_V16SImode: + code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI; + break; + case E_V4DFmode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_SCATTERALTSIV4DF : IX86_BUILTIN_SCATTERDIV4DF; + else + return NULL_TREE; + break; + case E_V4DImode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_SCATTERALTSIV4DI : IX86_BUILTIN_SCATTERDIV4DI; + else + return NULL_TREE; + break; + case E_V8SFmode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_SCATTERSIV8SF : IX86_BUILTIN_SCATTERALTDIV8SF; + else + return NULL_TREE; + break; + case E_V8SImode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_SCATTERSIV8SI : IX86_BUILTIN_SCATTERALTDIV8SI; + else + return NULL_TREE; + break; + case E_V2DFmode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_SCATTERALTSIV2DF : IX86_BUILTIN_SCATTERDIV2DF; + else + return NULL_TREE; + break; + case E_V2DImode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_SCATTERALTSIV2DI : IX86_BUILTIN_SCATTERDIV2DI; + else + return NULL_TREE; + break; + case E_V4SFmode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_SCATTERSIV4SF : IX86_BUILTIN_SCATTERALTDIV4SF; + else + return NULL_TREE; + break; + case E_V4SImode: + if (TARGET_AVX512VL) + code = si ? IX86_BUILTIN_SCATTERSIV4SI : IX86_BUILTIN_SCATTERALTDIV4SI; + else + return NULL_TREE; + break; + default: + return NULL_TREE; + } - if (HONOR_SIGNED_ZEROS (mode)) - ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask); + return get_ix86_builtin (code); +} - emit_label (label); - LABEL_NUSES (label) = 1; +/* Return true if it is safe to use the rsqrt optabs to optimize + 1.0/sqrt. */ - emit_move_insn (operand0, res); +static bool +use_rsqrt_p () +{ + return (TARGET_SSE && TARGET_SSE_MATH + && flag_finite_math_only + && !flag_trapping_math + && flag_unsafe_math_optimizations); } + +/* Helper for avx_vpermilps256_operand et al. This is also used by + the expansion functions to turn the parallel back into a mask. + The return value is 0 for no match and the imm8+1 for a match. */ -/* Expand SSE sequence for computing round from OPERAND1 storing - into OPERAND0. Sequence that works without relying on DImode truncation - via cvttsd2siq that is only available on 64bit targets. */ -void -ix86_expand_rounddf_32 (rtx operand0, rtx operand1) -{ - /* C code for the stuff we expand below. - double xa = fabs (x), xa2, x2; - if (!isless (xa, TWO52)) - return x; - Using the absolute value and copying back sign makes - -0.0 -> -0.0 correct. - xa2 = xa + TWO52 - TWO52; - Compensate. - dxa = xa2 - xa; - if (dxa <= -0.5) - xa2 += 1; - else if (dxa > 0.5) - xa2 -= 1; - x2 = copysign (xa2, x); - return x2; - */ - machine_mode mode = GET_MODE (operand0); - rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask; - rtx_code_label *label; - - TWO52 = ix86_gen_TWO52 (mode); +int +avx_vpermilp_parallel (rtx par, machine_mode mode) +{ + unsigned i, nelt = GET_MODE_NUNITS (mode); + unsigned mask = 0; + unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */ - /* Temporary for holding the result, initialized to the input - operand to ease control flow. 
*/ - res = gen_reg_rtx (mode); - emit_move_insn (res, operand1); + if (XVECLEN (par, 0) != (int) nelt) + return 0; - /* xa = abs (operand1) */ - xa = ix86_expand_sse_fabs (res, &mask); + /* Validate that all of the elements are constants, and not totally + out of range. Copy the data into an integral array to make the + subsequent checks easier. */ + for (i = 0; i < nelt; ++i) + { + rtx er = XVECEXP (par, 0, i); + unsigned HOST_WIDE_INT ei; - /* if (!isless (xa, TWO52)) goto label; */ - label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + if (!CONST_INT_P (er)) + return 0; + ei = INTVAL (er); + if (ei >= nelt) + return 0; + ipar[i] = ei; + } - /* xa2 = xa + TWO52 - TWO52; */ - xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); - xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT); + switch (mode) + { + case E_V8DFmode: + /* In the 512-bit DFmode case, we can only move elements within + a 128-bit lane. First fill the second part of the mask, + then fallthru. */ + for (i = 4; i < 6; ++i) + { + if (ipar[i] < 4 || ipar[i] >= 6) + return 0; + mask |= (ipar[i] - 4) << i; + } + for (i = 6; i < 8; ++i) + { + if (ipar[i] < 6) + return 0; + mask |= (ipar[i] - 6) << i; + } + /* FALLTHRU */ - /* dxa = xa2 - xa; */ - dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT); + case E_V4DFmode: + /* In the 256-bit DFmode case, we can only move elements within + a 128-bit lane. */ + for (i = 0; i < 2; ++i) + { + if (ipar[i] >= 2) + return 0; + mask |= ipar[i] << i; + } + for (i = 2; i < 4; ++i) + { + if (ipar[i] < 2) + return 0; + mask |= (ipar[i] - 2) << i; + } + break; - /* generate 0.5, 1.0 and -0.5 */ - half = force_reg (mode, const_double_from_real_value (dconsthalf, mode)); - one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT); - mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX, - 0, OPTAB_DIRECT); + case E_V16SFmode: + /* In 512 bit SFmode case, permutation in the upper 256 bits + must mirror the permutation in the lower 256-bits. */ + for (i = 0; i < 8; ++i) + if (ipar[i] + 8 != ipar[i + 8]) + return 0; + /* FALLTHRU */ - /* Compensate. */ - tmp = gen_reg_rtx (mode); - /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */ - tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false); - emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); - xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT); - /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */ - tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false); - emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp))); - xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT); + case E_V8SFmode: + /* In 256 bit SFmode case, we have full freedom of + movement within the low 128-bit lane, but the high 128-bit + lane must mirror the exact same pattern. */ + for (i = 0; i < 4; ++i) + if (ipar[i] + 4 != ipar[i + 4]) + return 0; + nelt = 4; + /* FALLTHRU */ - /* res = copysign (xa2, operand1) */ - ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask); + case E_V2DFmode: + case E_V4SFmode: + /* In the 128-bit case, we've full freedom in the placement of + the elements from the source operand. */ + for (i = 0; i < nelt; ++i) + mask |= ipar[i] << (i * (nelt / 2)); + break; - emit_label (label); - LABEL_NUSES (label) = 1; + default: + gcc_unreachable (); + } - emit_move_insn (operand0, res); + /* Make sure success has a non-zero value by adding one. 
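   As a concrete example, for V4SFmode the parallel (1 0 3 2) gives
   mask = 1 | (0 << 2) | (3 << 4) | (2 << 6) = 0xb1, so the return value is
   0xb2 and the caller recovers the vpermilps immediate by subtracting one.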
*/ + return mask + 1; } -/* Expand SSE sequence for computing trunc from OPERAND1 storing - into OPERAND0. */ -void -ix86_expand_trunc (rtx operand0, rtx operand1) -{ - /* C code for SSE variant we expand below. - double xa = fabs (x), x2; - if (!isless (xa, TWO52)) - return x; - x2 = (double)(long)x; - if (HONOR_SIGNED_ZEROS (mode)) - return copysign (x2, x); - return x2; - */ - machine_mode mode = GET_MODE (operand0); - rtx xa, xi, TWO52, res, mask; - rtx_code_label *label; - - TWO52 = ix86_gen_TWO52 (mode); +/* Helper for avx_vperm2f128_v4df_operand et al. This is also used by + the expansion functions to turn the parallel back into a mask. + The return value is 0 for no match and the imm8+1 for a match. */ - /* Temporary for holding the result, initialized to the input - operand to ease control flow. */ - res = gen_reg_rtx (mode); - emit_move_insn (res, operand1); +int +avx_vperm2f128_parallel (rtx par, machine_mode mode) +{ + unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2; + unsigned mask = 0; + unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */ - /* xa = abs (operand1) */ - xa = ix86_expand_sse_fabs (res, &mask); + if (XVECLEN (par, 0) != (int) nelt) + return 0; - /* if (!isless (xa, TWO52)) goto label; */ - label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + /* Validate that all of the elements are constants, and not totally + out of range. Copy the data into an integral array to make the + subsequent checks easier. */ + for (i = 0; i < nelt; ++i) + { + rtx er = XVECEXP (par, 0, i); + unsigned HOST_WIDE_INT ei; - /* x = (double)(long)x */ - xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); - expand_fix (xi, res, 0); - expand_float (res, xi, 0); + if (!CONST_INT_P (er)) + return 0; + ei = INTVAL (er); + if (ei >= 2 * nelt) + return 0; + ipar[i] = ei; + } - if (HONOR_SIGNED_ZEROS (mode)) - ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask); + /* Validate that the halves of the permute are halves. */ + for (i = 0; i < nelt2 - 1; ++i) + if (ipar[i] + 1 != ipar[i + 1]) + return 0; + for (i = nelt2; i < nelt - 1; ++i) + if (ipar[i] + 1 != ipar[i + 1]) + return 0; - emit_label (label); - LABEL_NUSES (label) = 1; + /* Reconstruct the mask. */ + for (i = 0; i < 2; ++i) + { + unsigned e = ipar[i * nelt2]; + if (e % nelt2) + return 0; + e /= nelt2; + mask |= e << (i * 4); + } - emit_move_insn (operand0, res); + /* Make sure success has a non-zero value by adding one. */ + return mask + 1; } - -/* Expand SSE sequence for computing trunc from OPERAND1 storing - into OPERAND0. */ -void -ix86_expand_truncdf_32 (rtx operand0, rtx operand1) + +/* Return a register priority for hard reg REGNO. */ +static int +ix86_register_priority (int hard_regno) { - machine_mode mode = GET_MODE (operand0); - rtx xa, mask, TWO52, one, res, smask, tmp; - rtx_code_label *label; - - /* C code for SSE variant we expand below. - double xa = fabs (x), x2; - if (!isless (xa, TWO52)) - return x; - xa2 = xa + TWO52 - TWO52; - Compensate: - if (xa2 > xa) - xa2 -= 1.0; - x2 = copysign (xa2, x); - return x2; - */ - - TWO52 = ix86_gen_TWO52 (mode); - - /* Temporary for holding the result, initialized to the input - operand to ease control flow. 
*/ - res = gen_reg_rtx (mode); - emit_move_insn (res, operand1); - - /* xa = abs (operand1) */ - xa = ix86_expand_sse_fabs (res, &smask); - - /* if (!isless (xa, TWO52)) goto label; */ - label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); - - /* res = xa + TWO52 - TWO52; */ - tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); - tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT); - emit_move_insn (res, tmp); - - /* generate 1.0 */ - one = force_reg (mode, const_double_from_real_value (dconst1, mode)); - - /* Compensate: res = xa2 - (res > xa ? 1 : 0) */ - mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false); - emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one))); - tmp = expand_simple_binop (mode, MINUS, - res, mask, NULL_RTX, 0, OPTAB_DIRECT); - emit_move_insn (res, tmp); - - /* res = copysign (res, operand1) */ - ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask); - - emit_label (label); - LABEL_NUSES (label) = 1; - - emit_move_insn (operand0, res); + /* ebp and r13 as the base always wants a displacement, r12 as the + base always wants an index. So discourage their usage in an + address. */ + if (hard_regno == R12_REG || hard_regno == R13_REG) + return 0; + if (hard_regno == BP_REG) + return 1; + /* New x86-64 int registers result in bigger code size. Discourage + them. */ + if (IN_RANGE (hard_regno, FIRST_REX_INT_REG, LAST_REX_INT_REG)) + return 2; + /* New x86-64 SSE registers result in bigger code size. Discourage + them. */ + if (IN_RANGE (hard_regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG)) + return 2; + if (IN_RANGE (hard_regno, FIRST_EXT_REX_SSE_REG, LAST_EXT_REX_SSE_REG)) + return 1; + /* Usage of AX register results in smaller code. Prefer it. */ + if (hard_regno == AX_REG) + return 4; + return 3; } -/* Expand SSE sequence for computing round from OPERAND1 storing - into OPERAND0. */ -void -ix86_expand_round (rtx operand0, rtx operand1) -{ - /* C code for the stuff we're doing below: - double xa = fabs (x); - if (!isless (xa, TWO52)) - return x; - xa = (double)(long)(xa + nextafter (0.5, 0.0)); - return copysign (xa, x); - */ - machine_mode mode = GET_MODE (operand0); - rtx res, TWO52, xa, xi, half, mask; - rtx_code_label *label; - const struct real_format *fmt; - REAL_VALUE_TYPE pred_half, half_minus_pred_half; +/* Implement TARGET_PREFERRED_RELOAD_CLASS. + + Put float CONST_DOUBLE in the constant pool instead of fp regs. + QImode must go into class Q_REGS. + Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and + movdf to do mem-to-mem moves through integer regs. */ - /* Temporary for holding the result, initialized to the input - operand to ease control flow. */ - res = gen_reg_rtx (mode); - emit_move_insn (res, operand1); +static reg_class_t +ix86_preferred_reload_class (rtx x, reg_class_t regclass) +{ + machine_mode mode = GET_MODE (x); - TWO52 = ix86_gen_TWO52 (mode); - xa = ix86_expand_sse_fabs (res, &mask); - label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + /* We're only allowed to return a subclass of CLASS. Many of the + following checks fail for NO_REGS, so eliminate that early. */ + if (regclass == NO_REGS) + return NO_REGS; - /* load nextafter (0.5, 0.0) */ - fmt = REAL_MODE_FORMAT (mode); - real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); - real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half); + /* All classes can load zeros. 
*/ + if (x == CONST0_RTX (mode)) + return regclass; - /* xa = xa + 0.5 */ - half = force_reg (mode, const_double_from_real_value (pred_half, mode)); - xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT); + /* Force constants into memory if we are loading a (nonzero) constant into + an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK + instructions to load from a constant. */ + if (CONSTANT_P (x) + && (MAYBE_MMX_CLASS_P (regclass) + || MAYBE_SSE_CLASS_P (regclass) + || MAYBE_MASK_CLASS_P (regclass))) + return NO_REGS; - /* xa = (double)(int64_t)xa */ - xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); - expand_fix (xi, xa, 0); - expand_float (xa, xi, 0); + /* Floating-point constants need more complex checks. */ + if (CONST_DOUBLE_P (x)) + { + /* General regs can load everything. */ + if (INTEGER_CLASS_P (regclass)) + return regclass; - /* res = copysign (xa, operand1) */ - ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask); + /* Floats can load 0 and 1 plus some others. Note that we eliminated + zero above. We only want to wind up preferring 80387 registers if + we plan on doing computation with them. */ + if (IS_STACK_MODE (mode) + && standard_80387_constant_p (x) > 0) + { + /* Limit class to FP regs. */ + if (FLOAT_CLASS_P (regclass)) + return FLOAT_REGS; + } - emit_label (label); - LABEL_NUSES (label) = 1; + return NO_REGS; + } - emit_move_insn (operand0, res); -} + /* Prefer SSE regs only, if we can use them for math. */ + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + return SSE_CLASS_P (regclass) ? regclass : NO_REGS; -/* Expand SSE sequence for computing round - from OP1 storing into OP0 using sse4 round insn. */ -void -ix86_expand_round_sse4 (rtx op0, rtx op1) -{ - machine_mode mode = GET_MODE (op0); - rtx e1, e2, res, half; - const struct real_format *fmt; - REAL_VALUE_TYPE pred_half, half_minus_pred_half; - rtx (*gen_copysign) (rtx, rtx, rtx); - rtx (*gen_round) (rtx, rtx, rtx); + /* Generally when we see PLUS here, it's the function invariant + (plus soft-fp const_int). Which can only be computed into general + regs. */ + if (GET_CODE (x) == PLUS) + return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS; - switch (mode) + /* QImode constants are easy to load, but non-constant QImode data + must go into Q_REGS. */ + if (GET_MODE (x) == QImode && !CONSTANT_P (x)) { - case E_SFmode: - gen_copysign = gen_copysignsf3; - gen_round = gen_sse4_1_roundsf2; - break; - case E_DFmode: - gen_copysign = gen_copysigndf3; - gen_round = gen_sse4_1_rounddf2; - break; - default: - gcc_unreachable (); + if (Q_CLASS_P (regclass)) + return regclass; + else if (reg_class_subset_p (Q_REGS, regclass)) + return Q_REGS; + else + return NO_REGS; } - /* round (a) = trunc (a + copysign (0.5, a)) */ - - /* load nextafter (0.5, 0.0) */ - fmt = REAL_MODE_FORMAT (mode); - real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode); - real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half); - half = const_double_from_real_value (pred_half, mode); + return regclass; +} - /* e1 = copysign (0.5, op1) */ - e1 = gen_reg_rtx (mode); - emit_insn (gen_copysign (e1, half, op1)); +/* Discourage putting floating-point values in SSE registers unless + SSE math is being used, and likewise for the 387 registers. 
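As an illustration (not part of the patch itself): the block comment inside the moved ix86_expand_round spells out the generated sequence as C. Written out as a runnable function it looks roughly like this, assuming IEEE double and a 64-bit integer type standing in for the DImode fix/float conversions the expander emits:

    #include <math.h>

    static double
    sse2_round_model (double x)
    {
      const double two52 = 4503599627370496.0;  /* 2**52 */
      double xa = fabs (x);

      /* Magnitudes not below 2**52 are already integral; NaNs take the
         same early return.  */
      if (!isless (xa, two52))
        return x;

      /* Adding nextafter (0.5, 0.0) rather than 0.5 keeps values just
         below one half from being rounded up to 1.0.  */
      xa = (double) (long long) (xa + nextafter (0.5, 0.0));
      return copysign (xa, x);
    }

The SSE4.1 variant moved alongside it relies on the same identity, round (a) = trunc (a + copysign (0.5 - ulp, a)), using a single roundss/roundsd for the truncation.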
*/ +static reg_class_t +ix86_preferred_output_reload_class (rtx x, reg_class_t regclass) +{ + machine_mode mode = GET_MODE (x); - /* e2 = op1 + e1 */ - e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT); + /* Restrict the output reload class to the register bank that we are doing + math on. If we would like not to return a subset of CLASS, reject this + alternative: if reload cannot do this, it will still use its choice. */ + mode = GET_MODE (x); + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS; - /* res = trunc (e2) */ - res = gen_reg_rtx (mode); - emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC))); + if (IS_STACK_MODE (mode)) + return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS; - emit_move_insn (op0, res); + return regclass; } -/* Handle fentry_name / fentry_section attribute. */ - -static tree -ix86_handle_fentry_name (tree *node, tree name, tree args, - int, bool *no_add_attrs) +static reg_class_t +ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass, + machine_mode mode, secondary_reload_info *sri) { - if (TREE_CODE (*node) == FUNCTION_DECL - && TREE_CODE (TREE_VALUE (args)) == STRING_CST) - /* Do nothing else, just set the attribute. We'll get at - it later with lookup_attribute. */ - ; - else + /* Double-word spills from general registers to non-offsettable memory + references (zero-extended addresses) require special handling. */ + if (TARGET_64BIT + && MEM_P (x) + && GET_MODE_SIZE (mode) > UNITS_PER_WORD + && INTEGER_CLASS_P (rclass) + && !offsettable_memref_p (x)) { - warning (OPT_Wattributes, "%qE attribute ignored", name); - *no_add_attrs = true; - } - - return NULL_TREE; -} - - -/* Table of valid machine attributes. */ -static const struct attribute_spec ix86_attribute_table[] = -{ - /* { name, min_len, max_len, decl_req, type_req, fn_type_req, - affects_type_identity, handler, exclude } */ - /* Stdcall attribute says callee is responsible for popping arguments - if they are not variable. */ - { "stdcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, - NULL }, - /* Fastcall attribute says callee is responsible for popping arguments - if they are not variable. */ - { "fastcall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, - NULL }, - /* Thiscall attribute says callee is responsible for popping arguments - if they are not variable. */ - { "thiscall", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, - NULL }, - /* Cdecl attribute says the callee is a normal C declaration */ - { "cdecl", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, - NULL }, - /* Regparm attribute specifies how many integer arguments are to be - passed in registers. */ - { "regparm", 1, 1, false, true, true, true, ix86_handle_cconv_attribute, - NULL }, - /* Sseregparm attribute says we are using x86_64 calling conventions - for FP arguments. */ - { "sseregparm", 0, 0, false, true, true, true, ix86_handle_cconv_attribute, - NULL }, - /* The transactional memory builtins are implicitly regparm or fastcall - depending on the ABI. Override the generic do-nothing attribute that - these builtins were declared with. */ - { "*tm regparm", 0, 0, false, true, true, true, - ix86_handle_tm_regparm_attribute, NULL }, - /* force_align_arg_pointer says this function realigns the stack at entry. 
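As an illustration (not part of the patch itself): the calling-convention entries of the attribute table being moved here correspond to source-level attributes such as the following declarations, which are purely illustrative and assume a 32-bit x86 target where these conventions apply:

    /* Pass the first three integer arguments in EAX, EDX and ECX.  */
    int __attribute__ ((regparm (3))) add3 (int a, int b, int c);

    /* Pass the first two integer arguments in ECX and EDX; the callee
       pops its stack arguments.  */
    void __attribute__ ((fastcall)) fast_fn (int a, int b);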
*/ - { (const char *)&ix86_force_align_arg_pointer_string, 0, 0, - false, true, true, false, ix86_handle_force_align_arg_pointer_attribute, - NULL }, -#if TARGET_DLLIMPORT_DECL_ATTRIBUTES - { "dllimport", 0, 0, false, false, false, false, handle_dll_attribute, - NULL }, - { "dllexport", 0, 0, false, false, false, false, handle_dll_attribute, - NULL }, - { "shared", 0, 0, true, false, false, false, - ix86_handle_shared_attribute, NULL }, -#endif - { "ms_struct", 0, 0, false, false, false, false, - ix86_handle_struct_attribute, NULL }, - { "gcc_struct", 0, 0, false, false, false, false, - ix86_handle_struct_attribute, NULL }, -#ifdef SUBTARGET_ATTRIBUTE_TABLE - SUBTARGET_ATTRIBUTE_TABLE, -#endif - /* ms_abi and sysv_abi calling convention function attributes. */ - { "ms_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, NULL }, - { "sysv_abi", 0, 0, false, true, true, true, ix86_handle_abi_attribute, - NULL }, - { "ms_abi va_list", 0, 0, false, false, false, false, NULL, NULL }, - { "sysv_abi va_list", 0, 0, false, false, false, false, NULL, NULL }, - { "ms_hook_prologue", 0, 0, true, false, false, false, - ix86_handle_fndecl_attribute, NULL }, - { "callee_pop_aggregate_return", 1, 1, false, true, true, true, - ix86_handle_callee_pop_aggregate_return, NULL }, - { "interrupt", 0, 0, false, true, true, false, - ix86_handle_interrupt_attribute, NULL }, - { "no_caller_saved_registers", 0, 0, false, true, true, false, - ix86_handle_no_caller_saved_registers_attribute, NULL }, - { "naked", 0, 0, true, false, false, false, - ix86_handle_fndecl_attribute, NULL }, - { "indirect_branch", 1, 1, true, false, false, false, - ix86_handle_fndecl_attribute, NULL }, - { "function_return", 1, 1, true, false, false, false, - ix86_handle_fndecl_attribute, NULL }, - { "indirect_return", 0, 0, false, true, true, false, - NULL, NULL }, - { "fentry_name", 1, 1, true, false, false, false, - ix86_handle_fentry_name, NULL }, - { "fentry_section", 1, 1, true, false, false, false, - ix86_handle_fentry_name, NULL }, - { "cf_check", 0, 0, true, false, false, false, - ix86_handle_fndecl_attribute, NULL }, - - /* End element. */ - { NULL, 0, 0, false, false, false, false, NULL, NULL } -}; + sri->icode = (in_p + ? CODE_FOR_reload_noff_load + : CODE_FOR_reload_noff_store); + /* Add the cost of moving address to a temporary. */ + sri->extra_cost = 1; -/* Implement targetm.vectorize.builtin_vectorization_cost. */ -static int -ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, - tree vectype, int) -{ - bool fp = false; - machine_mode mode = TImode; - int index; - if (vectype != NULL) - { - fp = FLOAT_TYPE_P (vectype); - mode = TYPE_MODE (vectype); + return NO_REGS; } - switch (type_of_cost) + /* QImode spills from non-QI registers require + intermediate register on 32bit targets. */ + if (mode == QImode + && ((!TARGET_64BIT && !in_p + && INTEGER_CLASS_P (rclass) + && MAYBE_NON_Q_CLASS_P (rclass)) + || (!TARGET_AVX512DQ + && MAYBE_MASK_CLASS_P (rclass)))) { - case scalar_stmt: - return fp ? ix86_cost->addss : COSTS_N_INSNS (1); + int regno = true_regnum (x); - case scalar_load: - /* load/store costs are relative to register move which is 2. Recompute - it to COSTS_N_INSNS so everything have same base. */ - return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0] - : ix86_cost->int_load [2]) / 2; + /* Return Q_REGS if the operand is in memory. */ + if (regno == -1) + return Q_REGS; - case scalar_store: - return COSTS_N_INSNS (fp ? 
ix86_cost->sse_store[0] - : ix86_cost->int_store [2]) / 2; + return NO_REGS; + } - case vector_stmt: - return ix86_vec_cost (mode, - fp ? ix86_cost->addss : ix86_cost->sse_op); + /* This condition handles corner case where an expression involving + pointers gets vectorized. We're trying to use the address of a + stack slot as a vector initializer. - case vector_load: - index = sse_store_index (mode); - /* See PR82713 - we may end up being called on non-vector type. */ - if (index < 0) - index = 2; - return COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2; + (set (reg:V2DI 74 [ vect_cst_.2 ]) + (vec_duplicate:V2DI (reg/f:DI 20 frame))) - case vector_store: - index = sse_store_index (mode); - /* See PR82713 - we may end up being called on non-vector type. */ - if (index < 0) - index = 2; - return COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2; + Eventually frame gets turned into sp+offset like this: - case vec_to_scalar: - case scalar_to_vec: - return ix86_vec_cost (mode, ix86_cost->sse_op); + (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) + (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp) + (const_int 392 [0x188])))) - /* We should have separate costs for unaligned loads and gather/scatter. - Do that incrementally. */ - case unaligned_load: - index = sse_store_index (mode); - /* See PR82713 - we may end up being called on non-vector type. */ - if (index < 0) - index = 2; - return COSTS_N_INSNS (ix86_cost->sse_unaligned_load[index]) / 2; + That later gets turned into: - case unaligned_store: - index = sse_store_index (mode); - /* See PR82713 - we may end up being called on non-vector type. */ - if (index < 0) - index = 2; - return COSTS_N_INSNS (ix86_cost->sse_unaligned_store[index]) / 2; + (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) + (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp) + (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64])))) - case vector_gather_load: - return ix86_vec_cost (mode, - COSTS_N_INSNS - (ix86_cost->gather_static - + ix86_cost->gather_per_elt - * TYPE_VECTOR_SUBPARTS (vectype)) / 2); + We'll have the following reload recorded: - case vector_scatter_store: - return ix86_vec_cost (mode, - COSTS_N_INSNS - (ix86_cost->scatter_static - + ix86_cost->scatter_per_elt - * TYPE_VECTOR_SUBPARTS (vectype)) / 2); + Reload 0: reload_in (DI) = + (plus:DI (reg/f:DI 7 sp) + (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64])) + reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) + SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine + reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188])) + reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74]) + reload_reg_rtx: (reg:V2DI 22 xmm1) - case cond_branch_taken: - return ix86_cost->cond_taken_branch_cost; + Which isn't going to work since SSE instructions can't handle scalar + additions. Returning GENERAL_REGS forces the addition into integer + register and reload can handle subsequent reloads without problems. */ - case cond_branch_not_taken: - return ix86_cost->cond_not_taken_branch_cost; + if (in_p && GET_CODE (x) == PLUS + && SSE_CLASS_P (rclass) + && SCALAR_INT_MODE_P (mode)) + return GENERAL_REGS; - case vec_perm: - case vec_promote_demote: - return ix86_vec_cost (mode, ix86_cost->sse_op); + return NO_REGS; +} - case vec_construct: - { - /* N element inserts into SSE vectors. */ - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; - /* One vinserti128 for combining two SSE vectors for AVX256. 
*/ - if (GET_MODE_BITSIZE (mode) == 256) - cost += ix86_vec_cost (mode, ix86_cost->addss); - /* One vinserti64x4 and two vinserti128 for combining SSE - and AVX256 vectors to AVX512. */ - else if (GET_MODE_BITSIZE (mode) == 512) - cost += 3 * ix86_vec_cost (mode, ix86_cost->addss); - return cost; - } +/* Implement TARGET_CLASS_LIKELY_SPILLED_P. */ + +static bool +ix86_class_likely_spilled_p (reg_class_t rclass) +{ + switch (rclass) + { + case AREG: + case DREG: + case CREG: + case BREG: + case AD_REGS: + case SIREG: + case DIREG: + case SSE_FIRST_REG: + case FP_TOP_REG: + case FP_SECOND_REG: + return true; default: - gcc_unreachable (); + break; } + + return false; } -/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel []))) - insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh - insn every time. */ +/* If we are copying between registers from different register sets + (e.g. FP and integer), we may need a memory location. + + The function can't work reliably when one of the CLASSES is a class + containing registers from multiple sets. We avoid this by never combining + different sets in a single alternative in the machine description. + Ensure that this constraint holds to avoid unexpected surprises. -static GTY(()) rtx_insn *vselect_insn; + When STRICT is false, we are being called from REGISTER_MOVE_COST, + so do not enforce these sanity checks. -/* Initialize vselect_insn. */ + To optimize register_move_cost performance, define inline variant. */ -static void -init_vselect_insn (void) +static inline bool +inline_secondary_memory_needed (machine_mode mode, reg_class_t class1, + reg_class_t class2, int strict) { - unsigned i; - rtx x; + if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS)) + return false; - x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN)); - for (i = 0; i < MAX_VECT_LEN; ++i) - XVECEXP (x, 0, i) = const0_rtx; - x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx, - const0_rtx), x); - x = gen_rtx_SET (const0_rtx, x); - start_sequence (); - vselect_insn = emit_insn (x); - end_sequence (); -} + if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1) + || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2) + || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1) + || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2) + || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1) + || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2) + || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1) + || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2)) + { + gcc_assert (!strict || lra_in_progress); + return true; + } -/* Construct (set target (vec_select op0 (parallel perm))) and - return true if that's a valid instruction in the active ISA. */ + if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2)) + return true; -static bool -expand_vselect (rtx target, rtx op0, const unsigned char *perm, - unsigned nelt, bool testing_p) -{ - unsigned int i; - rtx x, save_vconcat; - int icode; + /* Between mask and general, we have moves no larger than word size. */ + if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2)) + && (GET_MODE_SIZE (mode) > UNITS_PER_WORD)) + return true; - if (vselect_insn == NULL_RTX) - init_vselect_insn (); + /* ??? This is a lie. We do have moves between mmx/general, and for + mmx/sse2. But by saying we need secondary memory we discourage the + register allocator from using the mmx registers unless needed. 
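As an illustration (not part of the patch itself): the vec_construct case of the vectorization-cost hook moved just above charges one insert per element plus the cross-lane inserts needed to build wider vectors. A simplified model with made-up parameter names; the real code reads these numbers from the active processor_costs table and scales the addss figure through ix86_vec_cost:

    static int
    vec_construct_cost_model (int n_elts, int vector_bits,
                              int sse_op_cost, int addss_cost)
    {
      int cost = n_elts * sse_op_cost;      /* one insert per element */

      if (vector_bits == 256)
        cost += addss_cost;                 /* one vinserti128 */
      else if (vector_bits == 512)
        cost += 3 * addss_cost;             /* one vinserti64x4 + two vinserti128 */
      return cost;
    }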
*/ + if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)) + return true; - x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1); - PUT_NUM_ELEM (XVEC (x, 0), nelt); - for (i = 0; i < nelt; ++i) - XVECEXP (x, 0, i) = GEN_INT (perm[i]); - save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0); - XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0; - PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target)); - SET_DEST (PATTERN (vselect_insn)) = target; - icode = recog_memoized (vselect_insn); + if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2)) + { + /* SSE1 doesn't have any direct moves from other classes. */ + if (!TARGET_SSE2) + return true; - if (icode >= 0 && !testing_p) - emit_insn (copy_rtx (PATTERN (vselect_insn))); + /* If the target says that inter-unit moves are more expensive + than moving through memory, then don't generate them. */ + if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC) + || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC)) + return true; - SET_DEST (PATTERN (vselect_insn)) = const0_rtx; - XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat; - INSN_CODE (vselect_insn) = -1; + /* Between SSE and general, we have moves no larger than word size. */ + if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) + return true; + } - return icode >= 0; + return false; } -/* Similar, but generate a vec_concat from op0 and op1 as well. */ +/* Implement TARGET_SECONDARY_MEMORY_NEEDED. */ static bool -expand_vselect_vconcat (rtx target, rtx op0, rtx op1, - const unsigned char *perm, unsigned nelt, - bool testing_p) +ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1, + reg_class_t class2) { - machine_mode v2mode; - rtx x; - bool ok; - - if (vselect_insn == NULL_RTX) - init_vselect_insn (); + return inline_secondary_memory_needed (mode, class1, class2, true); +} - if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode)) - return false; - x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0); - PUT_MODE (x, v2mode); - XEXP (x, 0) = op0; - XEXP (x, 1) = op1; - ok = expand_vselect (target, x, perm, nelt, testing_p); - XEXP (x, 0) = const0_rtx; - XEXP (x, 1) = const0_rtx; - return ok; -} - -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D - using movss or movsd. */ -static bool -expand_vec_perm_movs (struct expand_vec_perm_d *d) -{ - machine_mode vmode = d->vmode; - unsigned i, nelt = d->nelt; - rtx x; +/* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE. - if (d->one_operand_p) - return false; + get_secondary_mem widens integral modes to BITS_PER_WORD. + There is no need to emit full 64 bit move on 64 bit targets + for integral modes that can be moved using 32 bit move. */ - if (!(TARGET_SSE && vmode == V4SFmode) - && !(TARGET_SSE2 && vmode == V2DFmode)) - return false; +static machine_mode +ix86_secondary_memory_needed_mode (machine_mode mode) +{ + if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode)) + return mode_for_size (32, GET_MODE_CLASS (mode), 0).require (); + return mode; +} - /* Only the first element is changed. */ - if (d->perm[0] != nelt && d->perm[0] != 0) - return false; - for (i = 1; i < nelt; ++i) - if (d->perm[i] != i + nelt - d->perm[0]) - return false; +/* Implement the TARGET_CLASS_MAX_NREGS hook. - if (d->testing_p) - return true; + On the 80386, this is the size of MODE in words, + except in the FP regs, where a single reg is always enough. 
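As an illustration (not part of the patch itself): the SSE-versus-general-register arm of inline_secondary_memory_needed above boils down to three conditions. A reduced stand-alone model; the flag names are invented stand-ins for the TARGET_* checks, and the separate from-vec/to-vec tuning flags are collapsed into one:

    #include <stdbool.h>

    static bool
    sse_gpr_move_needs_memory (bool have_sse2, bool inter_unit_moves_ok,
                               unsigned int mode_bytes, unsigned int word_bytes)
    {
      if (!have_sse2)
        return true;            /* SSE1 has no direct XMM<->GPR moves */
      if (!inter_unit_moves_ok)
        return true;            /* tuning prefers bouncing through memory */
      return mode_bytes > word_bytes;   /* no wide XMM<->GPR moves */
    }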
*/ - if (d->perm[0] == nelt) - x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1)); +static unsigned char +ix86_class_max_nregs (reg_class_t rclass, machine_mode mode) +{ + if (MAYBE_INTEGER_CLASS_P (rclass)) + { + if (mode == XFmode) + return (TARGET_64BIT ? 2 : 3); + else if (mode == XCmode) + return (TARGET_64BIT ? 4 : 6); + else + return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD); + } else - x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1)); - - emit_insn (gen_rtx_SET (d->target, x)); - - return true; + { + if (COMPLEX_MODE_P (mode)) + return 2; + else + return 1; + } } -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D - in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */ +/* Implement TARGET_CAN_CHANGE_MODE_CLASS. */ static bool -expand_vec_perm_blend (struct expand_vec_perm_d *d) +ix86_can_change_mode_class (machine_mode from, machine_mode to, + reg_class_t regclass) { - machine_mode mmode, vmode = d->vmode; - unsigned i, mask, nelt = d->nelt; - rtx target, op0, op1, maskop, x; - rtx rperm[32], vperm; + if (from == to) + return true; - if (d->one_operand_p) - return false; - if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 - && (TARGET_AVX512BW - || GET_MODE_UNIT_SIZE (vmode) >= 4)) - ; - else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) - ; - else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) - ; - else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16) - ; - else + /* x87 registers can't do subreg at all, as all values are reformatted + to extended precision. */ + if (MAYBE_FLOAT_CLASS_P (regclass)) return false; - /* This is a blend, not a permute. Elements must stay in their - respective lanes. */ - for (i = 0; i < nelt; ++i) + if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass)) { - unsigned e = d->perm[i]; - if (!(e == i || e == i + nelt)) + /* Vector registers do not support QI or HImode loads. If we don't + disallow a change to these modes, reload will assume it's ok to + drop the subreg from (subreg:SI (reg:HI 100) 0). This affects + the vec_dupv4hi pattern. */ + if (GET_MODE_SIZE (from) < 4) return false; } - if (d->testing_p) - return true; - - /* ??? Without SSE4.1, we could implement this with and/andn/or. This - decision should be extracted elsewhere, so that we only try that - sequence once all budget==3 options have been tried. */ - target = d->target; - op0 = d->op0; - op1 = d->op1; - mask = 0; - - switch (vmode) - { - case E_V8DFmode: - case E_V16SFmode: - case E_V4DFmode: - case E_V8SFmode: - case E_V2DFmode: - case E_V4SFmode: - case E_V8HImode: - case E_V8SImode: - case E_V32HImode: - case E_V64QImode: - case E_V16SImode: - case E_V8DImode: - for (i = 0; i < nelt; ++i) - mask |= (d->perm[i] >= nelt) << i; - break; - - case E_V2DImode: - for (i = 0; i < 2; ++i) - mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4); - vmode = V8HImode; - goto do_subreg; - - case E_V4SImode: - for (i = 0; i < 4; ++i) - mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2); - vmode = V8HImode; - goto do_subreg; + return true; +} - case E_V16QImode: - /* See if bytes move in pairs so we can use pblendw with - an immediate argument, rather than pblendvb with a vector - argument. */ - for (i = 0; i < 16; i += 2) - if (d->perm[i] + 1 != d->perm[i + 1]) - { - use_pblendvb: - for (i = 0; i < nelt; ++i) - rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx); +/* Return index of MODE in the sse load/store tables. 
*/ - finish_pblendvb: - vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm)); - vperm = force_reg (vmode, vperm); +static inline int +sse_store_index (machine_mode mode) +{ + switch (GET_MODE_SIZE (mode)) + { + case 4: + return 0; + case 8: + return 1; + case 16: + return 2; + case 32: + return 3; + case 64: + return 4; + default: + return -1; + } +} - if (GET_MODE_SIZE (vmode) == 16) - emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm)); - else - emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm)); - if (target != d->target) - emit_move_insn (d->target, gen_lowpart (d->vmode, target)); - return true; - } +/* Return the cost of moving data of mode M between a + register and memory. A value of 2 is the default; this cost is + relative to those in `REGISTER_MOVE_COST'. - for (i = 0; i < 8; ++i) - mask |= (d->perm[i * 2] >= 16) << i; - vmode = V8HImode; - /* FALLTHRU */ + This function is used extensively by register_move_cost that is used to + build tables at startup. Make it inline in this case. + When IN is 2, return maximum of in and out move cost. - do_subreg: - target = gen_reg_rtx (vmode); - op0 = gen_lowpart (vmode, op0); - op1 = gen_lowpart (vmode, op1); - break; + If moving between registers and memory is more expensive than + between two registers, you should define this macro to express the + relative cost. - case E_V32QImode: - /* See if bytes move in pairs. If not, vpblendvb must be used. */ - for (i = 0; i < 32; i += 2) - if (d->perm[i] + 1 != d->perm[i + 1]) - goto use_pblendvb; - /* See if bytes move in quadruplets. If yes, vpblendd - with immediate can be used. */ - for (i = 0; i < 32; i += 4) - if (d->perm[i] + 2 != d->perm[i + 2]) - break; - if (i < 32) + Model also increased moving costs of QImode registers in non + Q_REGS classes. + */ +static inline int +inline_memory_move_cost (machine_mode mode, enum reg_class regclass, int in) +{ + int cost; + if (FLOAT_CLASS_P (regclass)) + { + int index; + switch (mode) { - /* See if bytes move the same in both lanes. If yes, - vpblendw with immediate can be used. */ - for (i = 0; i < 16; i += 2) - if (d->perm[i] + 16 != d->perm[i + 16]) - goto use_pblendvb; - - /* Use vpblendw. */ - for (i = 0; i < 16; ++i) - mask |= (d->perm[i * 2] >= 32) << i; - vmode = V16HImode; - goto do_subreg; + case E_SFmode: + index = 0; + break; + case E_DFmode: + index = 1; + break; + case E_XFmode: + index = 2; + break; + default: + return 100; } - - /* Use vpblendd. */ - for (i = 0; i < 8; ++i) - mask |= (d->perm[i * 4] >= 32) << i; - vmode = V8SImode; - goto do_subreg; - - case E_V16HImode: - /* See if words move in pairs. If yes, vpblendd can be used. */ - for (i = 0; i < 16; i += 2) - if (d->perm[i] + 1 != d->perm[i + 1]) - break; - if (i < 16) + if (in == 2) + return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]); + return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index]; + } + if (SSE_CLASS_P (regclass)) + { + int index = sse_store_index (mode); + if (index == -1) + return 100; + if (in == 2) + return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]); + return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index]; + } + if (MMX_CLASS_P (regclass)) + { + int index; + switch (GET_MODE_SIZE (mode)) { - /* See if words move the same in both lanes. If not, - vpblendvb must be used. */ - for (i = 0; i < 8; i++) - if (d->perm[i] + 8 != d->perm[i + 8]) - { - /* Use vpblendvb. */ - for (i = 0; i < 32; ++i) - rperm[i] = (d->perm[i / 2] < 16 ? 
const0_rtx : constm1_rtx); - - vmode = V32QImode; - nelt = 32; - target = gen_reg_rtx (vmode); - op0 = gen_lowpart (vmode, op0); - op1 = gen_lowpart (vmode, op1); - goto finish_pblendvb; - } - - /* Use vpblendw. */ - for (i = 0; i < 16; ++i) - mask |= (d->perm[i] >= 16) << i; - break; + case 4: + index = 0; + break; + case 8: + index = 1; + break; + default: + return 100; } - - /* Use vpblendd. */ - for (i = 0; i < 8; ++i) - mask |= (d->perm[i * 2] >= 16) << i; - vmode = V8SImode; - goto do_subreg; - - case E_V4DImode: - /* Use vpblendd. */ - for (i = 0; i < 4; ++i) - mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2); - vmode = V8SImode; - goto do_subreg; - - default: - gcc_unreachable (); + if (in == 2) + return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]); + return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index]; } - - switch (vmode) + switch (GET_MODE_SIZE (mode)) { - case E_V8DFmode: - case E_V8DImode: - mmode = QImode; - break; - case E_V16SFmode: - case E_V16SImode: - mmode = HImode; - break; - case E_V32HImode: - mmode = SImode; - break; - case E_V64QImode: - mmode = DImode; - break; - default: - mmode = VOIDmode; + case 1: + if (Q_CLASS_P (regclass) || TARGET_64BIT) + { + if (!in) + return ix86_cost->int_store[0]; + if (TARGET_PARTIAL_REG_DEPENDENCY + && optimize_function_for_speed_p (cfun)) + cost = ix86_cost->movzbl_load; + else + cost = ix86_cost->int_load[0]; + if (in == 2) + return MAX (cost, ix86_cost->int_store[0]); + return cost; + } + else + { + if (in == 2) + return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4); + if (in) + return ix86_cost->movzbl_load; + else + return ix86_cost->int_store[0] + 4; + } + break; + case 2: + if (in == 2) + return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]); + return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1]; + default: + if (in == 2) + cost = MAX (ix86_cost->int_load[2], ix86_cost->int_store[2]); + else if (in) + cost = ix86_cost->int_load[2]; + else + cost = ix86_cost->int_store[2]; + /* Multiply with the number of GPR moves needed. */ + return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD); } +} - if (mmode != VOIDmode) - maskop = force_reg (mmode, gen_int_mode (mask, mmode)); - else - maskop = GEN_INT (mask); - - /* This matches five different patterns with the different modes. */ - x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop); - x = gen_rtx_SET (target, x); - emit_insn (x); - if (target != d->target) - emit_move_insn (d->target, gen_lowpart (d->vmode, target)); - - return true; +static int +ix86_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in) +{ + return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0); } -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D - in terms of the variable form of vpermilps. - Note that we will have already failed the immediate input vpermilps, - which requires that the high and low part shuffle be identical; the - variable form doesn't require that. */ +/* Return the cost of moving data from a register in class CLASS1 to + one in class CLASS2. + + It is not required that the cost always equal 2 when FROM is the same as TO; + on some machines it is expensive to move between registers if they are not + general registers. 
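As an illustration (not part of the patch itself): for general registers the fallback case of inline_memory_move_cost above multiplies the per-word load/store cost by the number of word-sized pieces. In isolation, with invented names:

    static int
    gpr_memory_move_cost_model (int per_word_cost, unsigned int mode_bytes,
                                unsigned int word_bytes)
    {
      /* CEIL (mode_bytes, word_bytes): the number of GPR moves needed.  */
      unsigned int words = (mode_bytes + word_bytes - 1) / word_bytes;
      return per_word_cost * (int) words;
    }

So an 8-byte DImode access on a 32-bit target is charged twice the int_load[2]/int_store[2] figure.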
*/ -static bool -expand_vec_perm_vpermil (struct expand_vec_perm_d *d) +static int +ix86_register_move_cost (machine_mode mode, reg_class_t class1_i, + reg_class_t class2_i) { - rtx rperm[8], vperm; - unsigned i; + enum reg_class class1 = (enum reg_class) class1_i; + enum reg_class class2 = (enum reg_class) class2_i; - if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p) - return false; + /* In case we require secondary memory, compute cost of the store followed + by load. In order to avoid bad register allocation choices, we need + for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */ - /* We can only permute within the 128-bit lane. */ - for (i = 0; i < 8; ++i) + if (inline_secondary_memory_needed (mode, class1, class2, false)) { - unsigned e = d->perm[i]; - if (i < 4 ? e >= 4 : e < 4) - return false; - } + int cost = 1; - if (d->testing_p) - return true; + cost += inline_memory_move_cost (mode, class1, 2); + cost += inline_memory_move_cost (mode, class2, 2); - for (i = 0; i < 8; ++i) - { - unsigned e = d->perm[i]; + /* In case of copying from general_purpose_register we may emit multiple + stores followed by single load causing memory size mismatch stall. + Count this as arbitrarily high cost of 20. */ + if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD + && TARGET_MEMORY_MISMATCH_STALL + && targetm.class_max_nregs (class1, mode) + > targetm.class_max_nregs (class2, mode)) + cost += 20; - /* Within each 128-bit lane, the elements of op0 are numbered - from 0 and the elements of op1 are numbered from 4. */ - if (e >= 8 + 4) - e -= 8; - else if (e >= 4) - e -= 4; + /* In the case of FP/MMX moves, the registers actually overlap, and we + have to switch modes in order to treat them differently. */ + if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2)) + || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1))) + cost += 20; - rperm[i] = GEN_INT (e); + return cost; } - vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm)); - vperm = force_reg (V8SImode, vperm); - emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm)); - - return true; -} - -/* Return true if permutation D can be performed as VMODE permutation - instead. */ + /* Moves between SSE/MMX and integer unit are expensive. */ + if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2) + || SSE_CLASS_P (class1) != SSE_CLASS_P (class2)) -static bool -valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d) -{ - unsigned int i, j, chunk; + /* ??? By keeping returned value relatively high, we limit the number + of moves between integer and MMX/SSE registers for all targets. + Additionally, high value prevents problem with x86_modes_tieable_p(), + where integer modes in MMX/SSE registers are not tieable + because of missing QImode and HImode moves to, from or between + MMX/SSE registers. */ + return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2) + ? 
ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer); - if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT - || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT - || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode)) - return false; + if (MAYBE_FLOAT_CLASS_P (class1)) + return ix86_cost->fp_move; + if (MAYBE_SSE_CLASS_P (class1)) + { + if (GET_MODE_BITSIZE (mode) <= 128) + return ix86_cost->xmm_move; + if (GET_MODE_BITSIZE (mode) <= 256) + return ix86_cost->ymm_move; + return ix86_cost->zmm_move; + } + if (MAYBE_MMX_CLASS_P (class1)) + return ix86_cost->mmx_move; + return 2; +} - if (GET_MODE_NUNITS (vmode) >= d->nelt) - return true; +/* Implement TARGET_HARD_REGNO_NREGS. This is ordinarily the length in + words of a value of mode MODE but can be less for certain modes in + special long registers. - chunk = d->nelt / GET_MODE_NUNITS (vmode); - for (i = 0; i < d->nelt; i += chunk) - if (d->perm[i] & (chunk - 1)) - return false; - else - for (j = 1; j < chunk; ++j) - if (d->perm[i] + j != d->perm[i + j]) - return false; + Actually there are no two word move instructions for consecutive + registers. And only registers 0-3 may have mov byte instructions + applied to them. */ - return true; +static unsigned int +ix86_hard_regno_nregs (unsigned int regno, machine_mode mode) +{ + if (GENERAL_REGNO_P (regno)) + { + if (mode == XFmode) + return TARGET_64BIT ? 2 : 3; + if (mode == XCmode) + return TARGET_64BIT ? 4 : 6; + return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD); + } + if (COMPLEX_MODE_P (mode)) + return 2; + if (mode == V64SFmode || mode == V64SImode) + return 4; + return 1; } -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D - in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */ +/* Implement TARGET_HARD_REGNO_MODE_OK. */ static bool -expand_vec_perm_pshufb (struct expand_vec_perm_d *d) +ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode) { - unsigned i, nelt, eltsz, mask; - unsigned char perm[64]; - machine_mode vmode = V16QImode; - rtx rperm[64], vperm, target, op0, op1; - - nelt = d->nelt; - - if (!d->one_operand_p) - { - if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16) - { - if (TARGET_AVX2 - && valid_perm_using_mode_p (V2TImode, d)) - { - if (d->testing_p) - return true; - - /* Use vperm2i128 insn. The pattern uses - V4DImode instead of V2TImode. */ - target = d->target; - if (d->vmode != V4DImode) - target = gen_reg_rtx (V4DImode); - op0 = gen_lowpart (V4DImode, d->op0); - op1 = gen_lowpart (V4DImode, d->op1); - rperm[0] - = GEN_INT ((d->perm[0] / (nelt / 2)) - | ((d->perm[nelt / 2] / (nelt / 2)) * 16)); - emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0])); - if (target != d->target) - emit_move_insn (d->target, gen_lowpart (d->vmode, target)); - return true; - } - return false; - } - } - else + /* Flags and only flags can only hold CCmode values. */ + if (CC_REGNO_P (regno)) + return GET_MODE_CLASS (mode) == MODE_CC; + if (GET_MODE_CLASS (mode) == MODE_CC + || GET_MODE_CLASS (mode) == MODE_RANDOM + || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT) + return false; + if (STACK_REGNO_P (regno)) + return VALID_FP_MODE_P (mode); + if (MASK_REGNO_P (regno)) + return (VALID_MASK_REG_MODE (mode) + || (TARGET_AVX512BW + && VALID_MASK_AVX512BW_MODE (mode))); + if (SSE_REGNO_P (regno)) { - if (GET_MODE_SIZE (d->vmode) == 16) - { - if (!TARGET_SSSE3) - return false; - } - else if (GET_MODE_SIZE (d->vmode) == 32) - { - if (!TARGET_AVX2) - return false; - - /* V4DImode should be already handled through - expand_vselect by vpermq instruction. 
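As an illustration (not part of the patch itself): when ix86_register_move_cost above decides that a copy must bounce through memory, the reported price is the sum of the two per-class memory-move costs plus two flat 20-unit penalties. A sketch with invented parameter names; the real code obtains the per-class costs from inline_memory_move_cost in its max-of-load-and-store form:

    #include <stdbool.h>

    static int
    reg_move_via_memory_cost_model (int class1_mem_cost, int class2_mem_cost,
                                    bool memory_mismatch_stall,
                                    bool fp_mmx_overlap)
    {
      int cost = 1;                     /* base cost of the extra copy */

      cost += class1_mem_cost;          /* spill on the source side */
      cost += class2_mem_cost;          /* fill on the destination side */
      if (memory_mismatch_stall)
        cost += 20;   /* several narrow stores feeding one wide load */
      if (fp_mmx_overlap)
        cost += 20;   /* x87 and MMX share registers; a mode switch is needed */
      return cost;
    }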
*/ - gcc_assert (d->vmode != V4DImode); - - vmode = V32QImode; - if (d->vmode == V8SImode - || d->vmode == V16HImode - || d->vmode == V32QImode) - { - /* First see if vpermq can be used for - V8SImode/V16HImode/V32QImode. */ - if (valid_perm_using_mode_p (V4DImode, d)) - { - for (i = 0; i < 4; i++) - perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3; - if (d->testing_p) - return true; - target = gen_reg_rtx (V4DImode); - if (expand_vselect (target, gen_lowpart (V4DImode, d->op0), - perm, 4, false)) - { - emit_move_insn (d->target, - gen_lowpart (d->vmode, target)); - return true; - } - return false; - } - - /* Next see if vpermd can be used. */ - if (valid_perm_using_mode_p (V8SImode, d)) - vmode = V8SImode; - } - /* Or if vpermps can be used. */ - else if (d->vmode == V8SFmode) - vmode = V8SImode; + /* We implement the move patterns for all vector modes into and + out of SSE registers, even when no operation instructions + are available. */ - if (vmode == V32QImode) - { - /* vpshufb only works intra lanes, it is not - possible to shuffle bytes in between the lanes. */ - for (i = 0; i < nelt; ++i) - if ((d->perm[i] ^ i) & (nelt / 2)) - return false; - } - } - else if (GET_MODE_SIZE (d->vmode) == 64) - { - if (!TARGET_AVX512BW) - return false; + /* For AVX-512 we allow, regardless of regno: + - XI mode + - any of 512-bit wide vector mode + - any scalar mode. */ + if (TARGET_AVX512F + && (mode == XImode + || VALID_AVX512F_REG_MODE (mode) + || VALID_AVX512F_SCALAR_MODE (mode))) + return true; - /* If vpermq didn't work, vpshufb won't work either. */ - if (d->vmode == V8DFmode || d->vmode == V8DImode) - return false; + /* For AVX-5124FMAPS or AVX-5124VNNIW + allow V64SF and V64SI modes for special regnos. */ + if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW) + && (mode == V64SFmode || mode == V64SImode) + && MOD4_SSE_REGNO_P (regno)) + return true; - vmode = V64QImode; - if (d->vmode == V16SImode - || d->vmode == V32HImode - || d->vmode == V64QImode) - { - /* First see if vpermq can be used for - V16SImode/V32HImode/V64QImode. */ - if (valid_perm_using_mode_p (V8DImode, d)) - { - for (i = 0; i < 8; i++) - perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7; - if (d->testing_p) - return true; - target = gen_reg_rtx (V8DImode); - if (expand_vselect (target, gen_lowpart (V8DImode, d->op0), - perm, 8, false)) - { - emit_move_insn (d->target, - gen_lowpart (d->vmode, target)); - return true; - } - return false; - } + /* TODO check for QI/HI scalars. */ + /* AVX512VL allows sse regs16+ for 128/256 bit modes. */ + if (TARGET_AVX512VL + && (mode == OImode + || mode == TImode + || VALID_AVX256_REG_MODE (mode) + || VALID_AVX512VL_128_REG_MODE (mode))) + return true; - /* Next see if vpermd can be used. */ - if (valid_perm_using_mode_p (V16SImode, d)) - vmode = V16SImode; - } - /* Or if vpermps can be used. */ - else if (d->vmode == V16SFmode) - vmode = V16SImode; - if (vmode == V64QImode) - { - /* vpshufb only works intra lanes, it is not - possible to shuffle bytes in between the lanes. */ - for (i = 0; i < nelt; ++i) - if ((d->perm[i] ^ i) & (nelt / 4)) - return false; - } - } - else + /* xmm16-xmm31 are only available for AVX-512. 
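As an illustration (not part of the patch itself): the ix86_hard_regno_mode_ok checks quoted above encode the rule that xmm16-xmm31 exist only under AVX-512, and that keeping 128/256-bit vectors in them additionally needs AVX-512VL. A deliberately reduced sketch covering only that corner; the names are invented, and the scalar-mode, OImode and MMX cases of the real hook are omitted:

    #include <stdbool.h>

    static bool
    ext_sse_reg_vector_mode_ok (unsigned int mode_bits,
                                bool have_avx512f, bool have_avx512vl)
    {
      if (!have_avx512f)
        return false;            /* xmm16-xmm31 are an AVX-512 feature */
      if (mode_bits == 512)
        return true;             /* full ZMM-width modes need only AVX-512F */
      if (mode_bits == 128 || mode_bits == 256)
        return have_avx512vl;    /* narrower vectors need AVX-512VL */
      return false;
    }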
*/ + if (EXT_REX_SSE_REGNO_P (regno)) return false; - } - - if (d->testing_p) - return true; - if (vmode == V8SImode) - for (i = 0; i < 8; ++i) - rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7); - else if (vmode == V16SImode) - for (i = 0; i < 16; ++i) - rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15); - else + /* OImode and AVX modes are available only when AVX is enabled. */ + return ((TARGET_AVX + && VALID_AVX256_REG_OR_OI_MODE (mode)) + || VALID_SSE_REG_MODE (mode) + || VALID_SSE2_REG_MODE (mode) + || VALID_MMX_REG_MODE (mode) + || VALID_MMX_REG_MODE_3DNOW (mode)); + } + if (MMX_REGNO_P (regno)) { - eltsz = GET_MODE_UNIT_SIZE (d->vmode); - if (!d->one_operand_p) - mask = 2 * nelt - 1; - else if (vmode == V16QImode) - mask = nelt - 1; - else if (vmode == V64QImode) - mask = nelt / 4 - 1; - else - mask = nelt / 2 - 1; - - for (i = 0; i < nelt; ++i) - { - unsigned j, e = d->perm[i] & mask; - for (j = 0; j < eltsz; ++j) - rperm[i * eltsz + j] = GEN_INT (e * eltsz + j); - } - } - - vperm = gen_rtx_CONST_VECTOR (vmode, - gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm)); - vperm = force_reg (vmode, vperm); - - target = d->target; - if (d->vmode != vmode) - target = gen_reg_rtx (vmode); - op0 = gen_lowpart (vmode, d->op0); - if (d->one_operand_p) - { - if (vmode == V16QImode) - emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm)); - else if (vmode == V32QImode) - emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm)); - else if (vmode == V64QImode) - emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm)); - else if (vmode == V8SFmode) - emit_insn (gen_avx2_permvarv8sf (target, op0, vperm)); - else if (vmode == V8SImode) - emit_insn (gen_avx2_permvarv8si (target, op0, vperm)); - else if (vmode == V16SFmode) - emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm)); - else if (vmode == V16SImode) - emit_insn (gen_avx512f_permvarv16si (target, op0, vperm)); - else - gcc_unreachable (); + /* We implement the move patterns for 3DNOW modes even in MMX mode, + so if the register is available at all, then we can move data of + the given mode into or out of it. */ + return (VALID_MMX_REG_MODE (mode) + || VALID_MMX_REG_MODE_3DNOW (mode)); } - else + + if (mode == QImode) { - op1 = gen_lowpart (vmode, d->op1); - emit_insn (gen_xop_pperm (target, op0, op1, vperm)); + /* Take care for QImode values - they can be in non-QI regs, + but then they do cause partial register stalls. */ + if (ANY_QI_REGNO_P (regno)) + return true; + if (!TARGET_PARTIAL_REG_STALL) + return true; + /* LRA checks if the hard register is OK for the given mode. + QImode values can live in non-QI regs, so we allow all + registers here. */ + if (lra_in_progress) + return true; + return !can_create_pseudo_p (); } - if (target != d->target) - emit_move_insn (d->target, gen_lowpart (d->vmode, target)); + /* We handle both integer and floats in the general purpose registers. */ + else if (VALID_INT_MODE_P (mode)) + return true; + else if (VALID_FP_MODE_P (mode)) + return true; + else if (VALID_DFP_MODE_P (mode)) + return true; + /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go + on to use that value in smaller contexts, this can easily force a + pseudo to be allocated to GENERAL_REGS. Since this is no worse than + supporting DImode, allow it. */ + else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode)) + return true; - return true; + return false; } -/* For V*[QHS]Imode permutations, check if the same permutation - can't be performed in a 2x, 4x or 8x wider inner mode. 
*/ +/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that + saves SSE registers across calls is Win64 (thus no need to check the + current ABI here), and with AVX enabled Win64 only guarantees that + the low 16 bytes are saved. */ static bool -canonicalize_vector_int_perm (const struct expand_vec_perm_d *d, - struct expand_vec_perm_d *nd) +ix86_hard_regno_call_part_clobbered (rtx_insn *insn ATTRIBUTE_UNUSED, + unsigned int regno, machine_mode mode) { - int i; - machine_mode mode = VOIDmode; - - switch (d->vmode) - { - case E_V16QImode: mode = V8HImode; break; - case E_V32QImode: mode = V16HImode; break; - case E_V64QImode: mode = V32HImode; break; - case E_V8HImode: mode = V4SImode; break; - case E_V16HImode: mode = V8SImode; break; - case E_V32HImode: mode = V16SImode; break; - case E_V4SImode: mode = V2DImode; break; - case E_V8SImode: mode = V4DImode; break; - case E_V16SImode: mode = V8DImode; break; - default: return false; - } - for (i = 0; i < d->nelt; i += 2) - if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1) - return false; - nd->vmode = mode; - nd->nelt = d->nelt / 2; - for (i = 0; i < nd->nelt; i++) - nd->perm[i] = d->perm[2 * i] / 2; - if (GET_MODE_INNER (mode) != DImode) - canonicalize_vector_int_perm (nd, nd); - if (nd != d) - { - nd->one_operand_p = d->one_operand_p; - nd->testing_p = d->testing_p; - if (d->op0 == d->op1) - nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0); - else - { - nd->op0 = gen_lowpart (nd->vmode, d->op0); - nd->op1 = gen_lowpart (nd->vmode, d->op1); - } - if (d->testing_p) - nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1); - else - nd->target = gen_reg_rtx (nd->vmode); - } - return true; + return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16; } -/* Try to expand one-operand permutation with constant mask. */ +/* A subroutine of ix86_modes_tieable_p. Return true if MODE is a + tieable integer mode. */ static bool -ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d) +ix86_tieable_integer_mode_p (machine_mode mode) { - machine_mode mode = GET_MODE (d->op0); - machine_mode maskmode = mode; - rtx (*gen) (rtx, rtx, rtx) = NULL; - rtx target, op0, mask; - rtx vec[64]; + switch (mode) + { + case E_HImode: + case E_SImode: + return true; - if (!rtx_equal_p (d->op0, d->op1)) - return false; + case E_QImode: + return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL; - if (!TARGET_AVX512F) - return false; + case E_DImode: + return TARGET_64BIT; - switch (mode) - { - case E_V16SImode: - gen = gen_avx512f_permvarv16si; - break; - case E_V16SFmode: - gen = gen_avx512f_permvarv16sf; - maskmode = V16SImode; - break; - case E_V8DImode: - gen = gen_avx512f_permvarv8di; - break; - case E_V8DFmode: - gen = gen_avx512f_permvarv8df; - maskmode = V8DImode; - break; default: return false; } - - target = d->target; - op0 = d->op0; - for (int i = 0; i < d->nelt; ++i) - vec[i] = GEN_INT (d->perm[i]); - mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); - emit_insn (gen (target, op0, force_reg (maskmode, mask))); - return true; } -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D - in a single instruction. */ +/* Implement TARGET_MODES_TIEABLE_P. + + Return true if MODE1 is accessible in a register that can hold MODE2 + without copying. That is, all register classes that can hold MODE2 + can also hold MODE1. 
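As an illustration (not part of the patch itself): the part-clobbered hook above reduces to a one-line predicate, since Win64 only guarantees that the low 16 bytes of the callee-saved XMM registers survive a call. Sketch, with invented names:

    #include <stdbool.h>

    static bool
    call_part_clobbered_model (bool in_sse_reg, unsigned int mode_bytes)
    {
      /* Only the low 16 bytes are preserved, so anything wider kept in an
         SSE register is partially clobbered by the call.  */
      return in_sse_reg && mode_bytes > 16;
    }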
*/ static bool -expand_vec_perm_1 (struct expand_vec_perm_d *d) +ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2) { - unsigned i, nelt = d->nelt; - struct expand_vec_perm_d nd; - - /* Check plain VEC_SELECT first, because AVX has instructions that could - match both SEL and SEL+CONCAT, but the plain SEL will allow a memory - input where SEL+CONCAT may not. */ - if (d->one_operand_p) - { - int mask = nelt - 1; - bool identity_perm = true; - bool broadcast_perm = true; - - for (i = 0; i < nelt; i++) - { - nd.perm[i] = d->perm[i] & mask; - if (nd.perm[i] != i) - identity_perm = false; - if (nd.perm[i]) - broadcast_perm = false; - } + if (mode1 == mode2) + return true; - if (identity_perm) - { - if (!d->testing_p) - emit_move_insn (d->target, d->op0); - return true; - } - else if (broadcast_perm && TARGET_AVX2) - { - /* Use vpbroadcast{b,w,d}. */ - rtx (*gen) (rtx, rtx) = NULL; - switch (d->vmode) - { - case E_V64QImode: - if (TARGET_AVX512BW) - gen = gen_avx512bw_vec_dupv64qi_1; - break; - case E_V32QImode: - gen = gen_avx2_pbroadcastv32qi_1; - break; - case E_V32HImode: - if (TARGET_AVX512BW) - gen = gen_avx512bw_vec_dupv32hi_1; - break; - case E_V16HImode: - gen = gen_avx2_pbroadcastv16hi_1; - break; - case E_V16SImode: - if (TARGET_AVX512F) - gen = gen_avx512f_vec_dupv16si_1; - break; - case E_V8SImode: - gen = gen_avx2_pbroadcastv8si_1; - break; - case E_V16QImode: - gen = gen_avx2_pbroadcastv16qi; - break; - case E_V8HImode: - gen = gen_avx2_pbroadcastv8hi; - break; - case E_V16SFmode: - if (TARGET_AVX512F) - gen = gen_avx512f_vec_dupv16sf_1; - break; - case E_V8SFmode: - gen = gen_avx2_vec_dupv8sf_1; - break; - case E_V8DFmode: - if (TARGET_AVX512F) - gen = gen_avx512f_vec_dupv8df_1; - break; - case E_V8DImode: - if (TARGET_AVX512F) - gen = gen_avx512f_vec_dupv8di_1; - break; - /* For other modes prefer other shuffles this function creates. */ - default: break; - } - if (gen != NULL) - { - if (!d->testing_p) - emit_insn (gen (d->target, d->op0)); - return true; - } - } + if (ix86_tieable_integer_mode_p (mode1) + && ix86_tieable_integer_mode_p (mode2)) + return true; - if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p)) - return true; + /* MODE2 being XFmode implies fp stack or general regs, which means we + can tie any smaller floating point modes to it. Note that we do not + tie this with TFmode. */ + if (mode2 == XFmode) + return mode1 == SFmode || mode1 == DFmode; - /* There are plenty of patterns in sse.md that are written for - SEL+CONCAT and are not replicated for a single op. Perhaps - that should be changed, to avoid the nastiness here. */ + /* MODE2 being DFmode implies fp stack, general or sse regs, which means + that we can tie it with SFmode. */ + if (mode2 == DFmode) + return mode1 == SFmode; - /* Recognize interleave style patterns, which means incrementing - every other permutation operand. */ - for (i = 0; i < nelt; i += 2) - { - nd.perm[i] = d->perm[i] & mask; - nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt; - } - if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt, - d->testing_p)) - return true; + /* If MODE2 is only appropriate for an SSE register, then tie with + any other mode acceptable to SSE registers. 
*/ + if (GET_MODE_SIZE (mode2) == 64 + && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) + return (GET_MODE_SIZE (mode1) == 64 + && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); + if (GET_MODE_SIZE (mode2) == 32 + && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) + return (GET_MODE_SIZE (mode1) == 32 + && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); + if (GET_MODE_SIZE (mode2) == 16 + && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) + return (GET_MODE_SIZE (mode1) == 16 + && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); - /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */ - if (nelt >= 4) - { - for (i = 0; i < nelt; i += 4) - { - nd.perm[i + 0] = d->perm[i + 0] & mask; - nd.perm[i + 1] = d->perm[i + 1] & mask; - nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt; - nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt; - } + /* If MODE2 is appropriate for an MMX register, then tie + with any other mode acceptable to MMX registers. */ + if (GET_MODE_SIZE (mode2) == 8 + && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2)) + return (GET_MODE_SIZE (mode1) == 8 + && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1)); - if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt, - d->testing_p)) - return true; - } - } + return false; +} - /* Try movss/movsd instructions. */ - if (expand_vec_perm_movs (d)) - return true; +/* Return the cost of moving between two registers of mode MODE. */ - /* Finally, try the fully general two operand permute. */ - if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt, - d->testing_p)) - return true; +static int +ix86_set_reg_reg_cost (machine_mode mode) +{ + unsigned int units = UNITS_PER_WORD; - /* Recognize interleave style patterns with reversed operands. */ - if (!d->one_operand_p) + switch (GET_MODE_CLASS (mode)) { - for (i = 0; i < nelt; ++i) - { - unsigned e = d->perm[i]; - if (e >= nelt) - e -= nelt; - else - e += nelt; - nd.perm[i] = e; - } - - if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt, - d->testing_p)) - return true; - } - - /* Try the SSE4.1 blend variable merge instructions. */ - if (expand_vec_perm_blend (d)) - return true; - - /* Try one of the AVX vpermil variable permutations. */ - if (expand_vec_perm_vpermil (d)) - return true; - - /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128, - vpshufb, vpermd, vpermps or vpermq variable permutation. */ - if (expand_vec_perm_pshufb (d)) - return true; + default: + break; - /* Try the AVX2 vpalignr instruction. */ - if (expand_vec_perm_palignr (d, true)) - return true; + case MODE_CC: + units = GET_MODE_SIZE (CCmode); + break; - /* Try the AVX512F vperm{s,d} instructions. */ - if (ix86_expand_vec_one_operand_perm_avx512 (d)) - return true; + case MODE_FLOAT: + if ((TARGET_SSE && mode == TFmode) + || (TARGET_80387 && mode == XFmode) + || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode) + || ((TARGET_80387 || TARGET_SSE) && mode == SFmode)) + units = GET_MODE_SIZE (mode); + break; - /* Try the AVX512F vpermt2/vpermi2 instructions. */ - if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d)) - return true; + case MODE_COMPLEX_FLOAT: + if ((TARGET_SSE && mode == TCmode) + || (TARGET_80387 && mode == XCmode) + || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode) + || ((TARGET_80387 || TARGET_SSE) && mode == SCmode)) + units = GET_MODE_SIZE (mode); + break; - /* See if we can get the same permutation in different vector integer - mode. 
*/ - if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) - { - if (!d->testing_p) - emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target)); - return true; + case MODE_VECTOR_INT: + case MODE_VECTOR_FLOAT: + if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode)) + || (TARGET_AVX && VALID_AVX256_REG_MODE (mode)) + || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode)) + || (TARGET_SSE && VALID_SSE_REG_MODE (mode)) + || (TARGET_MMX && VALID_MMX_REG_MODE (mode))) + units = GET_MODE_SIZE (mode); } - return false; + + /* Return the cost of moving between two registers of mode MODE, + assuming that the move will be in pieces of at most UNITS bytes. */ + return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units)); } -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D - in terms of a pair of pshuflw + pshufhw instructions. */ +/* Return cost of vector operation in MODE given that scalar version has + COST. */ -static bool -expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d) +static int +ix86_vec_cost (machine_mode mode, int cost) { - unsigned char perm2[MAX_VECT_LEN]; - unsigned i; - bool ok; - - if (d->vmode != V8HImode || !d->one_operand_p) - return false; + if (!VECTOR_MODE_P (mode)) + return cost; - /* The two permutations only operate in 64-bit lanes. */ - for (i = 0; i < 4; ++i) - if (d->perm[i] >= 4) - return false; - for (i = 4; i < 8; ++i) - if (d->perm[i] < 4) - return false; + if (GET_MODE_BITSIZE (mode) == 128 + && TARGET_SSE_SPLIT_REGS) + return cost * 2; + if (GET_MODE_BITSIZE (mode) > 128 + && TARGET_AVX128_OPTIMAL) + return cost * GET_MODE_BITSIZE (mode) / 128; + return cost; +} - if (d->testing_p) - return true; +/* Return cost of multiplication in MODE. */ - /* Emit the pshuflw. */ - memcpy (perm2, d->perm, 4); - for (i = 4; i < 8; ++i) - perm2[i] = i; - ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p); - gcc_assert (ok); +static int +ix86_multiplication_cost (const struct processor_costs *cost, + enum machine_mode mode) +{ + machine_mode inner_mode = mode; + if (VECTOR_MODE_P (mode)) + inner_mode = GET_MODE_INNER (mode); - /* Emit the pshufhw. */ - memcpy (perm2 + 4, d->perm + 4, 4); - for (i = 0; i < 4; ++i) - perm2[i] = i; - ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p); - gcc_assert (ok); + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + return inner_mode == DFmode ? cost->mulsd : cost->mulss; + else if (X87_FLOAT_MODE_P (mode)) + return cost->fmul; + else if (FLOAT_MODE_P (mode)) + return ix86_vec_cost (mode, + inner_mode == DFmode ? cost->mulsd : cost->mulss); + else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + { + /* vpmullq is used in this case. No emulation is needed. */ + if (TARGET_AVX512DQ) + return ix86_vec_cost (mode, cost->mulss); - return true; + /* V*QImode is emulated with 7-13 insns. */ + if (mode == V16QImode || mode == V32QImode) + { + int extra = 11; + if (TARGET_XOP && mode == V16QImode) + extra = 5; + else if (TARGET_SSSE3) + extra = 6; + return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * extra); + } + /* V*DImode is emulated with 5-8 insns. */ + else if (mode == V2DImode || mode == V4DImode) + { + if (TARGET_XOP && mode == V2DImode) + return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 3); + else + return ix86_vec_cost (mode, cost->mulss * 3 + cost->sse_op * 5); + } + /* Without sse4.1, we don't have PMULLD; it's emulated with 7 + insns, including two PMULUDQ. 
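The last statement of ix86_set_reg_reg_cost prices a register-to-register move as one instruction per piece of at most UNITS bytes. Below is a minimal standalone illustration of that arithmetic; the COSTS_N_INSNS scaling of four per instruction mirrors the usual rtl.h convention, and the byte sizes in main are picked for the example.

#include <stdio.h>

#define COSTS_N_INSNS(n) ((n) * 4)   /* conventional per-instruction scaling */
#define CEIL(a, b) (((a) + (b) - 1) / (b))

/* Cost of moving a MODE_SIZE-byte value in pieces of at most UNITS
   bytes, as in the return statement above.  */
static int
reg_reg_move_cost (int mode_size, int units)
{
  return COSTS_N_INSNS (CEIL (mode_size, units));
}

int
main (void)
{
  /* A 16-byte value moved through 8-byte general registers needs two
     instructions; through a 16-byte SSE register it needs one.  */
  printf ("16 bytes, 8-byte pieces:  %d\n", reg_reg_move_cost (16, 8));
  printf ("16 bytes, 16-byte pieces: %d\n", reg_reg_move_cost (16, 16));
  return 0;
}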
*/ + else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX)) + return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5); + else + return ix86_vec_cost (mode, cost->mulss); + } + else + return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7); } -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify - the permutation using the SSSE3 palignr instruction. This succeeds - when all of the elements in PERM fit within one vector and we merely - need to shift them down so that a single vector permutation has a - chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only - the vpalignr instruction itself can perform the requested permutation. */ +/* Return cost of multiplication in MODE. */ + +static int +ix86_division_cost (const struct processor_costs *cost, + enum machine_mode mode) +{ + machine_mode inner_mode = mode; + if (VECTOR_MODE_P (mode)) + inner_mode = GET_MODE_INNER (mode); -static bool -expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p) -{ - unsigned i, nelt = d->nelt; - unsigned min, max, minswap, maxswap; - bool in_order, ok, swap = false; - rtx shift, target; - struct expand_vec_perm_d dcopy; - - /* Even with AVX, palignr only operates on 128-bit vectors, - in AVX2 palignr operates on both 128-bit lanes. */ - if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) - && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32)) - return false; + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + return inner_mode == DFmode ? cost->divsd : cost->divss; + else if (X87_FLOAT_MODE_P (mode)) + return cost->fdiv; + else if (FLOAT_MODE_P (mode)) + return ix86_vec_cost (mode, + inner_mode == DFmode ? cost->divsd : cost->divss); + else + return cost->divide[MODE_INDEX (mode)]; +} - min = 2 * nelt; - max = 0; - minswap = 2 * nelt; - maxswap = 0; - for (i = 0; i < nelt; ++i) - { - unsigned e = d->perm[i]; - unsigned eswap = d->perm[i] ^ nelt; - if (GET_MODE_SIZE (d->vmode) == 32) - { - e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1); - eswap = e ^ (nelt / 2); - } - if (e < min) - min = e; - if (e > max) - max = e; - if (eswap < minswap) - minswap = eswap; - if (eswap > maxswap) - maxswap = eswap; - } - if (min == 0 - || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt)) - { - if (d->one_operand_p - || minswap == 0 - || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32 - ? nelt / 2 : nelt)) - return false; - swap = true; - min = minswap; - max = maxswap; - } +#define COSTS_N_BYTES(N) ((N) * 2) - /* Given that we have SSSE3, we know we'll be able to implement the - single operand permutation after the palignr with pshufb for - 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed - first. */ - if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p) - return true; +/* Return cost of shift in MODE. + If CONSTANT_OP1 is true, the op1 value is known and set in OP1_VAL. + AND_IN_OP1 specify in op1 is result of and and SHIFT_AND_TRUNCATE + if op1 is a result of subreg. - dcopy = *d; - if (swap) - { - dcopy.op0 = d->op1; - dcopy.op1 = d->op0; - for (i = 0; i < nelt; ++i) - dcopy.perm[i] ^= nelt; - } + SKIP_OP0/1 is set to true if cost of OP0/1 should be ignored. 
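For the byte-element vector modes the multiplication cost above is a formula rather than a table entry: two multiplies plus a fix-up sequence whose length depends on the available ISA extensions. Here is a standalone sketch of that V*QImode formula; struct costs is a cut-down stand-in for processor_costs, the numbers in main are invented, and the real code applies the XOP shortcut to V16QImode only.

#include <stdbool.h>
#include <stdio.h>

/* Only the two processor_costs fields the formula uses.  */
struct costs { int mulss; int sse_op; };

/* Emulated byte-element vector multiply as priced above: two multiplies
   plus 5 (XOP), 6 (SSSE3) or 11 (plain SSE2) fix-up operations.  */
static int
qi_vector_mult_cost (const struct costs *c, bool have_xop, bool have_ssse3)
{
  int extra = 11;
  if (have_xop)
    extra = 5;
  else if (have_ssse3)
    extra = 6;
  return c->mulss * 2 + c->sse_op * extra;
}

int
main (void)
{
  struct costs generic = { 16, 4 };   /* invented latencies */
  printf ("SSE2:  %d\n", qi_vector_mult_cost (&generic, false, false));
  printf ("SSSE3: %d\n", qi_vector_mult_cost (&generic, false, true));
  printf ("XOP:   %d\n", qi_vector_mult_cost (&generic, true, false));
  return 0;
}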
*/ - in_order = true; - for (i = 0; i < nelt; ++i) +static int +ix86_shift_rotate_cost (const struct processor_costs *cost, + enum machine_mode mode, bool constant_op1, + HOST_WIDE_INT op1_val, + bool speed, + bool and_in_op1, + bool shift_and_truncate, + bool *skip_op0, bool *skip_op1) +{ + if (skip_op0) + *skip_op0 = *skip_op1 = false; + if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) { - unsigned e = dcopy.perm[i]; - if (GET_MODE_SIZE (d->vmode) == 32 - && e >= nelt - && (e & (nelt / 2 - 1)) < min) - e = e - min - (nelt / 2); + /* V*QImode is emulated with 1-11 insns. */ + if (mode == V16QImode || mode == V32QImode) + { + int count = 11; + if (TARGET_XOP && mode == V16QImode) + { + /* For XOP we use vpshab, which requires a broadcast of the + value to the variable shift insn. For constants this + means a V16Q const in mem; even when we can perform the + shift with one insn set the cost to prefer paddb. */ + if (constant_op1) + { + if (skip_op1) + *skip_op1 = true; + return ix86_vec_cost (mode, + cost->sse_op + + (speed + ? 2 + : COSTS_N_BYTES + (GET_MODE_UNIT_SIZE (mode)))); + } + count = 3; + } + else if (TARGET_SSSE3) + count = 7; + return ix86_vec_cost (mode, cost->sse_op * count); + } else - e = e - min; - if (e != i) - in_order = false; - dcopy.perm[i] = e; - } - dcopy.one_operand_p = true; - - if (single_insn_only_p && !in_order) - return false; - - /* For AVX2, test whether we can permute the result in one instruction. */ - if (d->testing_p) - { - if (in_order) - return true; - dcopy.op1 = dcopy.op0; - return expand_vec_perm_1 (&dcopy); + return ix86_vec_cost (mode, cost->sse_op); } - - shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode)); - if (GET_MODE_SIZE (d->vmode) == 16) + if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) { - target = gen_reg_rtx (TImode); - emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1), - gen_lowpart (TImode, dcopy.op0), shift)); + if (constant_op1) + { + if (op1_val > 32) + return cost->shift_const + COSTS_N_INSNS (2); + else + return cost->shift_const * 2; + } + else + { + if (and_in_op1) + return cost->shift_var * 2; + else + return cost->shift_var * 6 + COSTS_N_INSNS (2); + } } else { - target = gen_reg_rtx (V2TImode); - emit_insn (gen_avx2_palignrv2ti (target, - gen_lowpart (V2TImode, dcopy.op1), - gen_lowpart (V2TImode, dcopy.op0), - shift)); - } - - dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target); - - /* Test for the degenerate case where the alignment by itself - produces the desired permutation. */ - if (in_order) - { - emit_move_insn (d->target, dcopy.op0); - return true; + if (constant_op1) + return cost->shift_const; + else if (shift_and_truncate) + { + if (skip_op0) + *skip_op0 = *skip_op1 = true; + /* Return the cost after shift-and truncation. */ + return cost->shift_var; + } + else + return cost->shift_var; } - - ok = expand_vec_perm_1 (&dcopy); - gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32); - - return ok; + return cost->shift_const; } -/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify - the permutation using the SSE4_1 pblendv instruction. Potentially - reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */ +/* Compute a (partial) cost for rtx X. Return true if the complete + cost has been computed, and false if subexpressions should be + scanned. In either case, *TOTAL contains the cost result. 
*/ static bool -expand_vec_perm_pblendv (struct expand_vec_perm_d *d) +ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, + int *total, bool speed) { - unsigned i, which, nelt = d->nelt; - struct expand_vec_perm_d dcopy, dcopy1; - machine_mode vmode = d->vmode; - bool ok; - - /* Use the same checks as in expand_vec_perm_blend. */ - if (d->one_operand_p) - return false; - if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) - ; - else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) - ; - else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16) - ; - else - return false; - - /* Figure out where permutation elements stay not in their - respective lanes. */ - for (i = 0, which = 0; i < nelt; ++i) - { - unsigned e = d->perm[i]; - if (e != i) - which |= (e < nelt ? 1 : 2); - } - /* We can pblend the part where elements stay not in their - respective lanes only when these elements are all in one - half of a permutation. - {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective - lanes, but both 8 and 9 >= 8 - {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their - respective lanes and 8 >= 8, but 2 not. */ - if (which != 1 && which != 2) - return false; - if (d->testing_p && GET_MODE_SIZE (vmode) == 16) - return true; - - /* First we apply one operand permutation to the part where - elements stay not in their respective lanes. */ - dcopy = *d; - if (which == 2) - dcopy.op0 = dcopy.op1 = d->op1; - else - dcopy.op0 = dcopy.op1 = d->op0; - if (!d->testing_p) - dcopy.target = gen_reg_rtx (vmode); - dcopy.one_operand_p = true; - - for (i = 0; i < nelt; ++i) - dcopy.perm[i] = d->perm[i] & (nelt - 1); - - ok = expand_vec_perm_1 (&dcopy); - if (GET_MODE_SIZE (vmode) != 16 && !ok) - return false; - else - gcc_assert (ok); - if (d->testing_p) - return true; - - /* Next we put permuted elements into their positions. */ - dcopy1 = *d; - if (which == 2) - dcopy1.op1 = dcopy.target; - else - dcopy1.op0 = dcopy.target; - - for (i = 0; i < nelt; ++i) - dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i); + rtx mask; + enum rtx_code code = GET_CODE (x); + enum rtx_code outer_code = (enum rtx_code) outer_code_i; + const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost; + int src_cost; - ok = expand_vec_perm_blend (&dcopy1); - gcc_assert (ok); + switch (code) + { + case SET: + if (register_operand (SET_DEST (x), VOIDmode) + && register_operand (SET_SRC (x), VOIDmode)) + { + *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x))); + return true; + } - return true; -} + if (register_operand (SET_SRC (x), VOIDmode)) + /* Avoid potentially incorrect high cost from rtx_costs + for non-tieable SUBREGs. */ + src_cost = 0; + else + { + src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed); -static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d); + if (CONSTANT_P (SET_SRC (x))) + /* Constant costs assume a base value of COSTS_N_INSNS (1) and add + a small value, possibly zero for cheap constants. */ + src_cost += COSTS_N_INSNS (1); + } -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify - a two vector permutation into a single vector permutation by using - an interleave operation to merge the vectors. 
*/ + *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed); + return true; -static bool -expand_vec_perm_interleave2 (struct expand_vec_perm_d *d) -{ - struct expand_vec_perm_d dremap, dfinal; - unsigned i, nelt = d->nelt, nelt2 = nelt / 2; - unsigned HOST_WIDE_INT contents; - unsigned char remap[2 * MAX_VECT_LEN]; - rtx_insn *seq; - bool ok, same_halves = false; + case CONST_INT: + case CONST: + case LABEL_REF: + case SYMBOL_REF: + if (x86_64_immediate_operand (x, VOIDmode)) + *total = 0; + else + *total = 1; + return true; - if (GET_MODE_SIZE (d->vmode) == 16) - { - if (d->one_operand_p) - return false; - } - else if (GET_MODE_SIZE (d->vmode) == 32) - { - if (!TARGET_AVX) - return false; - /* For 32-byte modes allow even d->one_operand_p. - The lack of cross-lane shuffling in some instructions - might prevent a single insn shuffle. */ - dfinal = *d; - dfinal.testing_p = true; - /* If expand_vec_perm_interleave3 can expand this into - a 3 insn sequence, give up and let it be expanded as - 3 insn sequence. While that is one insn longer, - it doesn't need a memory operand and in the common - case that both interleave low and high permutations - with the same operands are adjacent needs 4 insns - for both after CSE. */ - if (expand_vec_perm_interleave3 (&dfinal)) - return false; - } - else - return false; + case CONST_DOUBLE: + if (IS_STACK_MODE (mode)) + switch (standard_80387_constant_p (x)) + { + case -1: + case 0: + break; + case 1: /* 0.0 */ + *total = 1; + return true; + default: /* Other constants */ + *total = 2; + return true; + } + /* FALLTHRU */ - /* Examine from whence the elements come. */ - contents = 0; - for (i = 0; i < nelt; ++i) - contents |= HOST_WIDE_INT_1U << d->perm[i]; + case CONST_VECTOR: + switch (standard_sse_constant_p (x, mode)) + { + case 0: + break; + case 1: /* 0: xor eliminates false dependency */ + *total = 0; + return true; + default: /* -1: cmp contains false dependency */ + *total = 1; + return true; + } + /* FALLTHRU */ - memset (remap, 0xff, sizeof (remap)); - dremap = *d; + case CONST_WIDE_INT: + /* Fall back to (MEM (SYMBOL_REF)), since that's where + it'll probably end up. Add a penalty for size. */ + *total = (COSTS_N_INSNS (1) + + (!TARGET_64BIT && flag_pic) + + (GET_MODE_SIZE (mode) <= 4 + ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2)); + return true; - if (GET_MODE_SIZE (d->vmode) == 16) - { - unsigned HOST_WIDE_INT h1, h2, h3, h4; + case ZERO_EXTEND: + /* The zero extensions is often completely free on x86_64, so make + it as cheap as possible. */ + if (TARGET_64BIT && mode == DImode + && GET_MODE (XEXP (x, 0)) == SImode) + *total = 1; + else if (TARGET_ZERO_EXTEND_WITH_AND) + *total = cost->add; + else + *total = cost->movzx; + return false; - /* Split the two input vectors into 4 halves. */ - h1 = (HOST_WIDE_INT_1U << nelt2) - 1; - h2 = h1 << nelt2; - h3 = h2 << nelt2; - h4 = h3 << nelt2; + case SIGN_EXTEND: + *total = cost->movsx; + return false; - /* If the elements from the low halves use interleave low, and similarly - for interleave high. If the elements are from mis-matched halves, we - can use shufps for V4SF/V4SI or do a DImode shuffle. 
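The CONST_VECTOR case above charges nothing for an all-zero vector (materialised by xor-ing a register with itself) and one unit for an all-ones vector (a pcmpeq of a register against itself); anything else falls through and is priced like a constant-pool load. The sketch below is a rough analogue of that classification over raw 64-bit words, not the standard_sse_constant_p interface itself, and the "load" cost is an invented placeholder.

#include <stdint.h>
#include <stdio.h>

/* Classify a vector constant the way the costing above does: all-zeros
   is free, all-ones is one cheap op, anything else needs a memory load.  */
static int
sse_constant_cost (const uint64_t *words, int nwords)
{
  int all_zero = 1, all_ones = 1;
  for (int i = 0; i < nwords; i++)
    {
      if (words[i] != 0)
        all_zero = 0;
      if (words[i] != UINT64_MAX)
        all_ones = 0;
    }
  if (all_zero)
    return 0;
  if (all_ones)
    return 1;
  return 8;   /* placeholder "constant-pool load" cost */
}

int
main (void)
{
  uint64_t zeros[2] = { 0, 0 };
  uint64_t ones[2] = { UINT64_MAX, UINT64_MAX };
  uint64_t other[2] = { 1, 2 };
  printf ("%d %d %d\n", sse_constant_cost (zeros, 2),
          sse_constant_cost (ones, 2), sse_constant_cost (other, 2));
  return 0;
}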
*/ - if ((contents & (h1 | h3)) == contents) - { - /* punpckl* */ - for (i = 0; i < nelt2; ++i) - { - remap[i] = i * 2; - remap[i + nelt] = i * 2 + 1; - dremap.perm[i * 2] = i; - dremap.perm[i * 2 + 1] = i + nelt; - } - if (!TARGET_SSE2 && d->vmode == V4SImode) - dremap.vmode = V4SFmode; - } - else if ((contents & (h2 | h4)) == contents) - { - /* punpckh* */ - for (i = 0; i < nelt2; ++i) - { - remap[i + nelt2] = i * 2; - remap[i + nelt + nelt2] = i * 2 + 1; - dremap.perm[i * 2] = i + nelt2; - dremap.perm[i * 2 + 1] = i + nelt + nelt2; - } - if (!TARGET_SSE2 && d->vmode == V4SImode) - dremap.vmode = V4SFmode; - } - else if ((contents & (h1 | h4)) == contents) + case ASHIFT: + if (SCALAR_INT_MODE_P (mode) + && GET_MODE_SIZE (mode) < UNITS_PER_WORD + && CONST_INT_P (XEXP (x, 1))) { - /* shufps */ - for (i = 0; i < nelt2; ++i) + HOST_WIDE_INT value = INTVAL (XEXP (x, 1)); + if (value == 1) { - remap[i] = i; - remap[i + nelt + nelt2] = i + nelt2; - dremap.perm[i] = i; - dremap.perm[i + nelt2] = i + nelt + nelt2; + *total = cost->add; + return false; } - if (nelt != 4) + if ((value == 2 || value == 3) + && cost->lea <= cost->shift_const) { - /* shufpd */ - dremap.vmode = V2DImode; - dremap.nelt = 2; - dremap.perm[0] = 0; - dremap.perm[1] = 3; + *total = cost->lea; + return false; } } - else if ((contents & (h2 | h3)) == contents) + /* FALLTHRU */ + + case ROTATE: + case ASHIFTRT: + case LSHIFTRT: + case ROTATERT: + bool skip_op0, skip_op1; + *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)), + CONST_INT_P (XEXP (x, 1)) + ? INTVAL (XEXP (x, 1)) : -1, + speed, + GET_CODE (XEXP (x, 1)) == AND, + SUBREG_P (XEXP (x, 1)) + && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND, + &skip_op0, &skip_op1); + if (skip_op0 || skip_op1) { - /* shufps */ - for (i = 0; i < nelt2; ++i) - { - remap[i + nelt2] = i; - remap[i + nelt] = i + nelt2; - dremap.perm[i] = i + nelt2; - dremap.perm[i + nelt2] = i + nelt; - } - if (nelt != 4) - { - /* shufpd */ - dremap.vmode = V2DImode; - dremap.nelt = 2; - dremap.perm[0] = 1; - dremap.perm[1] = 2; - } + if (!skip_op0) + *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed); + if (!skip_op1) + *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed); + return true; } - else - return false; - } - else - { - unsigned int nelt4 = nelt / 4, nzcnt = 0; - unsigned HOST_WIDE_INT q[8]; - unsigned int nonzero_halves[4]; + return false; - /* Split the two input vectors into 8 quarters. */ - q[0] = (HOST_WIDE_INT_1U << nelt4) - 1; - for (i = 1; i < 8; ++i) - q[i] = q[0] << (nelt4 * i); - for (i = 0; i < 4; ++i) - if (((q[2 * i] | q[2 * i + 1]) & contents) != 0) - { - nonzero_halves[nzcnt] = i; - ++nzcnt; - } + case FMA: + { + rtx sub; - if (nzcnt == 1) - { - gcc_assert (d->one_operand_p); - nonzero_halves[1] = nonzero_halves[0]; - same_halves = true; - } - else if (d->one_operand_p) - { - gcc_assert (nonzero_halves[0] == 0); - gcc_assert (nonzero_halves[1] == 1); - } + gcc_assert (FLOAT_MODE_P (mode)); + gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F); + + *total = ix86_vec_cost (mode, + GET_MODE_INNER (mode) == SFmode + ? cost->fmass : cost->fmasd); + *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed); + + /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. 
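The ASHIFT case above encodes the usual x86 strength reductions: a left shift by one can be issued as an add, and shifts by two or three can ride in the scale factor of an lea whenever lea is no more expensive than an immediate shift. A standalone sketch of that choice follows; the cost numbers in main are illustrative only.

#include <stdio.h>

/* Pick the cheaper pattern for x << amount on a narrow scalar mode,
   mirroring the special cases above.  */
static int
ashift_by_const_cost (int amount, int cost_add, int cost_lea,
                      int cost_shift_const)
{
  if (amount == 1)
    return cost_add;            /* add reg,reg */
  if ((amount == 2 || amount == 3) && cost_lea <= cost_shift_const)
    return cost_lea;            /* lea with scale 4 or 8 */
  return cost_shift_const;      /* plain shl reg,imm */
}

int
main (void)
{
  for (int n = 1; n <= 4; n++)
    printf ("x << %d -> cost %d\n", n, ashift_by_const_cost (n, 1, 1, 2));
  return 0;
}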
*/ + sub = XEXP (x, 0); + if (GET_CODE (sub) == NEG) + sub = XEXP (sub, 0); + *total += rtx_cost (sub, mode, FMA, 0, speed); - if (nzcnt <= 2) + sub = XEXP (x, 2); + if (GET_CODE (sub) == NEG) + sub = XEXP (sub, 0); + *total += rtx_cost (sub, mode, FMA, 2, speed); + return true; + } + + case MULT: + if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode)) { - if (d->perm[0] / nelt2 == nonzero_halves[1]) + rtx op0 = XEXP (x, 0); + rtx op1 = XEXP (x, 1); + int nbits; + if (CONST_INT_P (XEXP (x, 1))) { - /* Attempt to increase the likelihood that dfinal - shuffle will be intra-lane. */ - std::swap (nonzero_halves[0], nonzero_halves[1]); + unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1)); + for (nbits = 0; value != 0; value &= value - 1) + nbits++; } + else + /* This is arbitrary. */ + nbits = 7; - /* vperm2f128 or vperm2i128. */ - for (i = 0; i < nelt2; ++i) + /* Compute costs correctly for widening multiplication. */ + if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND) + && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2 + == GET_MODE_SIZE (mode)) { - remap[i + nonzero_halves[1] * nelt2] = i + nelt2; - remap[i + nonzero_halves[0] * nelt2] = i; - dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2; - dremap.perm[i] = i + nonzero_halves[0] * nelt2; + int is_mulwiden = 0; + machine_mode inner_mode = GET_MODE (op0); + + if (GET_CODE (op0) == GET_CODE (op1)) + is_mulwiden = 1, op1 = XEXP (op1, 0); + else if (CONST_INT_P (op1)) + { + if (GET_CODE (op0) == SIGN_EXTEND) + is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode) + == INTVAL (op1); + else + is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode)); + } + + if (is_mulwiden) + op0 = XEXP (op0, 0), mode = GET_MODE (op0); } - if (d->vmode != V8SFmode - && d->vmode != V4DFmode - && d->vmode != V8SImode) + *total = (cost->mult_init[MODE_INDEX (mode)] + + nbits * cost->mult_bit + + rtx_cost (op0, mode, outer_code, opno, speed) + + rtx_cost (op1, mode, outer_code, opno, speed)); + + return true; + } + *total = ix86_multiplication_cost (cost, mode); + return false; + + case DIV: + case UDIV: + case MOD: + case UMOD: + *total = ix86_division_cost (cost, mode); + return false; + + case PLUS: + if (GET_MODE_CLASS (mode) == MODE_INT + && GET_MODE_SIZE (mode) <= UNITS_PER_WORD) + { + if (GET_CODE (XEXP (x, 0)) == PLUS + && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT + && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1)) + && CONSTANT_P (XEXP (x, 1))) { - dremap.vmode = V8SImode; - dremap.nelt = 8; - for (i = 0; i < 4; ++i) + HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1)); + if (val == 2 || val == 4 || val == 8) { - dremap.perm[i] = i + nonzero_halves[0] * 4; - dremap.perm[i + 4] = i + nonzero_halves[1] * 4; + *total = cost->lea; + *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode, + outer_code, opno, speed); + *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode, + outer_code, opno, speed); + *total += rtx_cost (XEXP (x, 1), mode, + outer_code, opno, speed); + return true; } } - } - else if (d->one_operand_p) - return false; - else if (TARGET_AVX2 - && (contents & (q[0] | q[2] | q[4] | q[6])) == contents) - { - /* vpunpckl* */ - for (i = 0; i < nelt4; ++i) + else if (GET_CODE (XEXP (x, 0)) == MULT + && CONST_INT_P (XEXP (XEXP (x, 0), 1))) { - remap[i] = i * 2; - remap[i + nelt] = i * 2 + 1; - remap[i + nelt2] = i * 2 + nelt2; - remap[i + nelt + nelt2] = i * 2 + nelt2 + 1; - dremap.perm[i * 2] = i; - dremap.perm[i * 2 + 1] = i + nelt; - dremap.perm[i * 2 + nelt2] = i + nelt2; - dremap.perm[i * 2 + nelt2 + 1] = i + nelt 
+ nelt2; + HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1)); + if (val == 2 || val == 4 || val == 8) + { + *total = cost->lea; + *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, + outer_code, opno, speed); + *total += rtx_cost (XEXP (x, 1), mode, + outer_code, opno, speed); + return true; + } } - } - else if (TARGET_AVX2 - && (contents & (q[1] | q[3] | q[5] | q[7])) == contents) - { - /* vpunpckh* */ - for (i = 0; i < nelt4; ++i) + else if (GET_CODE (XEXP (x, 0)) == PLUS) { - remap[i + nelt4] = i * 2; - remap[i + nelt + nelt4] = i * 2 + 1; - remap[i + nelt2 + nelt4] = i * 2 + nelt2; - remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1; - dremap.perm[i * 2] = i + nelt4; - dremap.perm[i * 2 + 1] = i + nelt + nelt4; - dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4; - dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4; + /* Add with carry, ignore the cost of adding a carry flag. */ + if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode)) + *total = cost->add; + else + { + *total = cost->lea; + *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, + outer_code, opno, speed); + } + + *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode, + outer_code, opno, speed); + *total += rtx_cost (XEXP (x, 1), mode, + outer_code, opno, speed); + return true; } } - else - return false; - } + /* FALLTHRU */ - /* Use the remapping array set up above to move the elements from their - swizzled locations into their final destinations. */ - dfinal = *d; - for (i = 0; i < nelt; ++i) - { - unsigned e = remap[d->perm[i]]; - gcc_assert (e < nelt); - /* If same_halves is true, both halves of the remapped vector are the - same. Avoid cross-lane accesses if possible. */ - if (same_halves && i >= nelt2) + case MINUS: + /* Subtract with borrow, ignore the cost of subtracting a carry flag. */ + if (GET_MODE_CLASS (mode) == MODE_INT + && GET_MODE_SIZE (mode) <= UNITS_PER_WORD + && GET_CODE (XEXP (x, 0)) == MINUS + && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode)) { - gcc_assert (e < nelt2); - dfinal.perm[i] = e + nelt2; + *total = cost->add; + *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode, + outer_code, opno, speed); + *total += rtx_cost (XEXP (x, 1), mode, + outer_code, opno, speed); + return true; } - else - dfinal.perm[i] = e; - } - if (!d->testing_p) - { - dremap.target = gen_reg_rtx (dremap.vmode); - dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target); - } - dfinal.op1 = dfinal.op0; - dfinal.one_operand_p = true; - - /* Test if the final remap can be done with a single insn. For V4SFmode or - V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */ - start_sequence (); - ok = expand_vec_perm_1 (&dfinal); - seq = get_insns (); - end_sequence (); - - if (!ok) - return false; - - if (d->testing_p) - return true; - - if (dremap.vmode != dfinal.vmode) - { - dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0); - dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1); - } - - ok = expand_vec_perm_1 (&dremap); - gcc_assert (ok); - - emit_insn (seq); - return true; -} - -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify - a single vector cross-lane permutation into vpermq followed - by any of the single insn permutations. 
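The MULT case above sizes a multiplication by a constant from the number of set bits in that constant, using the classic clear-the-lowest-set-bit loop; each set bit contributes one mult_bit step. The loop is small enough to show on its own; the wrapper name and the demo values are invented.

#include <stdio.h>

/* Count set bits exactly as the nbits loop above does: every iteration
   clears the lowest set bit of VALUE.  */
static int
popcount_hwi (unsigned long long value)
{
  int nbits = 0;
  for (; value != 0; value &= value - 1)
    nbits++;
  return nbits;
}

int
main (void)
{
  /* Multiplying by 0x101 (two set bits) is modeled as cheaper than
     multiplying by 0xff (eight set bits).  */
  printf ("0x101 -> %d bits, 0xff -> %d bits\n",
          popcount_hwi (0x101), popcount_hwi (0xff));
  return 0;
}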
*/ - -static bool -expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d) -{ - struct expand_vec_perm_d dremap, dfinal; - unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4; - unsigned contents[2]; - bool ok; - - if (!(TARGET_AVX2 - && (d->vmode == V32QImode || d->vmode == V16HImode) - && d->one_operand_p)) - return false; - - contents[0] = 0; - contents[1] = 0; - for (i = 0; i < nelt2; ++i) - { - contents[0] |= 1u << (d->perm[i] / nelt4); - contents[1] |= 1u << (d->perm[i + nelt2] / nelt4); - } - for (i = 0; i < 2; ++i) - { - unsigned int cnt = 0; - for (j = 0; j < 4; ++j) - if ((contents[i] & (1u << j)) != 0 && ++cnt > 2) + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + { + *total = cost->addss; return false; - } - - if (d->testing_p) - return true; - - dremap = *d; - dremap.vmode = V4DImode; - dremap.nelt = 4; - dremap.target = gen_reg_rtx (V4DImode); - dremap.op0 = gen_lowpart (V4DImode, d->op0); - dremap.op1 = dremap.op0; - dremap.one_operand_p = true; - for (i = 0; i < 2; ++i) - { - unsigned int cnt = 0; - for (j = 0; j < 4; ++j) - if ((contents[i] & (1u << j)) != 0) - dremap.perm[2 * i + cnt++] = j; - for (; cnt < 2; ++cnt) - dremap.perm[2 * i + cnt] = 0; - } - - dfinal = *d; - dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target); - dfinal.op1 = dfinal.op0; - dfinal.one_operand_p = true; - for (i = 0, j = 0; i < nelt; ++i) - { - if (i == nelt2) - j = 2; - dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0); - if ((d->perm[i] / nelt4) == dremap.perm[j]) - ; - else if ((d->perm[i] / nelt4) == dremap.perm[j + 1]) - dfinal.perm[i] |= nelt4; - else - gcc_unreachable (); - } - - ok = expand_vec_perm_1 (&dremap); - gcc_assert (ok); - - ok = expand_vec_perm_1 (&dfinal); - gcc_assert (ok); - - return true; -} - -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand - a vector permutation using two instructions, vperm2f128 resp. - vperm2i128 followed by any single in-lane permutation. */ - -static bool -expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d) -{ - struct expand_vec_perm_d dfirst, dsecond; - unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm; - bool ok; + } + else if (X87_FLOAT_MODE_P (mode)) + { + *total = cost->fadd; + return false; + } + else if (FLOAT_MODE_P (mode)) + { + *total = ix86_vec_cost (mode, cost->addss); + return false; + } + /* FALLTHRU */ - if (!TARGET_AVX - || GET_MODE_SIZE (d->vmode) != 32 - || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)) - return false; + case AND: + case IOR: + case XOR: + if (GET_MODE_CLASS (mode) == MODE_INT + && GET_MODE_SIZE (mode) > UNITS_PER_WORD) + { + *total = (cost->add * 2 + + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed) + << (GET_MODE (XEXP (x, 0)) != DImode)) + + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed) + << (GET_MODE (XEXP (x, 1)) != DImode))); + return true; + } + /* FALLTHRU */ - dsecond = *d; - dsecond.one_operand_p = false; - dsecond.testing_p = true; - - /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128 - immediate. For perm < 16 the second permutation uses - d->op0 as first operand, for perm >= 16 it uses d->op1 - as first operand. The second operand is the result of - vperm2[fi]128. */ - for (perm = 0; perm < 32; perm++) - { - /* Ignore permutations which do not move anything cross-lane. */ - if (perm < 16) - { - /* The second shuffle for e.g. V4DFmode has - 0123 and ABCD operands. - Ignore AB23, as 23 is already in the second lane - of the first operand. 
*/ - if ((perm & 0xc) == (1 << 2)) continue; - /* And 01CD, as 01 is in the first lane of the first - operand. */ - if ((perm & 3) == 0) continue; - /* And 4567, as then the vperm2[fi]128 doesn't change - anything on the original 4567 second operand. */ - if ((perm & 0xf) == ((3 << 2) | 2)) continue; + case NEG: + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + { + *total = cost->sse_op; + return false; } - else + else if (X87_FLOAT_MODE_P (mode)) { - /* The second shuffle for e.g. V4DFmode has - 4567 and ABCD operands. - Ignore AB67, as 67 is already in the second lane - of the first operand. */ - if ((perm & 0xc) == (3 << 2)) continue; - /* And 45CD, as 45 is in the first lane of the first - operand. */ - if ((perm & 3) == 2) continue; - /* And 0123, as then the vperm2[fi]128 doesn't change - anything on the original 0123 first operand. */ - if ((perm & 0xf) == (1 << 2)) continue; - } - - for (i = 0; i < nelt; i++) - { - j = d->perm[i] / nelt2; - if (j == ((perm >> (2 * (i >= nelt2))) & 3)) - dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1)); - else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16)) - dsecond.perm[i] = d->perm[i] & (nelt - 1); - else - break; + *total = cost->fchs; + return false; } - - if (i == nelt) + else if (FLOAT_MODE_P (mode)) { - start_sequence (); - ok = expand_vec_perm_1 (&dsecond); - end_sequence (); + *total = ix86_vec_cost (mode, cost->sse_op); + return false; } + /* FALLTHRU */ + + case NOT: + if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) + *total = ix86_vec_cost (mode, cost->sse_op); + else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) + *total = cost->add * 2; else - ok = false; + *total = cost->add; + return false; - if (ok) + case COMPARE: + if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT + && XEXP (XEXP (x, 0), 1) == const1_rtx + && CONST_INT_P (XEXP (XEXP (x, 0), 2)) + && XEXP (x, 1) == const0_rtx) { - if (d->testing_p) - return true; - - /* Found a usable second shuffle. dfirst will be - vperm2f128 on d->op0 and d->op1. */ - dsecond.testing_p = false; - dfirst = *d; - dfirst.target = gen_reg_rtx (d->vmode); - for (i = 0; i < nelt; i++) - dfirst.perm[i] = (i & (nelt2 - 1)) - + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2; - - canonicalize_perm (&dfirst); - ok = expand_vec_perm_1 (&dfirst); - gcc_assert (ok); - - /* And dsecond is some single insn shuffle, taking - d->op0 and result of vperm2f128 (if perm < 16) or - d->op1 and result of vperm2f128 (otherwise). */ - if (perm >= 16) - dsecond.op0 = dsecond.op1; - dsecond.op1 = dfirst.target; - - ok = expand_vec_perm_1 (&dsecond); - gcc_assert (ok); - + /* This kind of construct is implemented using test[bwl]. + Treat it as if we had an AND. */ + mode = GET_MODE (XEXP (XEXP (x, 0), 0)); + *total = (cost->add + + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code, + opno, speed) + + rtx_cost (const1_rtx, mode, outer_code, opno, speed)); return true; } - /* For one operand, the only useful vperm2f128 permutation is 0x01 - aka lanes swap. */ - if (d->one_operand_p) - return false; - } + /* The embedded comparison operand is completely free. */ + if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0))) + && XEXP (x, 1) == const0_rtx) + *total = 0; - return false; -} + return false; -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify - a two vector permutation using 2 intra-lane interleave insns - and cross-lane shuffle for 32-byte vectors. 
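The removed vperm2f128 expander above is driven entirely by the instruction's immediate: the low two bits select which 128-bit lane fills the low half of the result and bits 4-5 select the lane for the high half, with the two lanes of the first source counted as 0-1 and of the second source as 2-3. Below is a standalone decoder for that encoding, ignoring the zeroing bits 3 and 7; the function name is invented.

#include <stdio.h>

/* Decode a vperm2f128/vperm2i128 immediate into the source lane chosen
   for each 128-bit half of the destination.  */
static void
decode_vperm2f128 (unsigned imm, int *lo_lane, int *hi_lane)
{
  *lo_lane = imm & 3;
  *hi_lane = (imm >> 4) & 3;
}

int
main (void)
{
  int lo, hi;
  decode_vperm2f128 (0x20, &lo, &hi);
  printf ("imm 0x20 -> lanes %d and %d\n", lo, hi);   /* low lanes of both sources */
  decode_vperm2f128 (0x31, &lo, &hi);
  printf ("imm 0x31 -> lanes %d and %d\n", lo, hi);   /* high lanes of both sources */
  return 0;
}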
*/ + case FLOAT_EXTEND: + if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)) + *total = 0; + else + *total = ix86_vec_cost (mode, cost->addss); + return false; -static bool -expand_vec_perm_interleave3 (struct expand_vec_perm_d *d) -{ - unsigned i, nelt; - rtx (*gen) (rtx, rtx, rtx); + case FLOAT_TRUNCATE: + if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)) + *total = cost->fadd; + else + *total = ix86_vec_cost (mode, cost->addss); + return false; - if (d->one_operand_p) - return false; - if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32) - ; - else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode)) - ; - else - return false; + case ABS: + /* SSE requires memory load for the constant operand. It may make + sense to account for this. Of course the constant operand may or + may not be reused. */ + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + *total = cost->sse_op; + else if (X87_FLOAT_MODE_P (mode)) + *total = cost->fabs; + else if (FLOAT_MODE_P (mode)) + *total = ix86_vec_cost (mode, cost->sse_op); + return false; - nelt = d->nelt; - if (d->perm[0] != 0 && d->perm[0] != nelt / 2) - return false; - for (i = 0; i < nelt; i += 2) - if (d->perm[i] != d->perm[0] + i / 2 - || d->perm[i + 1] != d->perm[0] + i / 2 + nelt) + case SQRT: + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd; + else if (X87_FLOAT_MODE_P (mode)) + *total = cost->fsqrt; + else if (FLOAT_MODE_P (mode)) + *total = ix86_vec_cost (mode, + mode == SFmode ? cost->sqrtss : cost->sqrtsd); return false; - if (d->testing_p) - return true; + case UNSPEC: + if (XINT (x, 1) == UNSPEC_TP) + *total = 0; + return false; - switch (d->vmode) - { - case E_V32QImode: - if (d->perm[0]) - gen = gen_vec_interleave_highv32qi; - else - gen = gen_vec_interleave_lowv32qi; - break; - case E_V16HImode: - if (d->perm[0]) - gen = gen_vec_interleave_highv16hi; - else - gen = gen_vec_interleave_lowv16hi; - break; - case E_V8SImode: - if (d->perm[0]) - gen = gen_vec_interleave_highv8si; - else - gen = gen_vec_interleave_lowv8si; - break; - case E_V4DImode: - if (d->perm[0]) - gen = gen_vec_interleave_highv4di; - else - gen = gen_vec_interleave_lowv4di; - break; - case E_V8SFmode: - if (d->perm[0]) - gen = gen_vec_interleave_highv8sf; - else - gen = gen_vec_interleave_lowv8sf; - break; - case E_V4DFmode: - if (d->perm[0]) - gen = gen_vec_interleave_highv4df; + case VEC_SELECT: + case VEC_CONCAT: + case VEC_DUPLICATE: + /* ??? Assume all of these vector manipulation patterns are + recognizable. In which case they all pretty much have the + same cost. */ + *total = cost->sse_op; + return true; + case VEC_MERGE: + mask = XEXP (x, 2); + /* This is masked instruction, assume the same cost, + as nonmasked variant. */ + if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask))) + *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed); else - gen = gen_vec_interleave_lowv4df; - break; + *total = cost->sse_op; + return true; + default: - gcc_unreachable (); + return false; } - - emit_insn (gen (d->target, d->op0, d->op1)); - return true; } -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement - a single vector permutation using a single intra-lane vector - permutation, vperm2f128 swapping the lanes and vblend* insn blending - the non-swapped and swapped vectors together. 
*/ - -static bool -expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d) -{ - struct expand_vec_perm_d dfirst, dsecond; - unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2; - rtx_insn *seq; - bool ok; - rtx (*blend) (rtx, rtx, rtx, rtx) = NULL; - - if (!TARGET_AVX - || TARGET_AVX2 - || (d->vmode != V8SFmode && d->vmode != V4DFmode) - || !d->one_operand_p) - return false; - - dfirst = *d; - for (i = 0; i < nelt; i++) - dfirst.perm[i] = 0xff; - for (i = 0, msk = 0; i < nelt; i++) - { - j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2; - if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i]) - return false; - dfirst.perm[j] = d->perm[i]; - if (j != i) - msk |= (1 << i); - } - for (i = 0; i < nelt; i++) - if (dfirst.perm[i] == 0xff) - dfirst.perm[i] = i; - - if (!d->testing_p) - dfirst.target = gen_reg_rtx (dfirst.vmode); - - start_sequence (); - ok = expand_vec_perm_1 (&dfirst); - seq = get_insns (); - end_sequence (); - - if (!ok) - return false; - - if (d->testing_p) - return true; - - emit_insn (seq); - - dsecond = *d; - dsecond.op0 = dfirst.target; - dsecond.op1 = dfirst.target; - dsecond.one_operand_p = true; - dsecond.target = gen_reg_rtx (dsecond.vmode); - for (i = 0; i < nelt; i++) - dsecond.perm[i] = i ^ nelt2; - - ok = expand_vec_perm_1 (&dsecond); - gcc_assert (ok); +#if TARGET_MACHO - blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256; - emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk))); - return true; -} +static int current_machopic_label_num; -/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF - permutation using two vperm2f128, followed by a vshufpd insn blending - the two vectors together. */ +/* Given a symbol name and its associated stub, write out the + definition of the stub. */ -static bool -expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d) +void +machopic_output_stub (FILE *file, const char *symb, const char *stub) { - struct expand_vec_perm_d dfirst, dsecond, dthird; - bool ok; - - if (!TARGET_AVX || (d->vmode != V4DFmode)) - return false; - - if (d->testing_p) - return true; - - dfirst = *d; - dsecond = *d; - dthird = *d; - - dfirst.perm[0] = (d->perm[0] & ~1); - dfirst.perm[1] = (d->perm[0] & ~1) + 1; - dfirst.perm[2] = (d->perm[2] & ~1); - dfirst.perm[3] = (d->perm[2] & ~1) + 1; - dsecond.perm[0] = (d->perm[1] & ~1); - dsecond.perm[1] = (d->perm[1] & ~1) + 1; - dsecond.perm[2] = (d->perm[3] & ~1); - dsecond.perm[3] = (d->perm[3] & ~1) + 1; - dthird.perm[0] = (d->perm[0] % 2); - dthird.perm[1] = (d->perm[1] % 2) + 4; - dthird.perm[2] = (d->perm[2] % 2) + 2; - dthird.perm[3] = (d->perm[3] % 2) + 6; - - dfirst.target = gen_reg_rtx (dfirst.vmode); - dsecond.target = gen_reg_rtx (dsecond.vmode); - dthird.op0 = dfirst.target; - dthird.op1 = dsecond.target; - dthird.one_operand_p = false; - - canonicalize_perm (&dfirst); - canonicalize_perm (&dsecond); - - ok = expand_vec_perm_1 (&dfirst) - && expand_vec_perm_1 (&dsecond) - && expand_vec_perm_1 (&dthird); + unsigned int length; + char *binder_name, *symbol_name, lazy_ptr_name[32]; + int label = ++current_machopic_label_num; - gcc_assert (ok); + /* For 64-bit we shouldn't get here. */ + gcc_assert (!TARGET_64BIT); - return true; -} + /* Lose our funky encoding stuff so it doesn't contaminate the stub. */ + symb = targetm.strip_name_encoding (symb); -/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word - permutation with two pshufb insns and an ior. We should have already - failed all two instruction sequences. 
*/ + length = strlen (stub); + binder_name = XALLOCAVEC (char, length + 32); + GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length); -static bool -expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d) -{ - rtx rperm[2][16], vperm, l, h, op, m128; - unsigned int i, nelt, eltsz; + length = strlen (symb); + symbol_name = XALLOCAVEC (char, length + 32); + GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length); - if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) - return false; - gcc_assert (!d->one_operand_p); + sprintf (lazy_ptr_name, "L%d$lz", label); - if (d->testing_p) - return true; + if (MACHOPIC_ATT_STUB) + switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]); + else if (MACHOPIC_PURE) + switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]); + else + switch_to_section (darwin_sections[machopic_symbol_stub_section]); - nelt = d->nelt; - eltsz = GET_MODE_UNIT_SIZE (d->vmode); + fprintf (file, "%s:\n", stub); + fprintf (file, "\t.indirect_symbol %s\n", symbol_name); - /* Generate two permutation masks. If the required element is within - the given vector it is shuffled into the proper lane. If the required - element is in the other vector, force a zero into the lane by setting - bit 7 in the permutation mask. */ - m128 = GEN_INT (-128); - for (i = 0; i < nelt; ++i) + if (MACHOPIC_ATT_STUB) { - unsigned j, e = d->perm[i]; - unsigned which = (e >= nelt); - if (e >= nelt) - e -= nelt; - - for (j = 0; j < eltsz; ++j) - { - rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j); - rperm[1-which][i*eltsz + j] = m128; - } + fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n"); } - - vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0])); - vperm = force_reg (V16QImode, vperm); - - l = gen_reg_rtx (V16QImode); - op = gen_lowpart (V16QImode, d->op0); - emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm)); - - vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1])); - vperm = force_reg (V16QImode, vperm); - - h = gen_reg_rtx (V16QImode); - op = gen_lowpart (V16QImode, d->op1); - emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm)); - - op = d->target; - if (d->vmode != V16QImode) - op = gen_reg_rtx (V16QImode); - emit_insn (gen_iorv16qi3 (op, l, h)); - if (op != d->target) - emit_move_insn (d->target, gen_lowpart (d->vmode, op)); - - return true; -} - -/* Implement arbitrary permutation of one V32QImode and V16QImode operand - with two vpshufb insns, vpermq and vpor. We should have already failed - all two or three instruction sequences. */ - -static bool -expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d) -{ - rtx rperm[2][32], vperm, l, h, hp, op, m128; - unsigned int i, nelt, eltsz; - - if (!TARGET_AVX2 - || !d->one_operand_p - || (d->vmode != V32QImode && d->vmode != V16HImode)) - return false; - - if (d->testing_p) - return true; - - nelt = d->nelt; - eltsz = GET_MODE_UNIT_SIZE (d->vmode); - - /* Generate two permutation masks. If the required element is within - the same lane, it is shuffled in. If the required element from the - other lane, force a zero by setting bit 7 in the permutation mask. - In the other mask the mask has non-negative elements if element - is requested from the other lane, but also moved to the other lane, - so that the result of vpshufb can have the two V2TImode halves - swapped. 
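The pshufb-based expanders above all rely on the same instruction behaviour: each control byte selects a source byte within its 16-byte lane by the low four bits, and a control byte with bit 7 set (the -128 entries in the masks) forces the result byte to zero, which is what lets two partial shuffles be merged with a plain OR. A small software model of a single 16-byte lane, for illustration only.

#include <stdio.h>

/* Model of pshufb on one 16-byte lane: negative control bytes zero the
   result byte, otherwise the low four bits index the source.  */
static void
pshufb16 (unsigned char *dst, const unsigned char *src,
          const signed char *ctrl)
{
  for (int i = 0; i < 16; i++)
    dst[i] = (ctrl[i] < 0) ? 0 : src[ctrl[i] & 15];
}

int
main (void)
{
  unsigned char src[16], dst[16];
  signed char ctrl[16];
  for (int i = 0; i < 16; i++)
    {
      src[i] = (unsigned char) (i + 1);
      /* Keep the even-indexed bytes and zero the rest - one half of a
         two-mask merge like the ones built above.  */
      ctrl[i] = (i % 2 == 0) ? (signed char) i : (signed char) -128;
    }
  pshufb16 (dst, src, ctrl);
  for (int i = 0; i < 16; i++)
    printf ("%d ", dst[i]);
  printf ("\n");
  return 0;
}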
*/ - m128 = GEN_INT (-128); - for (i = 0; i < nelt; ++i) + else if (MACHOPIC_PURE) { - unsigned j, e = d->perm[i] & (nelt / 2 - 1); - unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz; - - for (j = 0; j < eltsz; ++j) - { - rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j); - rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128; - } + /* PIC stub. */ + /* 25-byte PIC stub using "CALL get_pc_thunk". */ + rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */); + output_set_got (tmp, NULL_RTX); /* "CALL ___.get_pc_thunk.cx". */ + fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n", + label, lazy_ptr_name, label); + fprintf (file, "\tjmp\t*%%ecx\n"); } + else + fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name); - vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1])); - vperm = force_reg (V32QImode, vperm); - - h = gen_reg_rtx (V32QImode); - op = gen_lowpart (V32QImode, d->op0); - emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm)); + /* The AT&T-style ("self-modifying") stub is not lazily bound, thus + it needs no stub-binding-helper. */ + if (MACHOPIC_ATT_STUB) + return; - /* Swap the 128-byte lanes of h into hp. */ - hp = gen_reg_rtx (V4DImode); - op = gen_lowpart (V4DImode, h); - emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx, - const1_rtx)); + fprintf (file, "%s:\n", binder_name); - vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0])); - vperm = force_reg (V32QImode, vperm); + if (MACHOPIC_PURE) + { + fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name); + fprintf (file, "\tpushl\t%%ecx\n"); + } + else + fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name); - l = gen_reg_rtx (V32QImode); - op = gen_lowpart (V32QImode, d->op0); - emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm)); + fputs ("\tjmp\tdyld_stub_binding_helper\n", file); - op = d->target; - if (d->vmode != V32QImode) - op = gen_reg_rtx (V32QImode); - emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp))); - if (op != d->target) - emit_move_insn (d->target, gen_lowpart (d->vmode, op)); + /* N.B. Keep the correspondence of these + 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the + old-pic/new-pic/non-pic stubs; altering this will break + compatibility with existing dylibs. */ + if (MACHOPIC_PURE) + { + /* 25-byte PIC stub using "CALL get_pc_thunk". */ + switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]); + } + else + /* 16-byte -mdynamic-no-pic stub. */ + switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]); - return true; + fprintf (file, "%s:\n", lazy_ptr_name); + fprintf (file, "\t.indirect_symbol %s\n", symbol_name); + fprintf (file, ASM_LONG "%s\n", binder_name); } +#endif /* TARGET_MACHO */ -/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even - and extract-odd permutations of two V32QImode and V16QImode operand - with two vpshufb insns, vpor and vpermq. We should have already - failed all two or three instruction sequences. */ +/* Order the registers for register allocator. 
*/ -static bool -expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d) +void +x86_order_regs_for_local_alloc (void) { - rtx rperm[2][32], vperm, l, h, ior, op, m128; - unsigned int i, nelt, eltsz; - - if (!TARGET_AVX2 - || d->one_operand_p - || (d->vmode != V32QImode && d->vmode != V16HImode)) - return false; - - for (i = 0; i < d->nelt; ++i) - if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2)) - return false; - - if (d->testing_p) - return true; + int pos = 0; + int i; - nelt = d->nelt; - eltsz = GET_MODE_UNIT_SIZE (d->vmode); - - /* Generate two permutation masks. In the first permutation mask - the first quarter will contain indexes for the first half - of the op0, the second quarter will contain bit 7 set, third quarter - will contain indexes for the second half of the op0 and the - last quarter bit 7 set. In the second permutation mask - the first quarter will contain bit 7 set, the second quarter - indexes for the first half of the op1, the third quarter bit 7 set - and last quarter indexes for the second half of the op1. - I.e. the first mask e.g. for V32QImode extract even will be: - 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128 - (all values masked with 0xf except for -128) and second mask - for extract even will be - -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */ - m128 = GEN_INT (-128); - for (i = 0; i < nelt; ++i) - { - unsigned j, e = d->perm[i] & (nelt / 2 - 1); - unsigned which = d->perm[i] >= nelt; - unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0; + /* First allocate the local general purpose registers. */ + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (GENERAL_REGNO_P (i) && call_used_regs[i]) + reg_alloc_order [pos++] = i; - for (j = 0; j < eltsz; ++j) - { - rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j); - rperm[1 - which][(i * eltsz + j) ^ xorv] = m128; - } - } + /* Global general purpose registers. */ + for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) + if (GENERAL_REGNO_P (i) && !call_used_regs[i]) + reg_alloc_order [pos++] = i; - vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0])); - vperm = force_reg (V32QImode, vperm); + /* x87 registers come first in case we are doing FP math + using them. */ + if (!TARGET_SSE_MATH) + for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++) + reg_alloc_order [pos++] = i; - l = gen_reg_rtx (V32QImode); - op = gen_lowpart (V32QImode, d->op0); - emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm)); + /* SSE registers. */ + for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++) + reg_alloc_order [pos++] = i; + for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++) + reg_alloc_order [pos++] = i; - vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1])); - vperm = force_reg (V32QImode, vperm); + /* Extended REX SSE registers. */ + for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++) + reg_alloc_order [pos++] = i; - h = gen_reg_rtx (V32QImode); - op = gen_lowpart (V32QImode, d->op1); - emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm)); + /* Mask register. */ + for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++) + reg_alloc_order [pos++] = i; - ior = gen_reg_rtx (V32QImode); - emit_insn (gen_iorv32qi3 (ior, l, h)); + /* x87 registers. */ + if (TARGET_SSE_MATH) + for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++) + reg_alloc_order [pos++] = i; - /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. 
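x86_order_regs_for_local_alloc above sets the allocator's preference order rather than a cost: call-clobbered general registers come first so that short-lived values do not force prologue/epilogue saves, followed by the call-saved ones and then the x87/SSE/mask files. A toy version of the same idea over an invented eight-register file, for illustration only.

#include <stdio.h>

#define NREGS 8

/* Invented register file: 1 marks a call-clobbered register.  */
static const int call_used[NREGS] = { 1, 1, 1, 0, 0, 1, 0, 0 };

/* Prefer call-clobbered registers, then call-saved ones.  */
static void
order_regs (int order[NREGS])
{
  int pos = 0;
  for (int i = 0; i < NREGS; i++)
    if (call_used[i])
      order[pos++] = i;
  for (int i = 0; i < NREGS; i++)
    if (!call_used[i])
      order[pos++] = i;
}

int
main (void)
{
  int order[NREGS];
  order_regs (order);
  for (int i = 0; i < NREGS; i++)
    printf ("%d ", order[i]);
  printf ("\n");
  return 0;
}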
*/ - op = gen_reg_rtx (V4DImode); - ior = gen_lowpart (V4DImode, ior); - emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx, - const1_rtx, GEN_INT (3))); - emit_move_insn (d->target, gen_lowpart (d->vmode, op)); + for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++) + reg_alloc_order [pos++] = i; - return true; + /* Initialize the rest of array as we do not allocate some registers + at all. */ + while (pos < FIRST_PSEUDO_REGISTER) + reg_alloc_order [pos++] = 0; } -/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even - and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands - with two "and" and "pack" or two "shift" and "pack" insns. We should - have already failed all two instruction sequences. */ - static bool -expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d) +ix86_ms_bitfield_layout_p (const_tree record_type) { - rtx op, dop0, dop1, t; - unsigned i, odd, c, s, nelt = d->nelt; - bool end_perm = false; - machine_mode half_mode; - rtx (*gen_and) (rtx, rtx, rtx); - rtx (*gen_pack) (rtx, rtx, rtx); - rtx (*gen_shift) (rtx, rtx, rtx); + return ((TARGET_MS_BITFIELD_LAYOUT + && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type))) + || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type))); +} - if (d->one_operand_p) - return false; +/* Returns an expression indicating where the this parameter is + located on entry to the FUNCTION. */ - switch (d->vmode) - { - case E_V8HImode: - /* Required for "pack". */ - if (!TARGET_SSE4_1) - return false; - c = 0xffff; - s = 16; - half_mode = V4SImode; - gen_and = gen_andv4si3; - gen_pack = gen_sse4_1_packusdw; - gen_shift = gen_lshrv4si3; - break; - case E_V16QImode: - /* No check as all instructions are SSE2. */ - c = 0xff; - s = 8; - half_mode = V8HImode; - gen_and = gen_andv8hi3; - gen_pack = gen_sse2_packuswb; - gen_shift = gen_lshrv8hi3; - break; - case E_V16HImode: - if (!TARGET_AVX2) - return false; - c = 0xffff; - s = 16; - half_mode = V8SImode; - gen_and = gen_andv8si3; - gen_pack = gen_avx2_packusdw; - gen_shift = gen_lshrv8si3; - end_perm = true; - break; - case E_V32QImode: - if (!TARGET_AVX2) - return false; - c = 0xff; - s = 8; - half_mode = V16HImode; - gen_and = gen_andv16hi3; - gen_pack = gen_avx2_packuswb; - gen_shift = gen_lshrv16hi3; - end_perm = true; - break; - default: - /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than - general shuffles. */ - return false; - } +static rtx +x86_this_parameter (tree function) +{ + tree type = TREE_TYPE (function); + bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0; + int nregs; - /* Check that permutation is even or odd. 
*/ - odd = d->perm[0]; - if (odd > 1) - return false; + if (TARGET_64BIT) + { + const int *parm_regs; - for (i = 1; i < nelt; ++i) - if (d->perm[i] != 2 * i + odd) - return false; + if (ix86_function_type_abi (type) == MS_ABI) + parm_regs = x86_64_ms_abi_int_parameter_registers; + else + parm_regs = x86_64_int_parameter_registers; + return gen_rtx_REG (Pmode, parm_regs[aggr]); + } - if (d->testing_p) - return true; + nregs = ix86_function_regparm (type, function); - dop0 = gen_reg_rtx (half_mode); - dop1 = gen_reg_rtx (half_mode); - if (odd == 0) - { - t = gen_const_vec_duplicate (half_mode, GEN_INT (c)); - t = force_reg (half_mode, t); - emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0))); - emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1))); - } - else + if (nregs > 0 && !stdarg_p (type)) { - emit_insn (gen_shift (dop0, - gen_lowpart (half_mode, d->op0), - GEN_INT (s))); - emit_insn (gen_shift (dop1, - gen_lowpart (half_mode, d->op1), - GEN_INT (s))); - } - /* In AVX2 for 256 bit case we need to permute pack result. */ - if (TARGET_AVX2 && end_perm) - { - op = gen_reg_rtx (d->vmode); - t = gen_reg_rtx (V4DImode); - emit_insn (gen_pack (op, dop0, dop1)); - emit_insn (gen_avx2_permv4di_1 (t, - gen_lowpart (V4DImode, op), - const0_rtx, - const2_rtx, - const1_rtx, - GEN_INT (3))); - emit_move_insn (d->target, gen_lowpart (d->vmode, t)); + int regno; + unsigned int ccvt = ix86_get_callcvt (type); + + if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) + regno = aggr ? DX_REG : CX_REG; + else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) + { + regno = CX_REG; + if (aggr) + return gen_rtx_MEM (SImode, + plus_constant (Pmode, stack_pointer_rtx, 4)); + } + else + { + regno = AX_REG; + if (aggr) + { + regno = DX_REG; + if (nregs == 1) + return gen_rtx_MEM (SImode, + plus_constant (Pmode, + stack_pointer_rtx, 4)); + } + } + return gen_rtx_REG (SImode, regno); } - else - emit_insn (gen_pack (d->target, dop0, dop1)); - return true; + return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx, + aggr ? 8 : 4)); } -/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even - and extract-odd permutations of two V64QI operands - with two "shifts", two "truncs" and one "concat" insns for "odd" - and two "truncs" and one concat insn for "even." - Have already failed all two instruction sequences. */ +/* Determine whether x86_output_mi_thunk can succeed. */ static bool -expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d) +x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset, + const_tree function) { - rtx t1, t2, t3, t4; - unsigned i, odd, nelt = d->nelt; - - if (!TARGET_AVX512BW - || d->one_operand_p - || d->vmode != V64QImode) - return false; - - /* Check that permutation is even or odd. */ - odd = d->perm[0]; - if (odd > 1) - return false; - - for (i = 1; i < nelt; ++i) - if (d->perm[i] != 2 * i + odd) - return false; - - if (d->testing_p) + /* 64-bit can handle anything. */ + if (TARGET_64BIT) return true; + /* For 32-bit, everything's fine if we have one free register. */ + if (ix86_function_regparm (TREE_TYPE (function), function) < 3) + return true; - if (odd) - { - t1 = gen_reg_rtx (V32HImode); - t2 = gen_reg_rtx (V32HImode); - emit_insn (gen_lshrv32hi3 (t1, - gen_lowpart (V32HImode, d->op0), - GEN_INT (8))); - emit_insn (gen_lshrv32hi3 (t2, - gen_lowpart (V32HImode, d->op1), - GEN_INT (8))); - } - else - { - t1 = gen_lowpart (V32HImode, d->op0); - t2 = gen_lowpart (V32HImode, d->op1); - } + /* Need a free register for vcall_offset. 
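Several of the removed extract-even/odd expanders begin with the same validity test shown above: the permutation must read element 2*i + odd of the concatenated operands for every i, where odd is perm[0] and must be 0 or 1. A standalone version of that test; the function name and the vectors in main are invented.

#include <stdbool.h>
#include <stdio.h>

/* Return true if PERM selects the even (ODD == 0) or odd (ODD == 1)
   elements of a double-width input, exactly as checked above.  */
static bool
is_even_odd_perm (const unsigned *perm, unsigned nelt, unsigned *odd_out)
{
  unsigned odd = perm[0];
  if (odd > 1)
    return false;
  for (unsigned i = 1; i < nelt; i++)
    if (perm[i] != 2 * i + odd)
      return false;
  *odd_out = odd;
  return true;
}

int
main (void)
{
  unsigned even[4] = { 0, 2, 4, 6 };
  unsigned other[4] = { 0, 3, 4, 6 };
  unsigned odd;
  printf ("{0 2 4 6}: %d\n", is_even_odd_perm (even, 4, &odd));   /* 1 */
  printf ("{0 3 4 6}: %d\n", is_even_odd_perm (other, 4, &odd));  /* 0 */
  return 0;
}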
*/ + if (vcall_offset) + return false; - t3 = gen_reg_rtx (V32QImode); - t4 = gen_reg_rtx (V32QImode); - emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1)); - emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2)); - emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4)); + /* Need a free register for GOT references. */ + if (flag_pic && !targetm.binds_local_p (function)) + return false; + /* Otherwise ok. */ return true; } -/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even - and extract-odd permutations. */ +/* Output the assembler code for a thunk function. THUNK_DECL is the + declaration for the thunk function itself, FUNCTION is the decl for + the target function. DELTA is an immediate constant offset to be + added to THIS. If VCALL_OFFSET is nonzero, the word at + *(*this + vcall_offset) should be added to THIS. */ -static bool -expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) +static void +x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta, + HOST_WIDE_INT vcall_offset, tree function) { - rtx t1, t2, t3, t4, t5; + rtx this_param = x86_this_parameter (function); + rtx this_reg, tmp, fnaddr; + unsigned int tmp_regno; + rtx_insn *insn; - switch (d->vmode) + if (TARGET_64BIT) + tmp_regno = R10_REG; + else { - case E_V4DFmode: - if (d->testing_p) - break; - t1 = gen_reg_rtx (V4DFmode); - t2 = gen_reg_rtx (V4DFmode); - - /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */ - emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20))); - emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31))); - - /* Now an unpck[lh]pd will produce the result required. */ - if (odd) - t3 = gen_avx_unpckhpd256 (d->target, t1, t2); + unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function)); + if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) + tmp_regno = AX_REG; + else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) + tmp_regno = DX_REG; else - t3 = gen_avx_unpcklpd256 (d->target, t1, t2); - emit_insn (t3); - break; - - case E_V8SFmode: - { - int mask = odd ? 0xdd : 0x88; - - if (d->testing_p) - break; - t1 = gen_reg_rtx (V8SFmode); - t2 = gen_reg_rtx (V8SFmode); - t3 = gen_reg_rtx (V8SFmode); - - /* Shuffle within the 128-bit lanes to produce: - { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */ - emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1, - GEN_INT (mask))); - - /* Shuffle the lanes around to produce: - { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */ - emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1, - GEN_INT (0x3))); - - /* Shuffle within the 128-bit lanes to produce: - { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */ - emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44))); - - /* Shuffle within the 128-bit lanes to produce: - { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */ - emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee))); - - /* Shuffle the lanes around to produce: - { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */ - emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2, - GEN_INT (0x20))); - } - break; - - case E_V2DFmode: - case E_V4SFmode: - case E_V2DImode: - case E_V4SImode: - /* These are always directly implementable by expand_vec_perm_1. */ - gcc_unreachable (); + tmp_regno = CX_REG; + } - case E_V8HImode: - if (TARGET_SSE4_1) - return expand_vec_perm_even_odd_pack (d); - else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB) - return expand_vec_perm_pshufb2 (d); - else - { - if (d->testing_p) - break; - /* We need 2*log2(N)-1 operations to achieve odd/even - with interleave. 
*/ - t1 = gen_reg_rtx (V8HImode); - t2 = gen_reg_rtx (V8HImode); - emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1)); - emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1)); - emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1)); - emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1)); - if (odd) - t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2); - else - t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2); - emit_insn (t3); - } - break; + emit_note (NOTE_INSN_PROLOGUE_END); - case E_V16QImode: - return expand_vec_perm_even_odd_pack (d); + /* CET is enabled, insert EB instruction. */ + if ((flag_cf_protection & CF_BRANCH)) + emit_insn (gen_nop_endbr ()); - case E_V16HImode: - case E_V32QImode: - return expand_vec_perm_even_odd_pack (d); + /* If VCALL_OFFSET, we'll need THIS in a register. Might as well + pull it in now and let DELTA benefit. */ + if (REG_P (this_param)) + this_reg = this_param; + else if (vcall_offset) + { + /* Put the this parameter into %eax. */ + this_reg = gen_rtx_REG (Pmode, AX_REG); + emit_move_insn (this_reg, this_param); + } + else + this_reg = NULL_RTX; - case E_V64QImode: - return expand_vec_perm_even_odd_trunc (d); + /* Adjust the this parameter by a fixed constant. */ + if (delta) + { + rtx delta_rtx = GEN_INT (delta); + rtx delta_dst = this_reg ? this_reg : this_param; - case E_V4DImode: - if (!TARGET_AVX2) + if (TARGET_64BIT) { - struct expand_vec_perm_d d_copy = *d; - d_copy.vmode = V4DFmode; - if (d->testing_p) - d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1); - else - d_copy.target = gen_reg_rtx (V4DFmode); - d_copy.op0 = gen_lowpart (V4DFmode, d->op0); - d_copy.op1 = gen_lowpart (V4DFmode, d->op1); - if (expand_vec_perm_even_odd_1 (&d_copy, odd)) + if (!x86_64_general_operand (delta_rtx, Pmode)) { - if (!d->testing_p) - emit_move_insn (d->target, - gen_lowpart (V4DImode, d_copy.target)); - return true; + tmp = gen_rtx_REG (Pmode, tmp_regno); + emit_move_insn (tmp, delta_rtx); + delta_rtx = tmp; } - return false; } - if (d->testing_p) - break; + ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx); + } - t1 = gen_reg_rtx (V4DImode); - t2 = gen_reg_rtx (V4DImode); + /* Adjust the this parameter by a value stored in the vtable. */ + if (vcall_offset) + { + rtx vcall_addr, vcall_mem, this_mem; - /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */ - emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20))); - emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31))); + tmp = gen_rtx_REG (Pmode, tmp_regno); - /* Now an vpunpck[lh]qdq will produce the result required. */ - if (odd) - t3 = gen_avx2_interleave_highv4di (d->target, t1, t2); - else - t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2); - emit_insn (t3); - break; + this_mem = gen_rtx_MEM (ptr_mode, this_reg); + if (Pmode != ptr_mode) + this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem); + emit_move_insn (tmp, this_mem); - case E_V8SImode: - if (!TARGET_AVX2) + /* Adjust the this parameter. 
*/ + vcall_addr = plus_constant (Pmode, tmp, vcall_offset); + if (TARGET_64BIT + && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true)) { - struct expand_vec_perm_d d_copy = *d; - d_copy.vmode = V8SFmode; - if (d->testing_p) - d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1); - else - d_copy.target = gen_reg_rtx (V8SFmode); - d_copy.op0 = gen_lowpart (V8SFmode, d->op0); - d_copy.op1 = gen_lowpart (V8SFmode, d->op1); - if (expand_vec_perm_even_odd_1 (&d_copy, odd)) - { - if (!d->testing_p) - emit_move_insn (d->target, - gen_lowpart (V8SImode, d_copy.target)); - return true; - } - return false; + rtx tmp2 = gen_rtx_REG (Pmode, R11_REG); + emit_move_insn (tmp2, GEN_INT (vcall_offset)); + vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2); } - if (d->testing_p) - break; - - t1 = gen_reg_rtx (V8SImode); - t2 = gen_reg_rtx (V8SImode); - t3 = gen_reg_rtx (V4DImode); - t4 = gen_reg_rtx (V4DImode); - t5 = gen_reg_rtx (V4DImode); - - /* Shuffle the lanes around into - { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */ - emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0), - gen_lowpart (V4DImode, d->op1), - GEN_INT (0x20))); - emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0), - gen_lowpart (V4DImode, d->op1), - GEN_INT (0x31))); - - /* Swap the 2nd and 3rd position in each lane into - { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */ - emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3), - GEN_INT (2 * 4 + 1 * 16 + 3 * 64))); - emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4), - GEN_INT (2 * 4 + 1 * 16 + 3 * 64))); - - /* Now an vpunpck[lh]qdq will produce - { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */ - if (odd) - t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1), - gen_lowpart (V4DImode, t2)); + vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr); + if (Pmode != ptr_mode) + emit_insn (gen_addsi_1_zext (this_reg, + gen_rtx_REG (ptr_mode, + REGNO (this_reg)), + vcall_mem)); else - t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1), - gen_lowpart (V4DImode, t2)); - emit_insn (t3); - emit_move_insn (d->target, gen_lowpart (V8SImode, t5)); - break; - - default: - gcc_unreachable (); + ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem); } - return true; -} - -/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match - extract-even and extract-odd permutations. */ - -static bool -expand_vec_perm_even_odd (struct expand_vec_perm_d *d) -{ - unsigned i, odd, nelt = d->nelt; - - odd = d->perm[0]; - if (odd != 0 && odd != 1) - return false; - - for (i = 1; i < nelt; ++i) - if (d->perm[i] != 2 * i + odd) - return false; - - return expand_vec_perm_even_odd_1 (d, odd); -} - -/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast - permutations. We assume that expand_vec_perm_1 has already failed. */ - -static bool -expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) -{ - unsigned elt = d->perm[0], nelt2 = d->nelt / 2; - machine_mode vmode = d->vmode; - unsigned char perm2[4]; - rtx op0 = d->op0, dest; - bool ok; + /* If necessary, drop THIS back to its stack slot. */ + if (this_reg && this_reg != this_param) + emit_move_insn (this_param, this_reg); - switch (vmode) + fnaddr = XEXP (DECL_RTL (function), 0); + if (TARGET_64BIT) { - case E_V4DFmode: - case E_V8SFmode: - /* These are special-cased in sse.md so that we can optionally - use the vbroadcast instruction. They expand to two insns - if the input happens to be in a register. 
*/ - gcc_unreachable (); - - case E_V2DFmode: - case E_V2DImode: - case E_V4SFmode: - case E_V4SImode: - /* These are always implementable using standard shuffle patterns. */ - gcc_unreachable (); - - case E_V8HImode: - case E_V16QImode: - /* These can be implemented via interleave. We save one insn by - stopping once we have promoted to V4SImode and then use pshufd. */ - if (d->testing_p) - return true; - do + if (!flag_pic || targetm.binds_local_p (function) + || TARGET_PECOFF) + ; + else { - rtx dest; - rtx (*gen) (rtx, rtx, rtx) - = vmode == V16QImode ? gen_vec_interleave_lowv16qi - : gen_vec_interleave_lowv8hi; - - if (elt >= nelt2) - { - gen = vmode == V16QImode ? gen_vec_interleave_highv16qi - : gen_vec_interleave_highv8hi; - elt -= nelt2; - } - nelt2 /= 2; + tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL); + tmp = gen_rtx_CONST (Pmode, tmp); + fnaddr = gen_const_mem (Pmode, tmp); + } + } + else + { + if (!flag_pic || targetm.binds_local_p (function)) + ; +#if TARGET_MACHO + else if (TARGET_MACHO) + { + fnaddr = machopic_indirect_call_target (DECL_RTL (function)); + fnaddr = XEXP (fnaddr, 0); + } +#endif /* TARGET_MACHO */ + else + { + tmp = gen_rtx_REG (Pmode, CX_REG); + output_set_got (tmp, NULL_RTX); - dest = gen_reg_rtx (vmode); - emit_insn (gen (dest, op0, op0)); - vmode = get_mode_wider_vector (vmode); - op0 = gen_lowpart (vmode, dest); + fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT); + fnaddr = gen_rtx_CONST (Pmode, fnaddr); + fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr); + fnaddr = gen_const_mem (Pmode, fnaddr); } - while (vmode != V4SImode); + } - memset (perm2, elt, 4); - dest = gen_reg_rtx (V4SImode); - ok = expand_vselect (dest, op0, perm2, 4, d->testing_p); - gcc_assert (ok); - if (!d->testing_p) - emit_move_insn (d->target, gen_lowpart (d->vmode, dest)); - return true; + /* Our sibling call patterns do not allow memories, because we have no + predicate that can distinguish between frame and non-frame memory. + For our purposes here, we can get away with (ab)using a jump pattern, + because we're going to do no optimization. */ + if (MEM_P (fnaddr)) + { + if (sibcall_insn_operand (fnaddr, word_mode)) + { + fnaddr = XEXP (DECL_RTL (function), 0); + tmp = gen_rtx_MEM (QImode, fnaddr); + tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx); + tmp = emit_call_insn (tmp); + SIBLING_CALL_P (tmp) = 1; + } + else + emit_jump_insn (gen_indirect_jump (fnaddr)); + } + else + { + if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr)) + { + // CM_LARGE_PIC always uses pseudo PIC register which is + // uninitialized. Since FUNCTION is local and calling it + // doesn't go through PLT, we use scratch register %r11 as + // PIC register and initialize it here. + pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG); + ix86_init_large_pic_reg (tmp_regno); + fnaddr = legitimize_pic_address (fnaddr, + gen_rtx_REG (Pmode, tmp_regno)); + } - case E_V64QImode: - case E_V32QImode: - case E_V16HImode: - case E_V8SImode: - case E_V4DImode: - /* For AVX2 broadcasts of the first element vpbroadcast* or - vpermq should be used by expand_vec_perm_1. 
*/ - gcc_assert (!TARGET_AVX2 || d->perm[0]); - return false; + if (!sibcall_insn_operand (fnaddr, word_mode)) + { + tmp = gen_rtx_REG (word_mode, tmp_regno); + if (GET_MODE (fnaddr) != word_mode) + fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr); + emit_move_insn (tmp, fnaddr); + fnaddr = tmp; + } - default: - gcc_unreachable (); + tmp = gen_rtx_MEM (QImode, fnaddr); + tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx); + tmp = emit_call_insn (tmp); + SIBLING_CALL_P (tmp) = 1; } + emit_barrier (); + + /* Emit just enough of rest_of_compilation to get the insns emitted. + Note that use_thunk calls assemble_start_function et al. */ + insn = get_insns (); + shorten_branches (insn); + final_start_function (insn, file, 1); + final (insn, file, 1); + final_end_function (); } -/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match - broadcast permutations. */ +static void +x86_file_start (void) +{ + default_file_start (); + if (TARGET_16BIT) + fputs ("\t.code16gcc\n", asm_out_file); +#if TARGET_MACHO + darwin_file_start (); +#endif + if (X86_FILE_START_VERSION_DIRECTIVE) + fputs ("\t.version\t\"01.01\"\n", asm_out_file); + if (X86_FILE_START_FLTUSED) + fputs ("\t.global\t__fltused\n", asm_out_file); + if (ix86_asm_dialect == ASM_INTEL) + fputs ("\t.intel_syntax noprefix\n", asm_out_file); +} -static bool -expand_vec_perm_broadcast (struct expand_vec_perm_d *d) +int +x86_field_alignment (tree type, int computed) { - unsigned i, elt, nelt = d->nelt; + machine_mode mode; - if (!d->one_operand_p) - return false; + if (TARGET_64BIT || TARGET_ALIGN_DOUBLE) + return computed; + if (TARGET_IAMCU) + return iamcu_alignment (type, computed); + mode = TYPE_MODE (strip_array_types (type)); + if (mode == DFmode || mode == DCmode + || GET_MODE_CLASS (mode) == MODE_INT + || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT) + return MIN (32, computed); + return computed; +} - elt = d->perm[0]; - for (i = 1; i < nelt; ++i) - if (d->perm[i] != elt) - return false; +/* Print call to TARGET to FILE. */ - return expand_vec_perm_broadcast_1 (d); +static void +x86_print_call_or_nop (FILE *file, const char *target) +{ + if (flag_nop_mcount || !strcmp (target, "nop")) + /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */ + fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n"); + else + fprintf (file, "1:\tcall\t%s\n", target); } -/* Implement arbitrary permutations of two V64QImode operands - with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */ static bool -expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d) +current_fentry_name (const char **name) { - if (!TARGET_AVX512BW || !(d->vmode == V64QImode)) + tree attr = lookup_attribute ("fentry_name", + DECL_ATTRIBUTES (current_function_decl)); + if (!attr) return false; + *name = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr))); + return true; +} - if (d->testing_p) - return true; - - struct expand_vec_perm_d ds[2]; - rtx rperm[128], vperm, target0, target1; - unsigned int i, nelt; - machine_mode vmode; - - nelt = d->nelt; - vmode = V64QImode; - - for (i = 0; i < 2; i++) - { - ds[i] = *d; - ds[i].vmode = V32HImode; - ds[i].nelt = 32; - ds[i].target = gen_reg_rtx (V32HImode); - ds[i].op0 = gen_lowpart (V32HImode, d->op0); - ds[i].op1 = gen_lowpart (V32HImode, d->op1); - } - - /* Prepare permutations such that the first one takes care of - putting the even bytes into the right positions or one higher - positions (ds[0]) and the second one takes care of - putting the odd bytes into the right positions or one below - (ds[1]). 
*/ - - for (i = 0; i < nelt; i++) - { - ds[i & 1].perm[i / 2] = d->perm[i] / 2; - if (i & 1) - { - rperm[i] = constm1_rtx; - rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1)); - } - else - { - rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1)); - rperm[i + 64] = constm1_rtx; - } - } - - bool ok = expand_vec_perm_1 (&ds[0]); - gcc_assert (ok); - ds[0].target = gen_lowpart (V64QImode, ds[0].target); - - ok = expand_vec_perm_1 (&ds[1]); - gcc_assert (ok); - ds[1].target = gen_lowpart (V64QImode, ds[1].target); - - vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm)); - vperm = force_reg (vmode, vperm); - target0 = gen_reg_rtx (V64QImode); - emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm)); - - vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64)); - vperm = force_reg (vmode, vperm); - target1 = gen_reg_rtx (V64QImode); - emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm)); - - emit_insn (gen_iorv64qi3 (d->target, target0, target1)); +static bool +current_fentry_section (const char **name) +{ + tree attr = lookup_attribute ("fentry_section", + DECL_ATTRIBUTES (current_function_decl)); + if (!attr) + return false; + *name = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (attr))); return true; } -/* Implement arbitrary permutation of two V32QImode and V16QImode operands - with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed - all the shorter instruction sequences. */ - -static bool -expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d) +/* Output assembler code to FILE to increment profiler label # LABELNO + for profiling a function entry. */ +void +x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) { - rtx rperm[4][32], vperm, l[2], h[2], op, m128; - unsigned int i, nelt, eltsz; - bool used[4]; + if (cfun->machine->endbr_queued_at_entrance) + fprintf (file, "\t%s\n", TARGET_64BIT ? "endbr64" : "endbr32"); - if (!TARGET_AVX2 - || d->one_operand_p - || (d->vmode != V32QImode && d->vmode != V16HImode)) - return false; + const char *mcount_name = MCOUNT_NAME; - if (d->testing_p) - return true; + if (current_fentry_name (&mcount_name)) + ; + else if (fentry_name) + mcount_name = fentry_name; + else if (flag_fentry) + mcount_name = MCOUNT_NAME_BEFORE_PROLOGUE; - nelt = d->nelt; - eltsz = GET_MODE_UNIT_SIZE (d->vmode); - - /* Generate 4 permutation masks. If the required element is within - the same lane, it is shuffled in. If the required element from the - other lane, force a zero by setting bit 7 in the permutation mask. - In the other mask the mask has non-negative elements if element - is requested from the other lane, but also moved to the other lane, - so that the result of vpshufb can have the two V2TImode halves - swapped. */ - m128 = GEN_INT (-128); - for (i = 0; i < 32; ++i) - { - rperm[0][i] = m128; - rperm[1][i] = m128; - rperm[2][i] = m128; - rperm[3][i] = m128; - } - used[0] = false; - used[1] = false; - used[2] = false; - used[3] = false; - for (i = 0; i < nelt; ++i) + if (TARGET_64BIT) { - unsigned j, e = d->perm[i] & (nelt / 2 - 1); - unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz; - unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 
1 : 0); +#ifndef NO_PROFILE_COUNTERS + fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno); +#endif - for (j = 0; j < eltsz; ++j) - rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j); - used[which] = true; + if (!TARGET_PECOFF && flag_pic) + fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name); + else + x86_print_call_or_nop (file, mcount_name); } - - for (i = 0; i < 2; ++i) + else if (flag_pic) { - if (!used[2 * i + 1]) - { - h[i] = NULL_RTX; - continue; - } - vperm = gen_rtx_CONST_VECTOR (V32QImode, - gen_rtvec_v (32, rperm[2 * i + 1])); - vperm = force_reg (V32QImode, vperm); - h[i] = gen_reg_rtx (V32QImode); - op = gen_lowpart (V32QImode, i ? d->op1 : d->op0); - emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm)); +#ifndef NO_PROFILE_COUNTERS + fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n", + LPREFIX, labelno); +#endif + fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name); } - - /* Swap the 128-byte lanes of h[X]. */ - for (i = 0; i < 2; ++i) - { - if (h[i] == NULL_RTX) - continue; - op = gen_reg_rtx (V4DImode); - emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]), - const2_rtx, GEN_INT (3), const0_rtx, - const1_rtx)); - h[i] = gen_lowpart (V32QImode, op); - } - - for (i = 0; i < 2; ++i) + else { - if (!used[2 * i]) - { - l[i] = NULL_RTX; - continue; - } - vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i])); - vperm = force_reg (V32QImode, vperm); - l[i] = gen_reg_rtx (V32QImode); - op = gen_lowpart (V32QImode, i ? d->op1 : d->op0); - emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm)); +#ifndef NO_PROFILE_COUNTERS + fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n", + LPREFIX, labelno); +#endif + x86_print_call_or_nop (file, mcount_name); } - for (i = 0; i < 2; ++i) + if (flag_record_mcount + || lookup_attribute ("fentry_section", + DECL_ATTRIBUTES (current_function_decl))) { - if (h[i] && l[i]) - { - op = gen_reg_rtx (V32QImode); - emit_insn (gen_iorv32qi3 (op, l[i], h[i])); - l[i] = op; - } - else if (h[i]) - l[i] = h[i]; - } + const char *sname = "__mcount_loc"; - gcc_assert (l[0] && l[1]); - op = d->target; - if (d->vmode != V32QImode) - op = gen_reg_rtx (V32QImode); - emit_insn (gen_iorv32qi3 (op, l[0], l[1])); - if (op != d->target) - emit_move_insn (d->target, gen_lowpart (d->vmode, op)); - return true; + if (current_fentry_section (&sname)) + ; + else if (fentry_section) + sname = fentry_section; + + fprintf (file, "\t.section %s, \"a\",@progbits\n", sname); + fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long"); + fprintf (file, "\t.previous\n"); + } } -/* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits - taken care of, perform the expansion in D and return true on success. */ +/* We don't have exact information about the insn sizes, but we may assume + quite safely that we are informed about all 1 byte insns and memory + address sizes. This is enough to eliminate unnecessary padding in + 99% of cases. */ -static bool -ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) +int +ix86_min_insn_size (rtx_insn *insn) { - /* Try a single instruction expansion. */ - if (expand_vec_perm_1 (d)) - return true; + int l = 0, len; - /* Try sequences of two instructions. */ + if (!INSN_P (insn) || !active_insn_p (insn)) + return 0; - if (expand_vec_perm_pshuflw_pshufhw (d)) - return true; + /* Discard alignments we've emit and jump instructions. 
*/ + if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE + && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN) + return 0; - if (expand_vec_perm_palignr (d, false)) - return true; + /* Important case - calls are always 5 bytes. + It is common to have many calls in the row. */ + if (CALL_P (insn) + && symbolic_reference_mentioned_p (PATTERN (insn)) + && !SIBLING_CALL_P (insn)) + return 5; + len = get_attr_length (insn); + if (len <= 1) + return 1; - if (expand_vec_perm_interleave2 (d)) - return true; + /* For normal instructions we rely on get_attr_length being exact, + with a few exceptions. */ + if (!JUMP_P (insn)) + { + enum attr_type type = get_attr_type (insn); - if (expand_vec_perm_broadcast (d)) - return true; + switch (type) + { + case TYPE_MULTI: + if (GET_CODE (PATTERN (insn)) == ASM_INPUT + || asm_noperands (PATTERN (insn)) >= 0) + return 0; + break; + case TYPE_OTHER: + case TYPE_FCMP: + break; + default: + /* Otherwise trust get_attr_length. */ + return len; + } - if (expand_vec_perm_vpermq_perm_1 (d)) - return true; + l = get_attr_length_address (insn); + if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn))) + l = 4; + } + if (l) + return 1+l; + else + return 2; +} - if (expand_vec_perm_vperm2f128 (d)) - return true; +#ifdef ASM_OUTPUT_MAX_SKIP_PAD - if (expand_vec_perm_pblendv (d)) - return true; +/* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte + window. */ - /* Try sequences of three instructions. */ +static void +ix86_avoid_jump_mispredicts (void) +{ + rtx_insn *insn, *start = get_insns (); + int nbytes = 0, njumps = 0; + bool isjump = false; - if (expand_vec_perm_even_odd_pack (d)) - return true; + /* Look for all minimal intervals of instructions containing 4 jumps. + The intervals are bounded by START and INSN. NBYTES is the total + size of instructions in the interval including INSN and not including + START. When the NBYTES is smaller than 16 bytes, it is possible + that the end of START and INSN ends up in the same 16byte page. - if (expand_vec_perm_2vperm2f128_vshuf (d)) - return true; + The smallest offset in the page INSN can start is the case where START + ends on the offset 0. Offset of INSN is then NBYTES - sizeof (INSN). + We add p2align to 16byte window with maxskip 15 - NBYTES + sizeof (INSN). - if (expand_vec_perm_pshufb2 (d)) - return true; + Don't consider asm goto as jump, while it can contain a jump, it doesn't + have to, control transfer to label(s) can be performed through other + means, and also we estimate minimum length of all asm stmts as 0. */ + for (insn = start; insn; insn = NEXT_INSN (insn)) + { + int min_size; - if (expand_vec_perm_interleave3 (d)) - return true; + if (LABEL_P (insn)) + { + align_flags alignment = label_to_alignment (insn); + int align = alignment.levels[0].log; + int max_skip = alignment.levels[0].maxskip; - if (expand_vec_perm_vperm2f128_vblend (d)) - return true; + if (max_skip > 15) + max_skip = 15; + /* If align > 3, only up to 16 - max_skip - 1 bytes can be + already in the current 16 byte page, because otherwise + ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer + bytes to reach 16 byte boundary. 
*/ + if (align <= 0 + || (align <= 3 && max_skip != (1 << align) - 1)) + max_skip = 0; + if (dump_file) + fprintf (dump_file, "Label %i with max_skip %i\n", + INSN_UID (insn), max_skip); + if (max_skip) + { + while (nbytes + max_skip >= 16) + { + start = NEXT_INSN (start); + if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0) + || CALL_P (start)) + njumps--, isjump = true; + else + isjump = false; + nbytes -= ix86_min_insn_size (start); + } + } + continue; + } + + min_size = ix86_min_insn_size (insn); + nbytes += min_size; + if (dump_file) + fprintf (dump_file, "Insn %i estimated to %i bytes\n", + INSN_UID (insn), min_size); + if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0) + || CALL_P (insn)) + njumps++; + else + continue; - /* Try sequences of four instructions. */ + while (njumps > 3) + { + start = NEXT_INSN (start); + if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0) + || CALL_P (start)) + njumps--, isjump = true; + else + isjump = false; + nbytes -= ix86_min_insn_size (start); + } + gcc_assert (njumps >= 0); + if (dump_file) + fprintf (dump_file, "Interval %i to %i has %i bytes\n", + INSN_UID (start), INSN_UID (insn), nbytes); - if (expand_vec_perm_even_odd_trunc (d)) - return true; - if (expand_vec_perm_vpshufb2_vpermq (d)) - return true; + if (njumps == 3 && isjump && nbytes < 16) + { + int padsize = 15 - nbytes + ix86_min_insn_size (insn); - if (expand_vec_perm_vpshufb2_vpermq_even_odd (d)) - return true; + if (dump_file) + fprintf (dump_file, "Padding insn %i by %i bytes!\n", + INSN_UID (insn), padsize); + emit_insn_before (gen_pad (GEN_INT (padsize)), insn); + } + } +} +#endif - if (expand_vec_perm_vpermt2_vpshub2 (d)) - return true; +/* AMD Athlon works faster + when RET is not destination of conditional jump or directly preceded + by other jump instruction. We avoid the penalty by inserting NOP just + before the RET instructions in such cases. */ +static void +ix86_pad_returns (void) +{ + edge e; + edge_iterator ei; - /* ??? Look for narrow permutations whose element orderings would - allow the promotion to a wider mode. */ + FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) + { + basic_block bb = e->src; + rtx_insn *ret = BB_END (bb); + rtx_insn *prev; + bool replace = false; - /* ??? Look for sequences of interleave or a wider permute that place - the data into the correct lanes for a half-vector shuffle like - pshuf[lh]w or vpermilps. */ + if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret)) + || optimize_bb_for_size_p (bb)) + continue; + for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev)) + if (active_insn_p (prev) || LABEL_P (prev)) + break; + if (prev && LABEL_P (prev)) + { + edge e; + edge_iterator ei; - /* ??? Look for sequences of interleave that produce the desired results. - The combinatorics of punpck[lh] get pretty ugly... */ + FOR_EACH_EDGE (e, ei, bb->preds) + if (EDGE_FREQUENCY (e) && e->src->index >= 0 + && !(e->flags & EDGE_FALLTHRU)) + { + replace = true; + break; + } + } + if (!replace) + { + prev = prev_active_insn (ret); + if (prev + && ((JUMP_P (prev) && any_condjump_p (prev)) + || CALL_P (prev))) + replace = true; + /* Empty functions get branch mispredict even when + the jump destination is not visible to us. */ + if (!prev && !optimize_function_for_size_p (cfun)) + replace = true; + } + if (replace) + { + emit_jump_insn_before (gen_simple_return_internal_long (), ret); + delete_insn (ret); + } + } +} - if (expand_vec_perm_even_odd (d)) - return true; +/* Count the minimum number of instructions in BB. 
Return 4 if the + number of instructions >= 4. */ - /* Even longer sequences. */ - if (expand_vec_perm_vpshufb4_vpermq2 (d)) - return true; +static int +ix86_count_insn_bb (basic_block bb) +{ + rtx_insn *insn; + int insn_count = 0; - /* See if we can get the same permutation in different vector integer - mode. */ - struct expand_vec_perm_d nd; - if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) + /* Count number of instructions in this block. Return 4 if the number + of instructions >= 4. */ + FOR_BB_INSNS (bb, insn) { - if (!d->testing_p) - emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target)); - return true; + /* Only happen in exit blocks. */ + if (JUMP_P (insn) + && ANY_RETURN_P (PATTERN (insn))) + break; + + if (NONDEBUG_INSN_P (insn) + && GET_CODE (PATTERN (insn)) != USE + && GET_CODE (PATTERN (insn)) != CLOBBER) + { + insn_count++; + if (insn_count >= 4) + return insn_count; + } } - return false; + return insn_count; } -/* If a permutation only uses one operand, make it clear. Returns true - if the permutation references both operands. */ -static bool -canonicalize_perm (struct expand_vec_perm_d *d) -{ - int i, which, nelt = d->nelt; +/* Count the minimum number of instructions in code path in BB. + Return 4 if the number of instructions >= 4. */ - for (i = which = 0; i < nelt; ++i) - which |= (d->perm[i] < nelt ? 1 : 2); +static int +ix86_count_insn (basic_block bb) +{ + edge e; + edge_iterator ei; + int min_prev_count; - d->one_operand_p = true; - switch (which) + /* Only bother counting instructions along paths with no + more than 2 basic blocks between entry and exit. Given + that BB has an edge to exit, determine if a predecessor + of BB has an edge from entry. If so, compute the number + of instructions in the predecessor block. If there + happen to be multiple such blocks, compute the minimum. */ + min_prev_count = 4; + FOR_EACH_EDGE (e, ei, bb->preds) { - default: - gcc_unreachable(); + edge prev_e; + edge_iterator prev_ei; - case 3: - if (!rtx_equal_p (d->op0, d->op1)) - { - d->one_operand_p = false; + if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun)) + { + min_prev_count = 0; break; - } - /* The elements of PERM do not suggest that only the first operand - is used, but both operands are identical. Allow easier matching - of the permutation by folding the permutation into the single - input vector. */ - /* FALLTHRU */ - - case 2: - for (i = 0; i < nelt; ++i) - d->perm[i] &= nelt - 1; - d->op0 = d->op1; - break; - - case 1: - d->op1 = d->op0; - break; + } + FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds) + { + if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun)) + { + int count = ix86_count_insn_bb (e->src); + if (count < min_prev_count) + min_prev_count = count; + break; + } + } } - return (which == 3); + if (min_prev_count < 4) + min_prev_count += ix86_count_insn_bb (bb); + + return min_prev_count; } -/* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */ +/* Pad short function to 4 instructions. 
*/ -static bool -ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, - rtx op1, const vec_perm_indices &sel) +static void +ix86_pad_short_function (void) { - struct expand_vec_perm_d d; - unsigned char perm[MAX_VECT_LEN]; - unsigned int i, nelt, which; - bool two_args; + edge e; + edge_iterator ei; - d.target = target; - d.op0 = op0; - d.op1 = op1; + FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) + { + rtx_insn *ret = BB_END (e->src); + if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret))) + { + int insn_count = ix86_count_insn (e->src); - d.vmode = vmode; - gcc_assert (VECTOR_MODE_P (d.vmode)); - d.nelt = nelt = GET_MODE_NUNITS (d.vmode); - d.testing_p = !target; + /* Pad short function. */ + if (insn_count < 4) + { + rtx_insn *insn = ret; - gcc_assert (sel.length () == nelt); - gcc_checking_assert (sizeof (d.perm) == sizeof (perm)); + /* Find epilogue. */ + while (insn + && (!NOTE_P (insn) + || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG)) + insn = PREV_INSN (insn); - /* Given sufficient ISA support we can just return true here - for selected vector modes. */ - switch (d.vmode) - { - case E_V16SFmode: - case E_V16SImode: - case E_V8DImode: - case E_V8DFmode: - if (!TARGET_AVX512F) - return false; - /* All implementable with a single vperm[it]2 insn. */ - if (d.testing_p) - return true; - break; - case E_V32HImode: - if (!TARGET_AVX512BW) - return false; - if (d.testing_p) - /* All implementable with a single vperm[it]2 insn. */ - return true; - break; - case E_V64QImode: - if (!TARGET_AVX512BW) - return false; - if (d.testing_p) - /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */ - return true; - break; - case E_V8SImode: - case E_V8SFmode: - case E_V4DFmode: - case E_V4DImode: - if (!TARGET_AVX) - return false; - if (d.testing_p && TARGET_AVX512VL) - /* All implementable with a single vperm[it]2 insn. */ - return true; - break; - case E_V16HImode: - if (!TARGET_SSE2) - return false; - if (d.testing_p && TARGET_AVX2) - /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ - return true; - break; - case E_V32QImode: - if (!TARGET_SSE2) - return false; - if (d.testing_p && TARGET_AVX2) - /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ - return true; - break; - case E_V8HImode: - case E_V16QImode: - if (!TARGET_SSE2) - return false; - /* Fall through. */ - case E_V4SImode: - case E_V4SFmode: - if (!TARGET_SSE) - return false; - /* All implementable with a single vpperm insn. */ - if (d.testing_p && TARGET_XOP) - return true; - /* All implementable with 2 pshufb + 1 ior. */ - if (d.testing_p && TARGET_SSSE3) - return true; - break; - case E_V2DImode: - case E_V2DFmode: - if (!TARGET_SSE) - return false; - /* All implementable with shufpd or unpck[lh]pd. */ - if (d.testing_p) - return true; - break; - default: - return false; - } + if (!insn) + insn = ret; - for (i = which = 0; i < nelt; ++i) - { - unsigned char e = sel[i]; - gcc_assert (e < 2 * nelt); - d.perm[i] = e; - perm[i] = e; - which |= (e < nelt ? 1 : 2); + /* Two NOPs count as one instruction. */ + insn_count = 2 * (4 - insn_count); + emit_insn_before (gen_nops (GEN_INT (insn_count)), insn); + } + } } +} - if (d.testing_p) - { - /* For all elements from second vector, fold the elements to first. */ - if (which == 2) - for (i = 0; i < nelt; ++i) - d.perm[i] -= nelt; +/* Fix up a Windows system unwinder issue. If an EH region falls through into + the epilogue, the Windows system unwinder will apply epilogue logic and + produce incorrect offsets. 
This can be avoided by adding a nop between + the last insn that can throw and the first insn of the epilogue. */ + +static void +ix86_seh_fixup_eh_fallthru (void) +{ + edge e; + edge_iterator ei; - /* Check whether the mask can be applied to the vector type. */ - d.one_operand_p = (which != 3); + FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) + { + rtx_insn *insn, *next; - /* Implementable with shufps or pshufd. */ - if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode)) - return true; + /* Find the beginning of the epilogue. */ + for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn)) + if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG) + break; + if (insn == NULL) + continue; - /* Otherwise we have to go through the motions and see if we can - figure out how to generate the requested permutation. */ - d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1); - d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2); - if (!d.one_operand_p) - d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3); + /* We only care about preceding insns that can throw. */ + insn = prev_active_insn (insn); + if (insn == NULL || !can_throw_internal (insn)) + continue; - start_sequence (); - bool ret = ix86_expand_vec_perm_const_1 (&d); - end_sequence (); + /* Do not separate calls from their debug information. */ + for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next)) + if (NOTE_P (next) && NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION) + insn = next; + else + break; - return ret; + emit_insn_after (gen_nops (const1_rtx), insn); } +} - two_args = canonicalize_perm (&d); +/* Implement machine specific optimizations. We implement padding of returns + for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */ +static void +ix86_reorg (void) +{ + /* We are freeing block_for_insn in the toplev to keep compatibility + with old MDEP_REORGS that are not CFG based. Recompute it now. */ + compute_bb_for_insn (); - if (ix86_expand_vec_perm_const_1 (&d)) - return true; + if (TARGET_SEH && current_function_has_exception_handlers ()) + ix86_seh_fixup_eh_fallthru (); - /* If the selector says both arguments are needed, but the operands are the - same, the above tried to expand with one_operand_p and flattened selector. - If that didn't work, retry without one_operand_p; we succeeded with that - during testing. */ - if (two_args && d.one_operand_p) + if (optimize && optimize_function_for_speed_p (cfun)) { - d.one_operand_p = false; - memcpy (d.perm, perm, sizeof (perm)); - return ix86_expand_vec_perm_const_1 (&d); + if (TARGET_PAD_SHORT_FUNCTION) + ix86_pad_short_function (); + else if (TARGET_PAD_RETURNS) + ix86_pad_returns (); +#ifdef ASM_OUTPUT_MAX_SKIP_PAD + if (TARGET_FOUR_JUMP_LIMIT) + ix86_avoid_jump_mispredicts (); +#endif } +} +/* Return nonzero when QImode register that must be represented via REX prefix + is used. 
*/ +bool +x86_extended_QIreg_mentioned_p (rtx_insn *insn) +{ + int i; + extract_insn_cached (insn); + for (i = 0; i < recog_data.n_operands; i++) + if (GENERAL_REG_P (recog_data.operand[i]) + && !QI_REGNO_P (REGNO (recog_data.operand[i]))) + return true; return false; } -void -ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd) -{ - struct expand_vec_perm_d d; - unsigned i, nelt; - - d.target = targ; - d.op0 = op0; - d.op1 = op1; - d.vmode = GET_MODE (targ); - d.nelt = nelt = GET_MODE_NUNITS (d.vmode); - d.one_operand_p = false; - d.testing_p = false; - - for (i = 0; i < nelt; ++i) - d.perm[i] = i * 2 + odd; - - /* We'll either be able to implement the permutation directly... */ - if (expand_vec_perm_1 (&d)) - return; - - /* ... or we use the special-case patterns. */ - expand_vec_perm_even_odd_1 (&d, odd); -} - -static void -ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p) +/* Return true when INSN mentions register that must be encoded using REX + prefix. */ +bool +x86_extended_reg_mentioned_p (rtx insn) { - struct expand_vec_perm_d d; - unsigned i, nelt, base; - bool ok; - - d.target = targ; - d.op0 = op0; - d.op1 = op1; - d.vmode = GET_MODE (targ); - d.nelt = nelt = GET_MODE_NUNITS (d.vmode); - d.one_operand_p = false; - d.testing_p = false; - - base = high_p ? nelt / 2 : 0; - for (i = 0; i < nelt / 2; ++i) + subrtx_iterator::array_type array; + FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST) { - d.perm[i * 2] = i + base; - d.perm[i * 2 + 1] = i + base + nelt; + const_rtx x = *iter; + if (REG_P (x) + && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x)))) + return true; } - - /* Note that for AVX this isn't one instruction. */ - ok = ix86_expand_vec_perm_const_1 (&d); - gcc_assert (ok); + return false; } +/* If profitable, negate (without causing overflow) integer constant + of mode MODE at location LOC. Return true in this case. */ +bool +x86_maybe_negate_const_int (rtx *loc, machine_mode mode) +{ + HOST_WIDE_INT val; -/* Expand a vector operation CODE for a V*QImode in terms of the - same operation on V*HImode. */ - -void -ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) -{ - machine_mode qimode = GET_MODE (dest); - machine_mode himode; - rtx (*gen_il) (rtx, rtx, rtx); - rtx (*gen_ih) (rtx, rtx, rtx); - rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h; - struct expand_vec_perm_d d; - bool ok, full_interleave; - bool uns_p = false; - int i; + if (!CONST_INT_P (*loc)) + return false; - switch (qimode) + switch (mode) { - case E_V16QImode: - himode = V8HImode; - gen_il = gen_vec_interleave_lowv16qi; - gen_ih = gen_vec_interleave_highv16qi; - break; - case E_V32QImode: - himode = V16HImode; - gen_il = gen_avx2_interleave_lowv32qi; - gen_ih = gen_avx2_interleave_highv32qi; - break; - case E_V64QImode: - himode = V32HImode; - gen_il = gen_avx512bw_interleave_lowv64qi; - gen_ih = gen_avx512bw_interleave_highv64qi; - break; - default: - gcc_unreachable (); - } + case E_DImode: + /* DImode x86_64 constants must fit in 32 bits. */ + gcc_assert (x86_64_immediate_operand (*loc, mode)); - op2_l = op2_h = op2; - switch (code) - { - case MULT: - /* Unpack data such that we've got a source byte in each low byte of - each word. We don't care what goes into the high byte of each word. - Rather than trying to get zero in there, most convenient is to let - it be a copy of the low byte. 
*/ - op2_l = gen_reg_rtx (qimode); - op2_h = gen_reg_rtx (qimode); - emit_insn (gen_il (op2_l, op2, op2)); - emit_insn (gen_ih (op2_h, op2, op2)); - - op1_l = gen_reg_rtx (qimode); - op1_h = gen_reg_rtx (qimode); - emit_insn (gen_il (op1_l, op1, op1)); - emit_insn (gen_ih (op1_h, op1, op1)); - full_interleave = qimode == V16QImode; + mode = SImode; break; - case ASHIFT: - case LSHIFTRT: - uns_p = true; - /* FALLTHRU */ - case ASHIFTRT: - op1_l = gen_reg_rtx (himode); - op1_h = gen_reg_rtx (himode); - ix86_expand_sse_unpack (op1_l, op1, uns_p, false); - ix86_expand_sse_unpack (op1_h, op1, uns_p, true); - full_interleave = true; + case E_SImode: + case E_HImode: + case E_QImode: break; + default: gcc_unreachable (); } - /* Perform the operation. */ - res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX, - 1, OPTAB_DIRECT); - res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX, - 1, OPTAB_DIRECT); - gcc_assert (res_l && res_h); + /* Avoid overflows. */ + if (mode_signbit_p (mode, *loc)) + return false; - /* Merge the data back into the right place. */ - d.target = dest; - d.op0 = gen_lowpart (qimode, res_l); - d.op1 = gen_lowpart (qimode, res_h); - d.vmode = qimode; - d.nelt = GET_MODE_NUNITS (qimode); - d.one_operand_p = false; - d.testing_p = false; + val = INTVAL (*loc); - if (full_interleave) + /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'. + Exceptions: -128 encodes smaller than 128, so swap sign and op. */ + if ((val < 0 && val != -128) + || val == 128) { - /* For SSE2, we used an full interleave, so the desired - results are in the even elements. */ - for (i = 0; i < d.nelt; ++i) - d.perm[i] = i * 2; + *loc = GEN_INT (-val); + return true; } - else - { - /* For AVX, the interleave used above was not cross-lane. So the - extraction is evens but with the second and third quarter swapped. - Happily, that is even one insn shorter than even extraction. - For AVX512BW we have 4 lanes. We extract evens from within a lane, - always first from the first and then from the second source operand, - the index bits above the low 4 bits remains the same. - Thus, for d.nelt == 32 we want permutation - 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62 - and for d.nelt == 64 we want permutation - 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94, - 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */ - for (i = 0; i < d.nelt; ++i) - d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15); - } - - ok = ix86_expand_vec_perm_const_1 (&d); - gcc_assert (ok); - set_unique_reg_note (get_last_insn (), REG_EQUAL, - gen_rtx_fmt_ee (code, qimode, op1, op2)); + return false; } -/* Helper function of ix86_expand_mul_widen_evenodd. Return true - if op is CONST_VECTOR with all odd elements equal to their - preceding element. */ - -static bool -const_vector_equal_evenodd_p (rtx op) -{ - machine_mode mode = GET_MODE (op); - int i, nunits = GET_MODE_NUNITS (mode); - if (GET_CODE (op) != CONST_VECTOR - || nunits != CONST_VECTOR_NUNITS (op)) - return false; - for (i = 0; i < nunits; i += 2) - if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1)) - return false; - return true; -} +/* Generate an unsigned DImode/SImode to FP conversion. This is the same code + optabs would emit if we didn't have TFmode patterns. 
*/ void -ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2, - bool uns_p, bool odd_p) +x86_emit_floatuns (rtx operands[2]) { - machine_mode mode = GET_MODE (op1); - machine_mode wmode = GET_MODE (dest); - rtx x; - rtx orig_op1 = op1, orig_op2 = op2; - - if (!nonimmediate_operand (op1, mode)) - op1 = force_reg (mode, op1); - if (!nonimmediate_operand (op2, mode)) - op2 = force_reg (mode, op2); + rtx_code_label *neglab, *donelab; + rtx i0, i1, f0, in, out; + machine_mode mode, inmode; - /* We only play even/odd games with vectors of SImode. */ - gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode); + inmode = GET_MODE (operands[1]); + gcc_assert (inmode == SImode || inmode == DImode); - /* If we're looking for the odd results, shift those members down to - the even slots. For some cpus this is faster than a PSHUFD. */ - if (odd_p) - { - /* For XOP use vpmacsdqh, but only for smult, as it is only - signed. */ - if (TARGET_XOP && mode == V4SImode && !uns_p) - { - x = force_reg (wmode, CONST0_RTX (wmode)); - emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x)); - return; - } + out = operands[0]; + in = force_reg (inmode, operands[1]); + mode = GET_MODE (out); + neglab = gen_label_rtx (); + donelab = gen_label_rtx (); + f0 = gen_reg_rtx (mode); - x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode)); - if (!const_vector_equal_evenodd_p (orig_op1)) - op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1), - x, NULL, 1, OPTAB_DIRECT); - if (!const_vector_equal_evenodd_p (orig_op2)) - op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2), - x, NULL, 1, OPTAB_DIRECT); - op1 = gen_lowpart (mode, op1); - op2 = gen_lowpart (mode, op2); - } + emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab); - if (mode == V16SImode) - { - if (uns_p) - x = gen_vec_widen_umult_even_v16si (dest, op1, op2); - else - x = gen_vec_widen_smult_even_v16si (dest, op1, op2); - } - else if (mode == V8SImode) - { - if (uns_p) - x = gen_vec_widen_umult_even_v8si (dest, op1, op2); - else - x = gen_vec_widen_smult_even_v8si (dest, op1, op2); - } - else if (uns_p) - x = gen_vec_widen_umult_even_v4si (dest, op1, op2); - else if (TARGET_SSE4_1) - x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2); - else - { - rtx s1, s2, t0, t1, t2; + expand_float (out, in, 0); - /* The easiest way to implement this without PMULDQ is to go through - the motions as if we are performing a full 64-bit multiply. With - the exception that we need to do less shuffling of the elements. */ + emit_jump_insn (gen_jump (donelab)); + emit_barrier (); - /* Compute the sign-extension, aka highparts, of the two operands. */ - s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode), - op1, pc_rtx, pc_rtx); - s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode), - op2, pc_rtx, pc_rtx); + emit_label (neglab); - /* Multiply LO(A) * HI(B), and vice-versa. */ - t1 = gen_reg_rtx (wmode); - t2 = gen_reg_rtx (wmode); - emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2)); - emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1)); + i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL, + 1, OPTAB_DIRECT); + i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL, + 1, OPTAB_DIRECT); + i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT); - /* Multiply LO(A) * LO(B). */ - t0 = gen_reg_rtx (wmode); - emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2)); + expand_float (f0, i0, 0); - /* Combine and shift the highparts into place. 
*/ - t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT); - t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1, - 1, OPTAB_DIRECT); + emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0))); - /* Combine high and low parts. */ - force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT); - return; - } - emit_insn (x); + emit_label (donelab); } - -void -ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2, - bool uns_p, bool high_p) + +/* Target hook for scalar_mode_supported_p. */ +static bool +ix86_scalar_mode_supported_p (scalar_mode mode) { - machine_mode wmode = GET_MODE (dest); - machine_mode mode = GET_MODE (op1); - rtx t1, t2, t3, t4, mask; - - switch (mode) - { - case E_V4SImode: - t1 = gen_reg_rtx (mode); - t2 = gen_reg_rtx (mode); - if (TARGET_XOP && !uns_p) - { - /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case, - shuffle the elements once so that all elements are in the right - place for immediate use: { A C B D }. */ - emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx, - const1_rtx, GEN_INT (3))); - emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx, - const1_rtx, GEN_INT (3))); - } - else - { - /* Put the elements into place for the multiply. */ - ix86_expand_vec_interleave (t1, op1, op1, high_p); - ix86_expand_vec_interleave (t2, op2, op2, high_p); - high_p = false; - } - ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p); - break; - - case E_V8SImode: - /* Shuffle the elements between the lanes. After this we - have { A B E F | C D G H } for each operand. */ - t1 = gen_reg_rtx (V4DImode); - t2 = gen_reg_rtx (V4DImode); - emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1), - const0_rtx, const2_rtx, - const1_rtx, GEN_INT (3))); - emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2), - const0_rtx, const2_rtx, - const1_rtx, GEN_INT (3))); - - /* Shuffle the elements within the lanes. After this we - have { A A B B | C C D D } or { E E F F | G G H H }. */ - t3 = gen_reg_rtx (V8SImode); - t4 = gen_reg_rtx (V8SImode); - mask = GEN_INT (high_p - ? 2 + (2 << 2) + (3 << 4) + (3 << 6) - : 0 + (0 << 2) + (1 << 4) + (1 << 6)); - emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask)); - emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask)); - - ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false); - break; - - case E_V8HImode: - case E_V16HImode: - t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX, - uns_p, OPTAB_DIRECT); - t2 = expand_binop (mode, - uns_p ? umul_highpart_optab : smul_highpart_optab, - op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT); - gcc_assert (t1 && t2); - - t3 = gen_reg_rtx (mode); - ix86_expand_vec_interleave (t3, t1, t2, high_p); - emit_move_insn (dest, gen_lowpart (wmode, t3)); - break; - - case E_V16QImode: - case E_V32QImode: - case E_V32HImode: - case E_V16SImode: - case E_V64QImode: - t1 = gen_reg_rtx (wmode); - t2 = gen_reg_rtx (wmode); - ix86_expand_sse_unpack (t1, op1, uns_p, high_p); - ix86_expand_sse_unpack (t2, op2, uns_p, high_p); - - emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2))); - break; - - default: - gcc_unreachable (); - } + if (DECIMAL_FLOAT_MODE_P (mode)) + return default_decimal_float_supported_p (); + else if (mode == TFmode) + return true; + else + return default_scalar_mode_supported_p (mode); } -void -ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2) +/* Implements target hook vector_mode_supported_p. 
*/ +static bool +ix86_vector_mode_supported_p (machine_mode mode) { - rtx res_1, res_2, res_3, res_4; - - res_1 = gen_reg_rtx (V4SImode); - res_2 = gen_reg_rtx (V4SImode); - res_3 = gen_reg_rtx (V2DImode); - res_4 = gen_reg_rtx (V2DImode); - ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false); - ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true); - - /* Move the results in element 2 down to element 1; we don't care - what goes in elements 2 and 3. Then we can merge the parts - back together with an interleave. - - Note that two other sequences were tried: - (1) Use interleaves at the start instead of psrldq, which allows - us to use a single shufps to merge things back at the end. - (2) Use shufps here to combine the two vectors, then pshufd to - put the elements in the correct order. - In both cases the cost of the reformatting stall was too high - and the overall sequence slower. */ - - emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3), - const0_rtx, const2_rtx, - const0_rtx, const0_rtx)); - emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4), - const0_rtx, const2_rtx, - const0_rtx, const0_rtx)); - res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2)); - - set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2)); + if (TARGET_SSE && VALID_SSE_REG_MODE (mode)) + return true; + if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode)) + return true; + if (TARGET_AVX && VALID_AVX256_REG_MODE (mode)) + return true; + if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode)) + return true; + if (TARGET_MMX && VALID_MMX_REG_MODE (mode)) + return true; + if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode)) + return true; + return false; } -void -ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2) +/* Target hook for c_mode_for_suffix. */ +static machine_mode +ix86_c_mode_for_suffix (char suffix) { - machine_mode mode = GET_MODE (op0); - rtx t1, t2, t3, t4, t5, t6; + if (suffix == 'q') + return TFmode; + if (suffix == 'w') + return XFmode; - if (TARGET_AVX512DQ && mode == V8DImode) - emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2)); - else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode) - emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2)); - else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode) - emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2)); - else if (TARGET_XOP && mode == V2DImode) - { - /* op1: A,B,C,D, op2: E,F,G,H */ - op1 = gen_lowpart (V4SImode, op1); - op2 = gen_lowpart (V4SImode, op2); + return VOIDmode; +} - t1 = gen_reg_rtx (V4SImode); - t2 = gen_reg_rtx (V4SImode); - t3 = gen_reg_rtx (V2DImode); - t4 = gen_reg_rtx (V2DImode); +/* Worker function for TARGET_MD_ASM_ADJUST. - /* t1: B,A,D,C */ - emit_insn (gen_sse2_pshufd_1 (t1, op1, - GEN_INT (1), - GEN_INT (0), - GEN_INT (3), - GEN_INT (2))); + We implement asm flag outputs, and maintain source compatibility + with the old cc0-based compiler. 
*/ - /* t2: (B*E),(A*F),(D*G),(C*H) */ - emit_insn (gen_mulv4si3 (t2, t1, op2)); +static rtx_insn * +ix86_md_asm_adjust (vec &outputs, vec &/*inputs*/, + vec &constraints, + vec &clobbers, HARD_REG_SET &clobbered_regs) +{ + bool saw_asm_flag = false; - /* t3: (B*E)+(A*F), (D*G)+(C*H) */ - emit_insn (gen_xop_phadddq (t3, t2)); + start_sequence (); + for (unsigned i = 0, n = outputs.length (); i < n; ++i) + { + const char *con = constraints[i]; + if (strncmp (con, "=@cc", 4) != 0) + continue; + con += 4; + if (strchr (con, ',') != NULL) + { + error ("alternatives not allowed in asm flag output"); + continue; + } - /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */ - emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32))); + bool invert = false; + if (con[0] == 'n') + invert = true, con++; - /* Multiply lower parts and add all */ - t5 = gen_reg_rtx (V2DImode); - emit_insn (gen_vec_widen_umult_even_v4si (t5, - gen_lowpart (V4SImode, op1), - gen_lowpart (V4SImode, op2))); - op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT); + machine_mode mode = CCmode; + rtx_code code = UNKNOWN; - } - else - { - machine_mode nmode; - rtx (*umul) (rtx, rtx, rtx); + switch (con[0]) + { + case 'a': + if (con[1] == 0) + mode = CCAmode, code = EQ; + else if (con[1] == 'e' && con[2] == 0) + mode = CCCmode, code = NE; + break; + case 'b': + if (con[1] == 0) + mode = CCCmode, code = EQ; + else if (con[1] == 'e' && con[2] == 0) + mode = CCAmode, code = NE; + break; + case 'c': + if (con[1] == 0) + mode = CCCmode, code = EQ; + break; + case 'e': + if (con[1] == 0) + mode = CCZmode, code = EQ; + break; + case 'g': + if (con[1] == 0) + mode = CCGCmode, code = GT; + else if (con[1] == 'e' && con[2] == 0) + mode = CCGCmode, code = GE; + break; + case 'l': + if (con[1] == 0) + mode = CCGCmode, code = LT; + else if (con[1] == 'e' && con[2] == 0) + mode = CCGCmode, code = LE; + break; + case 'o': + if (con[1] == 0) + mode = CCOmode, code = EQ; + break; + case 'p': + if (con[1] == 0) + mode = CCPmode, code = EQ; + break; + case 's': + if (con[1] == 0) + mode = CCSmode, code = EQ; + break; + case 'z': + if (con[1] == 0) + mode = CCZmode, code = EQ; + break; + } + if (code == UNKNOWN) + { + error ("unknown asm flag output %qs", constraints[i]); + continue; + } + if (invert) + code = reverse_condition (code); - if (mode == V2DImode) + rtx dest = outputs[i]; + if (!saw_asm_flag) { - umul = gen_vec_widen_umult_even_v4si; - nmode = V4SImode; + /* This is the first asm flag output. Here we put the flags + register in as the real output and adjust the condition to + allow it. */ + constraints[i] = "=Bf"; + outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG); + saw_asm_flag = true; } - else if (mode == V4DImode) + else { - umul = gen_vec_widen_umult_even_v8si; - nmode = V8SImode; + /* We don't need the flags register as output twice. */ + constraints[i] = "=X"; + outputs[i] = gen_rtx_SCRATCH (SImode); } - else if (mode == V8DImode) + + rtx x = gen_rtx_REG (mode, FLAGS_REG); + x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx); + + machine_mode dest_mode = GET_MODE (dest); + if (!SCALAR_INT_MODE_P (dest_mode)) { - umul = gen_vec_widen_umult_even_v16si; - nmode = V16SImode; + error ("invalid type for asm flag output"); + continue; } - else - gcc_unreachable (); + if (dest_mode == DImode && !TARGET_64BIT) + dest_mode = SImode; - /* Multiply low parts. 
*/ - t1 = gen_reg_rtx (mode); - emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2))); + if (dest_mode != QImode) + { + rtx destqi = gen_reg_rtx (QImode); + emit_insn (gen_rtx_SET (destqi, x)); - /* Shift input vectors right 32 bits so we can multiply high parts. */ - t6 = GEN_INT (32); - t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT); - t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT); + if (TARGET_ZERO_EXTEND_WITH_AND + && optimize_function_for_speed_p (cfun)) + { + x = force_reg (dest_mode, const0_rtx); - /* Multiply high parts by low parts. */ - t4 = gen_reg_rtx (mode); - t5 = gen_reg_rtx (mode); - emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2))); - emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1))); + emit_insn (gen_movstrictqi + (gen_lowpart (QImode, x), destqi)); + } + else + x = gen_rtx_ZERO_EXTEND (dest_mode, destqi); + } - /* Combine and shift the highparts back. */ - t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT); - t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT); + if (dest_mode != GET_MODE (dest)) + { + rtx tmp = gen_reg_rtx (SImode); - /* Combine high and low parts. */ - force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT); + emit_insn (gen_rtx_SET (tmp, x)); + emit_insn (gen_zero_extendsidi2 (dest, tmp)); + } + else + emit_insn (gen_rtx_SET (dest, x)); } + rtx_insn *seq = get_insns (); + end_sequence (); - set_unique_reg_note (get_last_insn (), REG_EQUAL, - gen_rtx_MULT (mode, op1, op2)); + if (saw_asm_flag) + return seq; + else + { + /* If we had no asm flag outputs, clobber the flags. */ + clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG)); + SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG); + return NULL; + } } -/* Return 1 if control tansfer instruction INSN - should be encoded with notrack prefix. */ +/* Implements target vector targetm.asm.encode_section_info. */ -static bool -ix86_notrack_prefixed_insn_p (rtx insn) +static void ATTRIBUTE_UNUSED +ix86_encode_section_info (tree decl, rtx rtl, int first) { - if (!insn || !((flag_cf_protection & CF_BRANCH))) - return false; - - if (CALL_P (insn)) - { - rtx call = get_call_rtx_from (insn); - gcc_assert (call != NULL_RTX); - rtx addr = XEXP (call, 0); + default_encode_section_info (decl, rtl, first); - /* Do not emit 'notrack' if it's not an indirect call. */ - if (MEM_P (addr) - && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF) - return false; - else - return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0); - } + if (ix86_in_large_data_p (decl)) + SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR; +} - if (JUMP_P (insn) && !flag_cet_switch) - { - rtx target = JUMP_LABEL (insn); - if (target == NULL_RTX || ANY_RETURN_P (target)) - return false; +/* Worker function for REVERSE_CONDITION. */ - /* Check the jump is a switch table. */ - rtx_insn *label = as_a (target); - rtx_insn *table = next_insn (label); - if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table)) - return false; - else - return true; - } - return false; +enum rtx_code +ix86_reverse_condition (enum rtx_code code, machine_mode mode) +{ + return (mode == CCFPmode + ? reverse_condition_maybe_unordered (code) + : reverse_condition (code)); } -/* Calculate integer abs() using only SSE2 instructions. */ +/* Output code to perform an x87 FP register move, from OPERANDS[1] + to OPERANDS[0]. 
*/ -void -ix86_expand_sse2_abs (rtx target, rtx input) +const char * +output_387_reg_move (rtx_insn *insn, rtx *operands) { - machine_mode mode = GET_MODE (target); - rtx tmp0, tmp1, x; - - switch (mode) + if (REG_P (operands[0])) { - case E_V2DImode: - case E_V4DImode: - /* For 64-bit signed integer X, with SSE4.2 use - pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X. - Otherwise handle it similarly to V4SImode, except use 64 as W instead of - 32 and use logical instead of arithmetic right shift (which is - unimplemented) and subtract. */ - if (TARGET_SSE4_2) - { - tmp0 = gen_reg_rtx (mode); - tmp1 = gen_reg_rtx (mode); - emit_move_insn (tmp1, CONST0_RTX (mode)); - if (mode == E_V2DImode) - emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input)); - else - emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input)); + if (REG_P (operands[1]) + && find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) + { + if (REGNO (operands[0]) == FIRST_STACK_REG) + return output_387_ffreep (operands, 0); + return "fstp\t%y0"; } + if (STACK_TOP_P (operands[0])) + return "fld%Z1\t%y1"; + return "fst\t%y0"; + } + else if (MEM_P (operands[0])) + { + gcc_assert (REG_P (operands[1])); + if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) + return "fstp%Z0\t%y0"; else { - tmp0 = expand_simple_binop (mode, LSHIFTRT, input, - GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - - 1), NULL, 0, OPTAB_DIRECT); - tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false); + /* There is no non-popping store to memory for XFmode. + So if we need one, follow the store with a load. */ + if (GET_MODE (operands[0]) == XFmode) + return "fstp%Z0\t%y0\n\tfld%Z0\t%y0"; + else + return "fst%Z0\t%y0"; } - - tmp1 = expand_simple_binop (mode, XOR, tmp0, input, - NULL, 0, OPTAB_DIRECT); - x = expand_simple_binop (mode, MINUS, tmp1, tmp0, - target, 0, OPTAB_DIRECT); - break; - - case E_V4SImode: - /* For 32-bit signed integer X, the best way to calculate the absolute - value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */ - tmp0 = expand_simple_binop (mode, ASHIFTRT, input, - GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1), - NULL, 0, OPTAB_DIRECT); - tmp1 = expand_simple_binop (mode, XOR, tmp0, input, - NULL, 0, OPTAB_DIRECT); - x = expand_simple_binop (mode, MINUS, tmp1, tmp0, - target, 0, OPTAB_DIRECT); - break; - - case E_V8HImode: - /* For 16-bit signed integer X, the best way to calculate the absolute - value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */ - tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0); - - x = expand_simple_binop (mode, SMAX, tmp0, input, - target, 0, OPTAB_DIRECT); - break; - - case E_V16QImode: - /* For 8-bit signed integer X, the best way to calculate the absolute - value of X is min ((unsigned char) X, (unsigned char) (-X)), - as SSE2 provides the PMINUB insn. */ - tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0); - - x = expand_simple_binop (V16QImode, UMIN, tmp0, input, - target, 0, OPTAB_DIRECT); - break; - - default: - gcc_unreachable (); } - - if (x != target) - emit_move_insn (target, x); + else + gcc_unreachable(); } +#ifdef TARGET_SOLARIS +/* Solaris implementation of TARGET_ASM_NAMED_SECTION. */ -/* Expand an extract from a vector register through pextr insn. - Return true if successful. 
*/ - -bool -ix86_expand_pextr (rtx *operands) +static void +i386_solaris_elf_named_section (const char *name, unsigned int flags, + tree decl) { - rtx dst = operands[0]; - rtx src = operands[1]; - - unsigned int size = INTVAL (operands[2]); - unsigned int pos = INTVAL (operands[3]); - - if (SUBREG_P (dst)) + /* With Binutils 2.15, the "@unwind" marker must be specified on + every occurrence of the ".eh_frame" section, not just the first + one. */ + if (TARGET_64BIT + && strcmp (name, ".eh_frame") == 0) { - /* Reject non-lowpart subregs. */ - if (SUBREG_BYTE (dst) > 0) - return false; - dst = SUBREG_REG (dst); + fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name, + flags & SECTION_WRITE ? "aw" : "a"); + return; } - - if (SUBREG_P (src)) + +#ifndef USE_GAS + if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE) { - pos += SUBREG_BYTE (src) * BITS_PER_UNIT; - src = SUBREG_REG (src); + solaris_elf_asm_comdat_section (name, flags, decl); + return; } - switch (GET_MODE (src)) + /* Solaris/x86 as uses the same syntax for the SHF_EXCLUDE flags as the + SPARC assembler. One cannot mix single-letter flags and #exclude, so + only emit the latter here. */ + if (flags & SECTION_EXCLUDE) { - case E_V16QImode: - case E_V8HImode: - case E_V4SImode: - case E_V2DImode: - case E_V1TImode: - case E_TImode: - { - machine_mode srcmode, dstmode; - rtx d, pat; + fprintf (asm_out_file, "\t.section\t%s,#exclude\n", name); + return; + } +#endif - if (!int_mode_for_size (size, 0).exists (&dstmode)) - return false; + default_elf_asm_named_section (name, flags, decl); +} +#endif /* TARGET_SOLARIS */ - switch (dstmode) - { - case E_QImode: - if (!TARGET_SSE4_1) - return false; - srcmode = V16QImode; - break; +/* Return the mangling of TYPE if it is an extended fundamental type. */ - case E_HImode: - if (!TARGET_SSE2) - return false; - srcmode = V8HImode; - break; +static const char * +ix86_mangle_type (const_tree type) +{ + type = TYPE_MAIN_VARIANT (type); - case E_SImode: - if (!TARGET_SSE4_1) - return false; - srcmode = V4SImode; - break; + if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE + && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE) + return NULL; - case E_DImode: - gcc_assert (TARGET_64BIT); - if (!TARGET_SSE4_1) - return false; - srcmode = V2DImode; - break; + switch (TYPE_MODE (type)) + { + case E_TFmode: + /* __float128 is "g". */ + return "g"; + case E_XFmode: + /* "long double" or __float80 is "e". */ + return "e"; + default: + return NULL; + } +} - default: - return false; - } +static GTY(()) tree ix86_tls_stack_chk_guard_decl; + +static tree +ix86_stack_protect_guard (void) +{ + if (TARGET_SSP_TLS_GUARD) + { + tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1); + int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg); + tree type = build_qualified_type (type_node, qual); + tree t; - /* Reject extractions from misaligned positions. */ - if (pos & (size-1)) - return false; + if (global_options_set.x_ix86_stack_protector_guard_symbol_str) + { + t = ix86_tls_stack_chk_guard_decl; - if (GET_MODE (dst) == dstmode) - d = dst; - else - d = gen_reg_rtx (dstmode); + if (t == NULL) + { + rtx x; - /* Construct insn pattern. 
*/ - pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size))); - pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat); + t = build_decl + (UNKNOWN_LOCATION, VAR_DECL, + get_identifier (ix86_stack_protector_guard_symbol_str), + type); + TREE_STATIC (t) = 1; + TREE_PUBLIC (t) = 1; + DECL_EXTERNAL (t) = 1; + TREE_USED (t) = 1; + TREE_THIS_VOLATILE (t) = 1; + DECL_ARTIFICIAL (t) = 1; + DECL_IGNORED_P (t) = 1; - /* Let the rtl optimizers know about the zero extension performed. */ - if (dstmode == QImode || dstmode == HImode) - { - pat = gen_rtx_ZERO_EXTEND (SImode, pat); - d = gen_lowpart (SImode, d); - } + /* Do not share RTL as the declaration is visible outside of + current function. */ + x = DECL_RTL (t); + RTX_FLAG (x, used) = 1; - emit_insn (gen_rtx_SET (d, pat)); + ix86_tls_stack_chk_guard_decl = t; + } + } + else + { + tree asptrtype = build_pointer_type (type); - if (d != dst) - emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d)); - return true; - } + t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset); + t = build2 (MEM_REF, asptrtype, t, + build_int_cst (asptrtype, 0)); + TREE_THIS_VOLATILE (t) = 1; + } - default: - return false; + return t; } + + return default_stack_protect_guard (); } -/* Expand an insert into a vector register through pinsr insn. - Return true if successful. */ +/* For 32-bit code we can save PIC register setup by using + __stack_chk_fail_local hidden function instead of calling + __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC + register, so it is better to call __stack_chk_fail directly. */ -bool -ix86_expand_pinsr (rtx *operands) +static tree ATTRIBUTE_UNUSED +ix86_stack_protect_fail (void) { - rtx dst = operands[0]; - rtx src = operands[3]; + return TARGET_64BIT + ? default_external_stack_protect_fail () + : default_hidden_stack_protect_fail (); +} - unsigned int size = INTVAL (operands[1]); - unsigned int pos = INTVAL (operands[2]); +/* Select a format to encode pointers in exception handling data. CODE + is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is + true if the symbol may be affected by dynamic relocations. - if (SUBREG_P (dst)) + ??? All x86 object file formats are capable of representing this. + After all, the relocation needed is the same as for the call insn. + Whether or not a particular assembler allows us to enter such, I + guess we'll have to see. */ +int +asm_preferred_eh_data_format (int code, int global) +{ + if (flag_pic) + { + int type = DW_EH_PE_sdata8; + if (!TARGET_64BIT + || ix86_cmodel == CM_SMALL_PIC + || (ix86_cmodel == CM_MEDIUM_PIC && (global || code))) + type = DW_EH_PE_sdata4; + return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type; + } + if (ix86_cmodel == CM_SMALL + || (ix86_cmodel == CM_MEDIUM && code)) + return DW_EH_PE_udata4; + return DW_EH_PE_absptr; +} + +/* Implement targetm.vectorize.builtin_vectorization_cost. */ +static int +ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, + tree vectype, int) +{ + bool fp = false; + machine_mode mode = TImode; + int index; + if (vectype != NULL) { - pos += SUBREG_BYTE (dst) * BITS_PER_UNIT; - dst = SUBREG_REG (dst); + fp = FLOAT_TYPE_P (vectype); + mode = TYPE_MODE (vectype); } - switch (GET_MODE (dst)) + switch (type_of_cost) { - case E_V16QImode: - case E_V8HImode: - case E_V4SImode: - case E_V2DImode: - case E_V1TImode: - case E_TImode: - { - machine_mode srcmode, dstmode; - rtx (*pinsr)(rtx, rtx, rtx, rtx); - rtx d; + case scalar_stmt: + return fp ? 
ix86_cost->addss : COSTS_N_INSNS (1); - if (!int_mode_for_size (size, 0).exists (&srcmode)) - return false; + case scalar_load: + /* load/store costs are relative to register move which is 2. Recompute + it to COSTS_N_INSNS so everything have same base. */ + return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0] + : ix86_cost->int_load [2]) / 2; - switch (srcmode) - { - case E_QImode: - if (!TARGET_SSE4_1) - return false; - dstmode = V16QImode; - pinsr = gen_sse4_1_pinsrb; - break; + case scalar_store: + return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0] + : ix86_cost->int_store [2]) / 2; - case E_HImode: - if (!TARGET_SSE2) - return false; - dstmode = V8HImode; - pinsr = gen_sse2_pinsrw; - break; + case vector_stmt: + return ix86_vec_cost (mode, + fp ? ix86_cost->addss : ix86_cost->sse_op); - case E_SImode: - if (!TARGET_SSE4_1) - return false; - dstmode = V4SImode; - pinsr = gen_sse4_1_pinsrd; - break; + case vector_load: + index = sse_store_index (mode); + /* See PR82713 - we may end up being called on non-vector type. */ + if (index < 0) + index = 2; + return COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2; - case E_DImode: - gcc_assert (TARGET_64BIT); - if (!TARGET_SSE4_1) - return false; - dstmode = V2DImode; - pinsr = gen_sse4_1_pinsrq; - break; + case vector_store: + index = sse_store_index (mode); + /* See PR82713 - we may end up being called on non-vector type. */ + if (index < 0) + index = 2; + return COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2; - default: - return false; - } + case vec_to_scalar: + case scalar_to_vec: + return ix86_vec_cost (mode, ix86_cost->sse_op); - /* Reject insertions to misaligned positions. */ - if (pos & (size-1)) - return false; + /* We should have separate costs for unaligned loads and gather/scatter. + Do that incrementally. */ + case unaligned_load: + index = sse_store_index (mode); + /* See PR82713 - we may end up being called on non-vector type. */ + if (index < 0) + index = 2; + return COSTS_N_INSNS (ix86_cost->sse_unaligned_load[index]) / 2; - if (SUBREG_P (src)) - { - unsigned int srcpos = SUBREG_BYTE (src); + case unaligned_store: + index = sse_store_index (mode); + /* See PR82713 - we may end up being called on non-vector type. 
*/ + if (index < 0) + index = 2; + return COSTS_N_INSNS (ix86_cost->sse_unaligned_store[index]) / 2; - if (srcpos > 0) - { - rtx extr_ops[4]; + case vector_gather_load: + return ix86_vec_cost (mode, + COSTS_N_INSNS + (ix86_cost->gather_static + + ix86_cost->gather_per_elt + * TYPE_VECTOR_SUBPARTS (vectype)) / 2); - extr_ops[0] = gen_reg_rtx (srcmode); - extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src)); - extr_ops[2] = GEN_INT (size); - extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT); + case vector_scatter_store: + return ix86_vec_cost (mode, + COSTS_N_INSNS + (ix86_cost->scatter_static + + ix86_cost->scatter_per_elt + * TYPE_VECTOR_SUBPARTS (vectype)) / 2); - if (!ix86_expand_pextr (extr_ops)) - return false; + case cond_branch_taken: + return ix86_cost->cond_taken_branch_cost; - src = extr_ops[0]; - } - else - src = gen_lowpart (srcmode, SUBREG_REG (src)); - } + case cond_branch_not_taken: + return ix86_cost->cond_not_taken_branch_cost; - if (GET_MODE (dst) == dstmode) - d = dst; - else - d = gen_reg_rtx (dstmode); + case vec_perm: + case vec_promote_demote: + return ix86_vec_cost (mode, ix86_cost->sse_op); - emit_insn (pinsr (d, gen_lowpart (dstmode, dst), - gen_lowpart (srcmode, src), - GEN_INT (1 << (pos / size)))); - if (d != dst) - emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d)); - return true; - } + case vec_construct: + { + /* N element inserts into SSE vectors. */ + int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op; + /* One vinserti128 for combining two SSE vectors for AVX256. */ + if (GET_MODE_BITSIZE (mode) == 256) + cost += ix86_vec_cost (mode, ix86_cost->addss); + /* One vinserti64x4 and two vinserti128 for combining SSE + and AVX256 vectors to AVX512. */ + else if (GET_MODE_BITSIZE (mode) == 512) + cost += 3 * ix86_vec_cost (mode, ix86_cost->addss); + return cost; + } - default: - return false; + default: + gcc_unreachable (); } } + /* This function returns the calling abi specific va_list type node. It returns the FNDECL specific va_list type. */ @@ -50134,39 +21319,6 @@ ix86_preferred_simd_mode (scalar_mode mode) } } -/* All CPUs prefer to avoid cross-lane operations so perform reductions - upper against lower halves up to SSE reg size. */ - -static machine_mode -ix86_split_reduction (machine_mode mode) -{ - /* Reduce lowpart against highpart until we reach SSE reg width to - avoid cross-lane operations. */ - switch (mode) - { - case E_V8DImode: - case E_V4DImode: - return V2DImode; - case E_V16SImode: - case E_V8SImode: - return V4SImode; - case E_V32HImode: - case E_V16HImode: - return V8HImode; - case E_V64QImode: - case E_V32QImode: - return V16QImode; - case E_V16SFmode: - case E_V8SFmode: - return V4SFmode; - case E_V8DFmode: - case E_V4DFmode: - return V2DFmode; - default: - return mode; - } -} - /* If AVX is enabled then try vectorizing with both 256bit and 128bit vectors. If AVX512F is enabled then try vectorizing with 512bit, 256bit and 128bit vectors. */ @@ -50687,50 +21839,6 @@ ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, return ret; } -/* Add target attribute to SIMD clone NODE if needed. */ - -static void -ix86_simd_clone_adjust (struct cgraph_node *node) -{ - const char *str = NULL; - - /* Attributes need to be adjusted for definitions, not declarations. 
*/ - if (!node->definition) - return; - - gcc_assert (node->decl == cfun->decl); - switch (node->simdclone->vecsize_mangle) - { - case 'b': - if (!TARGET_SSE2) - str = "sse2"; - break; - case 'c': - if (!TARGET_AVX) - str = "avx"; - break; - case 'd': - if (!TARGET_AVX2) - str = "avx2"; - break; - case 'e': - if (!TARGET_AVX512F) - str = "avx512f"; - break; - default: - gcc_unreachable (); - } - if (str == NULL) - return; - push_cfun (NULL); - tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str)); - bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0); - gcc_assert (ok); - pop_cfun (); - ix86_reset_previous_fndecl (); - ix86_set_current_function (node->decl); -} - /* If SIMD clone NODE can't be used in a vectorized loop in current function, return -1, otherwise return a badness of using it (0 if it is most desirable from vecsize_mangle point of view, 1 @@ -50839,10 +21947,10 @@ ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update) tree fenv_ptr = build_pointer_type (fenv_type); tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var); fenv_addr = fold_convert (ptr_type_node, fenv_addr); - tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV]; - tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV]; - tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW]; - tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX]; + tree fnstenv = get_ix86_builtin (IX86_BUILTIN_FNSTENV); + tree fldenv = get_ix86_builtin (IX86_BUILTIN_FLDENV); + tree fnstsw = get_ix86_builtin (IX86_BUILTIN_FNSTSW); + tree fnclex = get_ix86_builtin (IX86_BUILTIN_FNCLEX); tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr); tree hold_fnclex = build_call_expr (fnclex, 0); fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv, @@ -50866,8 +21974,8 @@ ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update) { tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node); tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node); - tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR]; - tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR]; + tree stmxcsr = get_ix86_builtin (IX86_BUILTIN_STMXCSR); + tree ldmxcsr = get_ix86_builtin (IX86_BUILTIN_LDMXCSR); tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0); tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node, mxcsr_orig_var, stmxcsr_hold_call); @@ -51110,22 +22218,6 @@ ix86_init_libfuncs (void) #endif } -/* Generate call to __divmoddi4. */ - -static void -ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode, - rtx op0, rtx op1, - rtx *quot_p, rtx *rem_p) -{ - rtx rem = assign_386_stack_local (mode, SLOT_TEMP); - - rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL, - mode, op0, mode, op1, mode, - XEXP (rem, 0), Pmode); - *quot_p = quot; - *rem_p = rem; -} - /* Set the value of FLT_EVAL_METHOD in float.h. When using only the FPU, assume that the fpcw is set to extended precision; when using only SSE, rounding is correct; when using both SSE and the FPU, diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 2cb16d9fbf6..ad6c36ba265 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -2759,6 +2759,9 @@ struct GTY(()) machine_function { /* During SEH output, this is non-null. 
*/ struct seh_frame_state * GTY((skip(""))) seh; }; + +extern GTY(()) tree sysv_va_list_type_node; +extern GTY(()) tree ms_va_list_type_node; #endif #define ix86_stack_locals (cfun->machine->stack_locals) @@ -2856,6 +2859,12 @@ extern void debug_dispatch_window (int); #define TARGET_SUPPORTS_WIDE_INT 1 +#if !defined(GENERATOR_FILE) && !defined(IN_LIBGCC2) +extern enum attr_cpu ix86_schedule; + +#define NUM_X86_64_MS_CLOBBERED_REGS 12 +#endif + /* Local variables: version-control: t diff --git a/gcc/config/i386/t-i386 b/gcc/config/i386/t-i386 index 0dac80fbc46..50caf2c6961 100644 --- a/gcc/config/i386/t-i386 +++ b/gcc/config/i386/t-i386 @@ -44,6 +44,22 @@ i386-d.o: $(srcdir)/config/i386/i386-d.c $(COMPILE) $< $(POSTCOMPILE) +i386-options.o: $(srcdir)/config/i386/i386-options.c + $(COMPILE) $< + $(POSTCOMPILE) + +i386-builtins.o: $(srcdir)/config/i386/i386-builtins.c + $(COMPILE) $< + $(POSTCOMPILE) + +i386-expand.o: $(srcdir)/config/i386/i386-expand.c + $(COMPILE) $< + $(POSTCOMPILE) + +i386-features.o: $(srcdir)/config/i386/i386-features.c + $(COMPILE) $< + $(POSTCOMPILE) + i386.o: i386-builtin-types.inc i386-builtin-types.inc: s-i386-bt ; @true -- 2.30.2
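The notes and snippets below are usage-level illustrations for the hooks touched in the hunks above; none of the code is part of the patch, and every identifier in the examples is made up unless stated otherwise.

The "=@cc<cond>" constraint strings parsed by ix86_md_asm_adjust come from the extended-asm flag-output extension. A minimal sketch of how user code reaches that path:

    #include <stdbool.h>

    /* Return the carry flag produced by BT directly: "=@ccc" lets GCC
       read CF itself (FLAGS_REG in CCCmode, per the switch above)
       instead of requiring a setc inside the asm template; "=@ccnc"
       would give the inverted test.  */
    static bool
    bit_is_set (const unsigned int *word, int bit)
    {
      bool cf;
      __asm__ ("btl %2, %1" : "=@ccc" (cf) : "m" (*word), "Ir" (bit));
      return cf;
    }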
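ix86_mangle_type only covers the two extended floating-point modes; the declarations below (plain C, but the mangled forms apply when the translation unit is compiled as C++) show what the returned strings mean:

    /* TFmode (__float128) mangles as "g" and XFmode (long double or
       __float80) as "e", so as C++ these become _Z10takes_f128g and
       _Z8takes_lde; every other type falls back to the default
       mangling because the hook returns NULL.  */
    void takes_f128 (__float128);
    void takes_ld (long double);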
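ix86_stack_protect_guard builds either the default __stack_chk_guard access or a TLS-relative load controlled by the -mstack-protector-guard* options. A hedged example of driving it from the command line; the register and offset are just one plausible choice (40, i.e. 0x28, is the glibc TCB slot on x86-64), and canary.c is a made-up file name:

    /* Build with, e.g.:
         gcc -O2 -fstack-protector-strong -mstack-protector-guard=tls \
             -mstack-protector-guard-reg=fs \
             -mstack-protector-guard-offset=40 canary.c
       so the canary load is pinned to %fs:40 rather than the global
       __stack_chk_guard; with -mstack-protector-guard-symbol=<name>
       the load instead goes through the artificial TLS decl cached in
       ix86_tls_stack_chk_guard_decl.  */
    #include <string.h>

    void
    fill (char *dst, const char *src)
    {
      char buf[64];               /* local array => frame gets a canary */
      strcpy (buf, src);
      strcpy (dst, buf);
    }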
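The "/ 2" rebasing in ix86_builtin_vectorization_cost is easy to misread, so here is the arithmetic spelled out (the table value 6 is illustrative, not taken from any real tune):

    /* The tune tables express load/store cost in units where a
       register-to-register move costs 2, while the vectorizer expects
       COSTS_N_INSNS units (COSTS_N_INSNS (N) == N * 4).  For an
       illustrative entry int_load[2] == 6:

         COSTS_N_INSNS (6) / 2  ==  24 / 2  ==  12  ==  COSTS_N_INSNS (3)

       i.e. a load costing three register moves is reported as three
       instructions, and a load as cheap as a move (table value 2)
       comes out as COSTS_N_INSNS (1).  */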
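ix86_simd_clone_compute_vecsize_and_simdlen and ix86_simd_clone_adjust (removed here as part of the split) operate on OpenMP SIMD clones. A minimal input that produces the four x86 ISA variants; only the SSE2 clone name is spelled out, assuming the standard vector-ABI scheme and a float/simdlen-4 signature:

    /* With -fopenmp-simd this function gets ISA-specific clones whose
       vector-ABI letters 'b', 'c', 'd' and 'e' correspond to SSE2,
       AVX, AVX2 and AVX-512F (e.g. _ZGVbN4v_addone for the 128-bit
       SSE2 variant); ix86_simd_clone_adjust then force-enables the
       matching target attribute on each clone's definition.  */
    #pragma omp declare simd
    float
    addone (float x)
    {
      return x + 1.0f;
    }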
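The ix86_atomic_assign_expand_fenv hunk only swaps direct ix86_builtins[] accesses for get_ix86_builtin; for context, the trees it builds guard C11 atomic compound assignment on floating types, as in this illustrative snippet:

    #include <stdatomic.h>

    _Atomic double total;

    /* The compare-and-exchange loop generated for the += may retry;
       the fnstenv/fnclex (and stmxcsr/ldmxcsr) sequences built by
       ix86_atomic_assign_expand_fenv keep FP exception flags raised
       by discarded iterations from remaining visible afterwards.  */
    void
    add_sample (double x)
    {
      total += x;
    }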
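On the build side, listing the new files in target_gtfiles (config.gcc hunk near the top of the patch) pairs with the usual gengtype convention: a source file that owns GTY(()) roots ends by including its generated header so the GC and PCH tables land in the matching object. The include below uses the conventional name gengtype derives from i386-builtins.c; it is the standard pattern, not a line quoted from this patch:

    /* Conventional last line of a target_gtfiles source file such as
       i386-builtins.c, which now owns roots like the builtin decl
       table: it pulls in the tables gengtype emits for this file.  */
    #include "gt-i386-builtins.h"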