From: Jakub Jelinek Date: Wed, 28 Oct 2020 09:28:18 +0000 (+0100) Subject: lto: LTO cgraph support for late declare variant resolution [PR96680] X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=f165ef89c08ddabb19583e45e8a6819f810d95ab;p=gcc.git lto: LTO cgraph support for late declare variant resolution [PR96680] > I've tried to add the saving/restoring next to ipa refs saving/restoring, as > the declare variant alt stuff is kind of extension of those, unfortunately > following doesn't compile, because I need to also write or read a tree there > (ctx is a portion of DECL_ATTRIBUTES of the base function), but the ipa refs > write/read back functions don't have arguments that can be used for that. This patch adds the streaming out and in of the omp_declare_variant_alt hash table on-the-side data for the declare_variant_alt cgraph_nodes and, for LTO purposes, treats the declare_variant_alt nodes (which have no body) as if they contained a body that calls all the possible variants. After IPA, all the calls to these magic declare_variant_alt functions are replaced with a call to one of the variants, depending on which one has the highest score in the context. 2020-10-28 Jakub Jelinek PR lto/96680 gcc/ * lto-streamer.h (omp_lto_output_declare_variant_alt, omp_lto_input_declare_variant_alt): Declare variant. * symtab.c (symtab_node::get_partitioning_class): Return SYMBOL_DUPLICATE for declare_variant_alt nodes. * passes.c (ipa_write_summaries): Add declare_variant_alt to partition. * lto-cgraph.c (output_refs): Call omp_lto_output_declare_variant_alt on declare_variant_alt nodes. (input_refs): Call omp_lto_input_declare_variant_alt on declare_variant_alt nodes. * lto-streamer-out.c (output_function): Don't call collect_block_tree_leafs if DECL_INITIAL is error_mark_node. (lto_output): Call output_function even for declare_variant_alt nodes. * omp-general.c (omp_lto_output_declare_variant_alt, omp_lto_input_declare_variant_alt): New functions. 
gcc/lto/ * lto-common.c (lto_fixup_prevailing_decls): Don't use LTO_NO_PREVAIL on TREE_LIST's TREE_PURPOSE. * lto-partition.c (lto_balanced_map): Treat declare_variant_alt nodes like definitions. libgomp/ * testsuite/libgomp.c/declare-variant-1.c: New test. --- diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c index 19d4ca54e92..91900a12fa2 100644 --- a/gcc/lto-cgraph.c +++ b/gcc/lto-cgraph.c @@ -767,6 +767,9 @@ output_refs (lto_symtab_encoder_t encoder) for (int i = 0; node->iterate_reference (i, ref); i++) lto_output_ref (ob, ref, encoder); } + if (cgraph_node *cnode = dyn_cast (node)) + if (cnode->declare_variant_alt) + omp_lto_output_declare_variant_alt (ob, cnode, encoder); } streamer_write_uhwi_stream (ob->main_stream, 0); @@ -1608,6 +1611,9 @@ input_refs (class lto_input_block *ib, input_ref (ib, node, nodes); count--; } + if (cgraph_node *cnode = dyn_cast (node)) + if (cnode->declare_variant_alt) + omp_lto_input_declare_variant_alt (ib, cnode, nodes); } } diff --git a/gcc/lto-streamer-out.c b/gcc/lto-streamer-out.c index 7882c89388d..0ca2796da9c 100644 --- a/gcc/lto-streamer-out.c +++ b/gcc/lto-streamer-out.c @@ -2424,7 +2424,7 @@ output_function (struct cgraph_node *node) /* As we do not recurse into BLOCK_SUBBLOCKS but only BLOCK_SUPERCONTEXT collect block tree leafs and stream those. */ auto_vec block_tree_leafs; - if (DECL_INITIAL (function)) + if (DECL_INITIAL (function) && DECL_INITIAL (function) != error_mark_node) collect_block_tree_leafs (DECL_INITIAL (function), block_tree_leafs); streamer_write_uhwi (ob, block_tree_leafs.length ()); for (unsigned i = 0; i < block_tree_leafs.length (); ++i) @@ -2788,7 +2788,8 @@ lto_output (void) && flag_incremental_link != INCREMENTAL_LINK_LTO) /* Thunks have no body but they may be synthetized at WPA time. 
*/ - || DECL_ARGUMENTS (cnode->decl))) + || DECL_ARGUMENTS (cnode->decl) + || cnode->declare_variant_alt)) output_function (cnode); else if ((vnode = dyn_cast (snode)) && (DECL_INITIAL (vnode->decl) != error_mark_node diff --git a/gcc/lto-streamer.h b/gcc/lto-streamer.h index b465a5e9c18..c75a8b2cc4a 100644 --- a/gcc/lto-streamer.h +++ b/gcc/lto-streamer.h @@ -927,6 +927,12 @@ bool reachable_from_this_partition_p (struct cgraph_node *, lto_symtab_encoder_t compute_ltrans_boundary (lto_symtab_encoder_t encoder); void select_what_to_stream (void); +/* In omp-general.c. */ +void omp_lto_output_declare_variant_alt (lto_simple_output_block *, + cgraph_node *, lto_symtab_encoder_t); +void omp_lto_input_declare_variant_alt (lto_input_block *, cgraph_node *, + vec); + /* In options-save.c. */ void cl_target_option_stream_out (struct output_block *, struct bitpack_d *, struct cl_target_option *); diff --git a/gcc/lto/lto-common.c b/gcc/lto/lto-common.c index 3ca0fd83a41..6944c469f89 100644 --- a/gcc/lto/lto-common.c +++ b/gcc/lto/lto-common.c @@ -2592,7 +2592,6 @@ lto_fixup_prevailing_decls (tree t) case TREE_LIST: LTO_SET_PREVAIL (TREE_VALUE (t)); LTO_SET_PREVAIL (TREE_PURPOSE (t)); - LTO_NO_PREVAIL (TREE_PURPOSE (t)); break; default: gcc_unreachable (); diff --git a/gcc/lto/lto-partition.c b/gcc/lto/lto-partition.c index 7c15181f66f..cc721f91586 100644 --- a/gcc/lto/lto-partition.c +++ b/gcc/lto/lto-partition.c @@ -593,7 +593,8 @@ lto_balanced_map (int n_lto_partitions, int max_partition_size) last_visited_node++; - gcc_assert (node->definition || node->weakref); + gcc_assert (node->definition || node->weakref + || node->declare_variant_alt); /* Compute boundary cost of callgraph edges. 
*/ for (edge = node->callees; edge; edge = edge->next_callee) @@ -704,7 +705,7 @@ lto_balanced_map (int n_lto_partitions, int max_partition_size) int index; node = dyn_cast (ref->referring); - gcc_assert (node->definition); + gcc_assert (node->definition || node->declare_variant_alt); index = lto_symtab_encoder_lookup (partition->encoder, node); if (index != LCC_NOT_FOUND diff --git a/gcc/omp-general.c b/gcc/omp-general.c index b70e3e31352..b66dfb58257 100644 --- a/gcc/omp-general.c +++ b/gcc/omp-general.c @@ -42,6 +42,8 @@ along with GCC; see the file COPYING3. If not see #include "tree-pass.h" #include "omp-device-properties.h" #include "tree-iterator.h" +#include "data-streamer.h" +#include "streamer-hooks.h" enum omp_requires omp_requires_mask; @@ -2337,6 +2339,125 @@ omp_resolve_declare_variant (tree base) ? TREE_PURPOSE (TREE_VALUE (variant1)) : base); } +void +omp_lto_output_declare_variant_alt (lto_simple_output_block *ob, + cgraph_node *node, + lto_symtab_encoder_t encoder) +{ + gcc_assert (node->declare_variant_alt); + + omp_declare_variant_base_entry entry; + entry.base = NULL; + entry.node = node; + entry.variants = NULL; + omp_declare_variant_base_entry *entryp + = omp_declare_variant_alt->find_with_hash (&entry, DECL_UID (node->decl)); + gcc_assert (entryp); + + int nbase = lto_symtab_encoder_lookup (encoder, entryp->base); + gcc_assert (nbase != LCC_NOT_FOUND); + streamer_write_hwi_stream (ob->main_stream, nbase); + + streamer_write_hwi_stream (ob->main_stream, entryp->variants->length ()); + + unsigned int i; + omp_declare_variant_entry *varentry; + FOR_EACH_VEC_SAFE_ELT (entryp->variants, i, varentry) + { + int nvar = lto_symtab_encoder_lookup (encoder, varentry->variant); + gcc_assert (nvar != LCC_NOT_FOUND); + streamer_write_hwi_stream (ob->main_stream, nvar); + + for (widest_int *w = &varentry->score; ; + w = &varentry->score_in_declare_simd_clone) + { + unsigned len = w->get_len (); + streamer_write_hwi_stream (ob->main_stream, len); + const 
HOST_WIDE_INT *val = w->get_val (); + for (unsigned j = 0; j < len; j++) + streamer_write_hwi_stream (ob->main_stream, val[j]); + if (w == &varentry->score_in_declare_simd_clone) + break; + } + + HOST_WIDE_INT cnt = -1; + HOST_WIDE_INT i = varentry->matches ? 1 : 0; + for (tree attr = DECL_ATTRIBUTES (entryp->base->decl); + attr; attr = TREE_CHAIN (attr), i += 2) + { + attr = lookup_attribute ("omp declare variant base", attr); + if (attr == NULL_TREE) + break; + + if (varentry->ctx == TREE_VALUE (TREE_VALUE (attr))) + { + cnt = i; + break; + } + } + + gcc_assert (cnt != -1); + streamer_write_hwi_stream (ob->main_stream, cnt); + } +} + +void +omp_lto_input_declare_variant_alt (lto_input_block *ib, cgraph_node *node, + vec nodes) +{ + gcc_assert (node->declare_variant_alt); + omp_declare_variant_base_entry *entryp + = ggc_cleared_alloc (); + entryp->base = dyn_cast (nodes[streamer_read_hwi (ib)]); + entryp->node = node; + unsigned int len = streamer_read_hwi (ib); + vec_alloc (entryp->variants, len); + + for (unsigned int i = 0; i < len; i++) + { + omp_declare_variant_entry varentry; + varentry.variant + = dyn_cast (nodes[streamer_read_hwi (ib)]); + for (widest_int *w = &varentry.score; ; + w = &varentry.score_in_declare_simd_clone) + { + unsigned len2 = streamer_read_hwi (ib); + HOST_WIDE_INT arr[WIDE_INT_MAX_ELTS]; + gcc_assert (len2 <= WIDE_INT_MAX_ELTS); + for (unsigned int j = 0; j < len2; j++) + arr[j] = streamer_read_hwi (ib); + *w = widest_int::from_array (arr, len2, true); + if (w == &varentry.score_in_declare_simd_clone) + break; + } + + HOST_WIDE_INT cnt = streamer_read_hwi (ib); + HOST_WIDE_INT j = 0; + varentry.ctx = NULL_TREE; + varentry.matches = (cnt & 1) ? 
true : false; + cnt &= ~HOST_WIDE_INT_1; + for (tree attr = DECL_ATTRIBUTES (entryp->base->decl); + attr; attr = TREE_CHAIN (attr), j += 2) + { + attr = lookup_attribute ("omp declare variant base", attr); + if (attr == NULL_TREE) + break; + + if (cnt == j) + { + varentry.ctx = TREE_VALUE (TREE_VALUE (attr)); + break; + } + } + gcc_assert (varentry.ctx != NULL_TREE); + entryp->variants->quick_push (varentry); + } + if (omp_declare_variant_alt == NULL) + omp_declare_variant_alt + = hash_table::create_ggc (64); + *omp_declare_variant_alt->find_slot_with_hash (entryp, DECL_UID (node->decl), + INSERT) = entryp; +} /* Encode an oacc launch argument. This matches the GOMP_LAUNCH_PACK macro on gomp-constants.h. We do not check for overflow. */ diff --git a/gcc/passes.c b/gcc/passes.c index 02a47e2595c..079ad1a88f7 100644 --- a/gcc/passes.c +++ b/gcc/passes.c @@ -2731,7 +2731,8 @@ ipa_write_summaries (void) { struct cgraph_node *node = order[i]; - if (node->definition && node->need_lto_streaming) + if ((node->definition || node->declare_variant_alt) + && node->need_lto_streaming) { if (gimple_has_body_p (node->decl)) lto_prepare_function_for_streaming (node); diff --git a/gcc/symtab.c b/gcc/symtab.c index 067ae2e28a0..9db88fa8531 100644 --- a/gcc/symtab.c +++ b/gcc/symtab.c @@ -2006,7 +2006,7 @@ symtab_node::get_partitioning_class (void) if (DECL_ABSTRACT_P (decl)) return SYMBOL_EXTERNAL; - if (cnode && cnode->inlined_to) + if (cnode && (cnode->inlined_to || cnode->declare_variant_alt)) return SYMBOL_DUPLICATE; /* Transparent aliases are always duplicated. 
*/ diff --git a/libgomp/testsuite/libgomp.c/declare-variant-1.c b/libgomp/testsuite/libgomp.c/declare-variant-1.c new file mode 100644 index 00000000000..d16608f7e6d --- /dev/null +++ b/libgomp/testsuite/libgomp.c/declare-variant-1.c @@ -0,0 +1,54 @@ +/* { dg-do link { target vect_simd_clones } } */ +/* { dg-require-effective-target lto } */ +/* { dg-require-effective-target fpic } */ +/* { dg-require-effective-target shared } */ +/* { dg-additional-options "-fdump-tree-gimple -fdump-tree-optimized -O2 -fPIC -shared -flto -flto-partition=one" } */ +/* { dg-additional-options "-mno-sse3" { target { i?86-*-* x86_64-*-* } } } */ + +int +f01 (int a) +{ + asm volatile ("" : "+g" (a) : "g" (1) : "memory"); + return a; +} + +int +f02 (int a) +{ + asm volatile ("" : "+g" (a) : "g" (2) : "memory"); + return a; +} + +int +f03 (int a) +{ + asm volatile ("" : "+g" (a) : "g" (3) : "memory"); + return a; +} + +#pragma omp declare variant (f01) match (device={isa("avx512f")}) /* 4 or 8 */ +#pragma omp declare variant (f02) match (implementation={vendor(score(3):gnu)},device={kind(cpu)}) /* (1 or 2) + 3 */ +#pragma omp declare variant (f03) match (implementation={vendor(score(5):gnu)},device={kind(host)}) /* (1 or 2) + 5 */ +int +f04 (int a) +{ + asm volatile ("" : "+g" (a) : "g" (4) : "memory"); + return a; +} + +#pragma omp declare simd +int +test1 (int x) +{ + /* At gimplification time, we can't decide yet which function to call. */ + /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" } } */ + /* After simd clones are created, the original non-clone test1 shall + call f03 (score 6), the sse2/avx/avx2 clones too, but avx512f clones + shall call f01 with score 8. */ + /* { dg-final { scan-ltrans-tree-dump-not "f04 \\\(x" "optimized" } } */ + /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 14 "optimized" } } */ + /* { dg-final { scan-ltrans-tree-dump-times "f01 \\\(x" 4 "optimized" } } */ + int a = f04 (x); + int b = f04 (x); + return a + b; +}