From: Richard Sandiford Date: Fri, 15 Jan 2021 16:45:42 +0000 (+0000) Subject: aarch64: Add a minipass for fusing CC insns [PR88836] X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=5a783f42d77b2f00a1ed171c119b020e8df8e521;p=gcc.git aarch64: Add a minipass for fusing CC insns [PR88836] This patch adds a small target-specific pass to remove redundant SVE PTEST instructions. There are two important uses of this: - Removing PTESTs after WHILELOs (PR88836). The original testcase no longer exhibits the problem due to more recent optimisations, but it can still be seen in simple cases like the one in the patch. It also shows up in 450.soplex. - Removing PTESTs after RDFFRs in ACLE code. This is just an interim “solution” for GCC 11. I hope to replace it with something generic and target-independent for GCC 12. However, the use cases above are very important for performance, so I'd rather not leave the bug unfixed for yet another release cycle. Since the pass is intended to be short-lived, I've not added a command-line option for it. The pass can be disabled using -fdisable-rtl-cc_fusion if necessary. Although what the pass does is independent of SVE, it's motivated only by SVE cases and doesn't trigger for any non-SVE test I've seen. I've therefore gated it on TARGET_SVE and restricted it to PTEST patterns. gcc/ PR target/88836 * config.gcc (aarch64*-*-*): Add aarch64-cc-fusion.o to extra_objs. * Makefile.in (RTL_SSA_H): New variable. * config/aarch64/t-aarch64 (aarch64-cc-fusion.o): New rule. * config/aarch64/aarch64-protos.h (make_pass_cc_fusion): Declare. * config/aarch64/aarch64-passes.def: Add pass_cc_fusion after pass_combine. * config/aarch64/aarch64-cc-fusion.cc: New file. gcc/testsuite/ PR target/88836 * gcc.target/aarch64/sve/acle/general/ldff1_8.c: New test. * gcc.target/aarch64/sve/ptest_1.c: Likewise. --- diff --git a/gcc/Makefile.in b/gcc/Makefile.in index de8af617488..a63c5d9cab6 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1024,6 +1024,13 @@ PLUGIN_H = plugin.h $(GCC_PLUGIN_H) PLUGIN_VERSION_H = plugin-version.h configargs.h CONTEXT_H = context.h GENSUPPORT_H = gensupport.h read-md.h optabs.def +RTL_SSA_H = $(PRETTY_PRINT_H) insn-config.h splay-tree-utils.h \ + $(RECOG_H) $(REGS_H) function-abi.h obstack-utils.h \ + mux-utils.h rtlanal.h memmodel.h $(EMIT_RTL_H) \ + rtl-ssa/accesses.h rtl-ssa/insns.h rtl-ssa/blocks.h \ + rtl-ssa/changes.h rtl-ssa/functions.h rtl-ssa/is-a.inl \ + rtl-ssa/access-utils.h rtl-ssa/insn-utils.h rtl-ssa/movement.h \ + rtl-ssa/change-utils.h rtl-ssa/member-fns.inl # # Now figure out from those variables how to compile and link. diff --git a/gcc/config.gcc b/gcc/config.gcc index 9fb57e96121..17fea83b2e4 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -321,7 +321,7 @@ aarch64*-*-*) c_target_objs="aarch64-c.o" cxx_target_objs="aarch64-c.o" d_target_objs="aarch64-d.o" - extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o aarch64-sve-builtins-sve2.o cortex-a57-fma-steering.o aarch64-speculation.o falkor-tag-collision-avoidance.o aarch64-bti-insert.o" + extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o aarch64-sve-builtins-sve2.o cortex-a57-fma-steering.o aarch64-speculation.o falkor-tag-collision-avoidance.o aarch64-bti-insert.o aarch64-cc-fusion.o" target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.c \$(srcdir)/config/aarch64/aarch64-sve-builtins.h \$(srcdir)/config/aarch64/aarch64-sve-builtins.cc" target_has_targetm_common=yes ;; diff --git a/gcc/config/aarch64/aarch64-cc-fusion.cc b/gcc/config/aarch64/aarch64-cc-fusion.cc new file mode 100644 index 00000000000..09069a20de2 --- /dev/null +++ b/gcc/config/aarch64/aarch64-cc-fusion.cc @@ -0,0 +1,296 @@ +// Pass to fuse CC operations with other instructions. +// Copyright (C) 2021 Free Software Foundation, Inc. +// +// This file is part of GCC. +// +// GCC is free software; you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 3, or (at your option) any later +// version. +// +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License +// along with GCC; see the file COPYING3. If not see +// . + +// This pass looks for sequences of the form: +// +// A: (set (reg R1) X1) +// B: ...instructions that might change the value of X1... +// C: (set (reg CC) X2) // X2 uses R1 +// +// and tries to change them to: +// +// C': [(set (reg CC) X2') +// (set (reg R1) X1)] +// B: ...instructions that might change the value of X1... +// +// where X2' is the result of replacing R1 with X1 in X2. +// +// This sequence occurs in SVE code in two important cases: +// +// (a) Sometimes, to deal correctly with overflow, we need to increment +// an IV after a WHILELO rather than before it. In this case: +// - A is a WHILELO, +// - B includes an IV increment and +// - C is a separate PTEST. +// +// (b) ACLE code of the form: +// +// svbool_t ok = svrdffr (); +// if (svptest_last (pg, ok)) +// ... +// +// must, for performance reasons, be code-generated as: +// +// RDFFRS Pok.B, Pg/Z +// ...branch on flags result... +// +// without a separate PTEST of Pok. In this case: +// - A is an aarch64_rdffr +// - B includes an aarch64_update_ffrt +// - C is a separate PTEST +// +// Combine can handle this optimization if B doesn't exist and if A and +// C are in the same BB. This pass instead handles cases where B does +// exist and cases where A and C are in different BBs of the same EBB. + +#define IN_TARGET_CODE 1 + +#define INCLUDE_ALGORITHM +#define INCLUDE_FUNCTIONAL +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "rtl.h" +#include "df.h" +#include "rtl-ssa.h" +#include "tree-pass.h" + +using namespace rtl_ssa; + +namespace { +const pass_data pass_data_cc_fusion = +{ + RTL_PASS, // type + "cc_fusion", // name + OPTGROUP_NONE, // optinfo_flags + TV_NONE, // tv_id + 0, // properties_required + 0, // properties_provided + 0, // properties_destroyed + 0, // todo_flags_start + TODO_df_finish, // todo_flags_finish +}; + +// Class that represents one run of the pass. +class cc_fusion +{ +public: + cc_fusion () : m_parallel () {} + void execute (); + +private: + rtx optimizable_set (const insn_info *); + bool parallelize_insns (def_info *, rtx, def_info *, rtx); + void optimize_cc_setter (def_info *, rtx); + + // A spare PARALLEL rtx, or null if none. + rtx m_parallel; +}; + +// See whether INSN is a single_set that we can optimize. Return the +// set if so, otherwise return null. +rtx +cc_fusion::optimizable_set (const insn_info *insn) +{ + if (!insn->can_be_optimized () + || insn->is_asm () + || insn->has_volatile_refs () + || insn->has_pre_post_modify ()) + return NULL_RTX; + + return single_set (insn->rtl ()); +} + +// CC_SET is a single_set that sets (only) CC_DEF; OTHER_SET is likewise +// a single_set that sets (only) OTHER_DEF. CC_SET is known to set the +// CC register and the instruction that contains CC_SET is known to use +// OTHER_DEF. Try to do CC_SET and OTHER_SET in parallel. +bool +cc_fusion::parallelize_insns (def_info *cc_def, rtx cc_set, + def_info *other_def, rtx other_set) +{ + auto attempt = crtl->ssa->new_change_attempt (); + + insn_info *cc_insn = cc_def->insn (); + insn_info *other_insn = other_def->insn (); + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "trying to parallelize insn %d and insn %d\n", + other_insn->uid (), cc_insn->uid ()); + + // Try to substitute OTHER_SET into CC_INSN. + insn_change_watermark rtl_watermark; + rtx_insn *cc_rtl = cc_insn->rtl (); + insn_propagation prop (cc_rtl, SET_DEST (other_set), + SET_SRC (other_set)); + if (!prop.apply_to_pattern (&PATTERN (cc_rtl)) + || prop.num_replacements == 0) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "-- failed to substitute all uses of r%d\n", + other_def->regno ()); + return false; + } + + // Restrict the uses to those outside notes. + use_array cc_uses = remove_note_accesses (attempt, cc_insn->uses ()); + use_array other_set_uses = remove_note_accesses (attempt, + other_insn->uses ()); + + // Remove the use of the substituted value. + access_array_builder uses_builder (attempt); + uses_builder.reserve (cc_uses.size ()); + for (use_info *use : cc_uses) + if (use->def () != other_def) + uses_builder.quick_push (use); + cc_uses = use_array (uses_builder.finish ()); + + // Get the list of uses for the new instruction. + insn_change cc_change (cc_insn); + cc_change.new_uses = merge_access_arrays (attempt, other_set_uses, cc_uses); + if (!cc_change.new_uses.is_valid ()) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "-- cannot merge uses\n"); + return false; + } + + // The instruction initially defines just two registers. recog can add + // extra clobbers if necessary. + auto_vec new_defs; + new_defs.quick_push (cc_def); + new_defs.quick_push (other_def); + sort_accesses (new_defs); + cc_change.new_defs = def_array (access_array (new_defs)); + + // Make sure there is somewhere that the new instruction could live. + auto other_change = insn_change::delete_insn (other_insn); + insn_change *changes[] = { &other_change, &cc_change }; + cc_change.move_range = cc_insn->ebb ()->insn_range (); + if (!restrict_movement_ignoring (cc_change, insn_is_changing (changes))) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "-- cannot satisfy all definitions and uses\n"); + return false; + } + + // Tentatively install the new pattern. By convention, the CC set + // must be first. + if (m_parallel) + { + XVECEXP (m_parallel, 0, 0) = cc_set; + XVECEXP (m_parallel, 0, 1) = other_set; + } + else + { + rtvec vec = gen_rtvec (2, cc_set, other_set); + m_parallel = gen_rtx_PARALLEL (VOIDmode, vec); + } + validate_change (cc_rtl, &PATTERN (cc_rtl), m_parallel, 1); + + // These routines report failures themselves. + if (!recog_ignoring (attempt, cc_change, insn_is_changing (changes)) + || !changes_are_worthwhile (changes) + || !crtl->ssa->verify_insn_changes (changes)) + return false; + + remove_reg_equal_equiv_notes (cc_rtl); + confirm_change_group (); + crtl->ssa->change_insns (changes); + m_parallel = NULL_RTX; + return true; +} + +// Try to optimize the instruction that contains CC_DEF, where CC_DEF describes +// a definition of the CC register by CC_SET. +void +cc_fusion::optimize_cc_setter (def_info *cc_def, rtx cc_set) +{ + // Search the registers used by the CC setter for an easily-substitutable + // def-use chain. + for (use_info *other_use : cc_def->insn ()->uses ()) + if (def_info *other_def = other_use->def ()) + if (other_use->regno () != CC_REGNUM + && other_def->ebb () == cc_def->ebb ()) + if (rtx other_set = optimizable_set (other_def->insn ())) + { + rtx dest = SET_DEST (other_set); + if (REG_P (dest) + && REGNO (dest) == other_def->regno () + && REG_NREGS (dest) == 1 + && parallelize_insns (cc_def, cc_set, other_def, other_set)) + return; + } +} + +// Run the pass on the current function. +void +cc_fusion::execute () +{ + // Initialization. + calculate_dominance_info (CDI_DOMINATORS); + df_analyze (); + crtl->ssa = new rtl_ssa::function_info (cfun); + + // Walk through all instructions that set CC. Look for a PTEST instruction + // that we can optimize. + // + // ??? The PTEST test isn't needed for correctness, but it ensures that the + // pass no effect on non-SVE code. + for (def_info *def : crtl->ssa->reg_defs (CC_REGNUM)) + if (rtx cc_set = optimizable_set (def->insn ())) + if (REG_P (SET_DEST (cc_set)) + && REGNO (SET_DEST (cc_set)) == CC_REGNUM + && GET_CODE (SET_SRC (cc_set)) == UNSPEC + && XINT (SET_SRC (cc_set), 1) == UNSPEC_PTEST) + optimize_cc_setter (def, cc_set); + + // Finalization. + crtl->ssa->perform_pending_updates (); + free_dominance_info (CDI_DOMINATORS); +} + +class pass_cc_fusion : public rtl_opt_pass +{ +public: + pass_cc_fusion (gcc::context *ctxt) + : rtl_opt_pass (pass_data_cc_fusion, ctxt) + {} + + // opt_pass methods: + virtual bool gate (function *) { return TARGET_SVE && optimize >= 2; } + virtual unsigned int execute (function *); +}; + +unsigned int +pass_cc_fusion::execute (function *) +{ + cc_fusion ().execute (); + return 0; +} + +} // end namespace + +// Create a new CC fusion pass instance. + +rtl_opt_pass * +make_pass_cc_fusion (gcc::context *ctxt) +{ + return new pass_cc_fusion (ctxt); +} diff --git a/gcc/config/aarch64/aarch64-passes.def b/gcc/config/aarch64/aarch64-passes.def index 5c2bd521b58..0b773d2c34d 100644 --- a/gcc/config/aarch64/aarch64-passes.def +++ b/gcc/config/aarch64/aarch64-passes.def @@ -22,3 +22,4 @@ INSERT_PASS_AFTER (pass_regrename, 1, pass_fma_steering); INSERT_PASS_BEFORE (pass_reorder_blocks, 1, pass_track_speculation); INSERT_PASS_AFTER (pass_machine_reorg, 1, pass_tag_collision_avoidance); INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_bti); +INSERT_PASS_AFTER (pass_if_after_combine, 1, pass_cc_fusion); diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 2ef6b6d68f8..ff87ced2a34 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -790,6 +790,7 @@ rtl_opt_pass *make_pass_fma_steering (gcc::context *); rtl_opt_pass *make_pass_track_speculation (gcc::context *); rtl_opt_pass *make_pass_tag_collision_avoidance (gcc::context *); rtl_opt_pass *make_pass_insert_bti (gcc::context *ctxt); +rtl_opt_pass *make_pass_cc_fusion (gcc::context *ctxt); poly_uint64 aarch64_regmode_natural_size (machine_mode); diff --git a/gcc/config/aarch64/t-aarch64 b/gcc/config/aarch64/t-aarch64 index bb17eedbf9a..7e1606c47ac 100644 --- a/gcc/config/aarch64/t-aarch64 +++ b/gcc/config/aarch64/t-aarch64 @@ -158,6 +158,12 @@ aarch64-bti-insert.o: $(srcdir)/config/aarch64/aarch64-bti-insert.c \ $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ $(srcdir)/config/aarch64/aarch64-bti-insert.c +aarch64-cc-fusion.o: $(srcdir)/config/aarch64/aarch64-cc-fusion.cc \ + $(CONFIG_H) $(SYSTEM_H) $(CORETYPES_H) $(BACKEND_H) $(RTL_H) $(DF_H) \ + $(RTL_SSA_H) tree-pass.h + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ + $(srcdir)/config/aarch64/aarch64-cc-fusion.cc + comma=, MULTILIB_OPTIONS = $(subst $(comma),/, $(patsubst %, mabi=%, $(subst $(comma),$(comma)mabi=,$(TM_MULTILIB_CONFIG)))) MULTILIB_DIRNAMES = $(subst $(comma), ,$(TM_MULTILIB_CONFIG)) diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_8.c new file mode 100644 index 00000000000..b55cc1fd2b8 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/ldff1_8.c @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include + +void +foo (int8_t *ptr1, int8_t *ptr2, svint8_t *res1, svint8_t *res2) +{ + svbool_t pg = svptrue_b8 (); + + svsetffr (); + svint8_t x1 = svldff1 (pg, ptr1); + svbool_t ok1 = svrdffr (); + if (!svptest_last (pg, ok1)) + { + x1 = svsel (ok1, x1, svdup_s8 (0)); + svsetffr (); + } + + svint8_t x2 = svldff1 (pg, ptr2); + svbool_t ok2 = svrdffr (); + if (!svptest_last (pg, ok2)) + x2 = svsel (ok2, x2, svdup_s8 (0)); + + *res1 = x1; + *res2 = x2; +} + +/* { dg-final { scan-assembler-times {\trdffrs\t} 2 } } */ +/* { dg-final { scan-assembler-times {\t(?:b.last|b.nfrst)\t} 2 } } */ +/* { dg-final { scan-assembler-not {\trdffr\t} } } */ +/* { dg-final { scan-assembler-not {\tptest\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/ptest_1.c b/gcc/testsuite/gcc.target/aarch64/sve/ptest_1.c new file mode 100644 index 00000000000..abc51089480 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/ptest_1.c @@ -0,0 +1,10 @@ +/* { dg-options "-O2 -ftree-vectorize" } */ + +void +f (char *restrict x, char *restrict y, char *restrict z, unsigned long n) +{ + for (unsigned long i = 0; i < n; i += 1) + x[i] = y[i] + z[i]; +} + +/* { dg-final { scan-assembler-not {\tptest\t} } } */