From: Thomas Preud'homme Date: Tue, 28 Apr 2015 08:10:44 +0000 (+0000) Subject: re PR target/63503 ([AArch64] A57 executes fused multiply-add poorly in some situations) X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=fde9b31b61aa281b502e4f49200e3090aa8a4dd1;p=gcc.git re PR target/63503 ([AArch64] A57 executes fused multiply-add poorly in some situations) 2015-04-28 Thomas Preud'homme gcc/ PR target/63503 * config.gcc: Add cortex-a57-fma-steering.o to extra_objs for aarch64-*-*. * config/aarch64/t-aarch64: Add a rule for cortex-a57-fma-steering.o. * config/aarch64/aarch64.h (AARCH64_FL_USE_FMA_STEERING_PASS): Define. (AARCH64_TUNE_FMA_STEERING): Likewise. * config/aarch64/aarch64-cores.def: Set AARCH64_FL_USE_FMA_STEERING_PASS for cores with dynamic steering of FMUL/FMADD instructions. * config/aarch64/aarch64.c (aarch64_register_fma_steering): Declare. (aarch64_override_options): Include cortex-a57-fma-steering.h. Call aarch64_register_fma_steering () if AARCH64_TUNE_FMA_STEERING is true. * config/aarch64/cortex-a57-fma-steering.h: New file. * config/aarch64/cortex-a57-fma-steering.c: Likewise. From-SVN: r222512 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 66d86d9532b..070a150b31d 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,20 @@ +2015-04-28 Thomas Preud'homme + + PR target/63503 + * config.gcc: Add cortex-a57-fma-steering.o to extra_objs for + aarch64-*-*. + * config/aarch64/t-aarch64: Add a rule for cortex-a57-fma-steering.o. + * config/aarch64/aarch64.h (AARCH64_FL_USE_FMA_STEERING_PASS): Define. + (AARCH64_TUNE_FMA_STEERING): Likewise. + * config/aarch64/aarch64-cores.def: Set + AARCH64_FL_USE_FMA_STEERING_PASS for cores with dynamic steering of + FMUL/FMADD instructions. + * config/aarch64/aarch64.c (aarch64_register_fma_steering): Declare. + (aarch64_override_options): Include cortex-a57-fma-steering.h. Call + aarch64_register_fma_steering () if AARCH64_TUNE_FMA_STEERING is true. + * config/aarch64/cortex-a57-fma-steering.h: New file. + * config/aarch64/cortex-a57-fma-steering.c: Likewise. + 2015-04-28 Richard Sandiford * gensupport.c (std_preds): Add missing codes to address_operand entry. diff --git a/gcc/config.gcc b/gcc/config.gcc index 5c48cd0c682..f21421b1670 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -302,7 +302,7 @@ m32c*-*-*) aarch64*-*-*) cpu_type=aarch64 extra_headers="arm_neon.h arm_acle.h" - extra_objs="aarch64-builtins.o aarch-common.o" + extra_objs="aarch64-builtins.o aarch-common.o cortex-a57-fma-steering.o" target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.c" target_has_targetm_common=yes ;; diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def index 7c285ba52cc..dfc9cc85413 100644 --- a/gcc/config/aarch64/aarch64-cores.def +++ b/gcc/config/aarch64/aarch64-cores.def @@ -40,7 +40,7 @@ /* V8 Architecture Processors. */ AARCH64_CORE("cortex-a53", cortexa53, cortexa53, 8, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa53, "0x41", "0xd03") -AARCH64_CORE("cortex-a57", cortexa57, cortexa57, 8, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, "0x41", "0xd07") +AARCH64_CORE("cortex-a57", cortexa57, cortexa57, 8, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_USE_FMA_STEERING_PASS, cortexa57, "0x41", "0xd07") AARCH64_CORE("cortex-a72", cortexa72, cortexa57, 8, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, "0x41", "0xd08") AARCH64_CORE("exynos-m1", exynosm1, cortexa57, 8, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, cortexa57, "0x53", "0x001") AARCH64_CORE("thunderx", thunderx, thunderx, 8, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx, "0x43", "0x0a1") @@ -48,5 +48,5 @@ AARCH64_CORE("xgene1", xgene1, xgene1, 8, AARCH64_FL_FOR_ARCH8, xgen /* V8 big.LITTLE implementations. */ -AARCH64_CORE("cortex-a57.cortex-a53", cortexa57cortexa53, cortexa53, 8, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, "0x41", "0xd07.0xd03") +AARCH64_CORE("cortex-a57.cortex-a53", cortexa57cortexa53, cortexa53, 8, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_USE_FMA_STEERING_PASS, cortexa57, "0x41", "0xd07.0xd03") AARCH64_CORE("cortex-a72.cortex-a53", cortexa72cortexa53, cortexa53, 8, AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, "0x41", "0xd08.0xd03") diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index dd92934bdec..b8a91dc8fe5 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -95,6 +95,7 @@ #include "rtl-iter.h" #include "tm-constrs.h" #include "sched-int.h" +#include "cortex-a57-fma-steering.h" /* Defined for convenience. */ #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT) @@ -6884,6 +6885,9 @@ aarch64_override_options (void) align_functions = aarch64_tune_params->function_align; } + if (AARCH64_TUNE_FMA_STEERING) + aarch64_register_fma_steering (); + aarch64_override_options_after_change (); } diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index 1f7187bab9b..3fd1b3f0d32 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -200,6 +200,8 @@ extern unsigned aarch64_architecture_version; #define AARCH64_FL_CRYPTO (1 << 2) /* Has crypto. */ #define AARCH64_FL_SLOWMUL (1 << 3) /* A slow multiply core. */ #define AARCH64_FL_CRC (1 << 4) /* Has CRC. */ +/* Has static dispatch of FMA. */ +#define AARCH64_FL_USE_FMA_STEERING_PASS (1 << 5) /* Has FP and SIMD. */ #define AARCH64_FL_FPSIMD (AARCH64_FL_FP | AARCH64_FL_SIMD) @@ -220,6 +222,8 @@ extern unsigned long aarch64_isa_flags; /* Macros to test tuning flags. */ extern unsigned long aarch64_tune_flags; #define AARCH64_TUNE_SLOWMUL (aarch64_tune_flags & AARCH64_FL_SLOWMUL) +#define AARCH64_TUNE_FMA_STEERING \ + (aarch64_tune_flags & AARCH64_FL_USE_FMA_STEERING_PASS) /* Crypto is an optional extension to AdvSIMD. */ #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO) diff --git a/gcc/config/aarch64/cortex-a57-fma-steering.c b/gcc/config/aarch64/cortex-a57-fma-steering.c new file mode 100644 index 00000000000..3d7557ac2b4 --- /dev/null +++ b/gcc/config/aarch64/cortex-a57-fma-steering.c @@ -0,0 +1,1088 @@ +/* FMA steering optimization pass for Cortex-A57. + Copyright (C) 2015 Free Software Foundation, Inc. + Contributed by ARM Ltd. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "regs.h" +#include "insn-config.h" +#include "hard-reg-set.h" +#include "dominance.h" +#include "cfg.h" +#include "cfganal.h" +#include "predict.h" +#include "basic-block.h" +#include "insn-attr.h" +#include "machmode.h" +#include "recog.h" +#include "output.h" +#include "vec.h" +#include "hash-map.h" +#include "bitmap.h" +#include "obstack.h" +#include "df.h" +#include "target.h" +#include "rtl.h" +#include "context.h" +#include "tree-pass.h" +#include "regrename.h" +#include "cortex-a57-fma-steering.h" + +#include + +/* For better performance, the destination of FMADD/FMSUB instructions should + have the same parity as their accumulator register if the accumulator + contains the result of a previous FMUL or FMADD/FMSUB instruction if + targetting Cortex-A57 processors. Performance is also increased by + otherwise keeping a good balance in the parity of the destination register + of FMUL or FMADD/FMSUB. + + This pass ensure that registers are renamed so that these conditions hold. + We reuse the existing register renaming facility from regrename.c to build + dependency chains and expose candidate registers for renaming. + + + The algorithm has three steps: + + First, the functions of the register renaming pass are called. These + analyze the instructions and produce a list of def/use chains of + instructions. + + Next, this information is used to build trees of multiply and + multiply-accumulate instructions. The roots of these trees are any + multiply, or any multiply-accumulate whose accumulator is not dependent on + a multiply or multiply-accumulate instruction. A child is added to the + tree where a dependency chain exists between the result of the parent + instruction and the accumulator operand of the child, as in the diagram + below: + + fmul s2, s0, s1 + / \ + fmadd s0, s1, s1, s2 fmadd s4, s1, s1 s2 + | + fmadd s3, s1, s1, s0 + + Trees made of a single instruction are permitted. + + Finally, renaming is performed. The parity of the destination register at + the root of a tree is checked against the current balance of multiply and + multiply-accumulate on each pipeline. If necessary, the root of a tree is + renamed, in which case the rest of the tree is then renamed to keep the same + parity in the destination registers of all instructions in the tree. */ + + + +/* Forward declarations. */ +class fma_node; +class fma_root_node; +class func_fma_steering; + +/* Dependencies between FMUL or FMADD/FMSUB instructions and subsequent + FMADD/FMSUB instructions form a graph. This is because alternatives can + make a register be set by several FMUL or FMADD/FMSUB instructions in + different basic blocks and because of loops. For ease of browsing, the + connected components of this graph are broken up into forests of trees. + Forests are represented by fma_forest objects, contained in the fma_forests + list. Using a separate object for the forests allows for a better use of + memory as there is some information that is global to each forest, such as + the number of FMSUB and FMADD/FMSUB instructions currently scheduled on each + floating-point execution pipelines. */ + +class fma_forest +{ +public: + fma_forest (func_fma_steering *, fma_root_node *, int); + ~fma_forest (); + + int get_id (); + std::list *get_roots (); + func_fma_steering *get_globals (); + int get_target_parity (); + void fma_node_created (fma_node *); + void merge_forest (fma_forest *); + void dump_info (); + void dispatch (); + +private: + /* The list of roots that form this forest. */ + std::list *m_roots; + + /* Target parity the destination register of all FMUL and FMADD/FMSUB + instructions in this forest should have. */ + int m_target_parity; + + /* Link to the instance of func_fma_steering holding data related to the + FMA steering of the current function (cfun). */ + func_fma_steering *m_globals; + + /* Identifier for the forest (used for dumps). */ + int m_id; + + /* Total number of nodes in the forest (for statistics). */ + int m_nb_nodes; +}; + +class fma_node +{ +public: + fma_node (fma_node *parent, du_chain *chain); + ~fma_node (); + + bool root_p (); + fma_forest *get_forest (); + std::list *get_children (); + rtx_insn *get_insn (); + void add_child (fma_node *); + int get_parity (); + void set_head (du_head *); + void rename (fma_forest *); + void dump_info (fma_forest *); + +protected: + /* Root node that lead to this node. */ + fma_root_node *m_root; + + /* The parent node of this node. If the node belong to a chain with several + parent nodes, the first one encountered in a depth-first search is chosen + as canonical parent. */ + fma_node *m_parent; + + /* The list of child nodes. If a chain contains several parent nodes, one is + chosen as canonical parent and the others will have no children. */ + std::list *m_children; + + /* The associated DU_HEAD chain that the insn represented by this object + is (one of) the root of. When a chain contains several roots, the non + canonical ones have this field set to NULL. */ + struct du_head *m_head; + + /* The FMUL or FMADD/FMSUB instruction this object corresponds to. */ + rtx_insn *m_insn; +}; + +class fma_root_node : public fma_node +{ +public: + fma_root_node (func_fma_steering *, du_chain *, int); + + fma_forest *get_forest (); + void set_forest (fma_forest *); + void dump_info (fma_forest *); + +private: + /* The forest this node belonged to when it was created. */ + fma_forest *m_forest; +}; + +/* Class holding all data and methods relative to the FMA steering of a given + function. The FMA steering pass could then run in parallel for different + functions. */ + +class func_fma_steering +{ +public: + func_fma_steering (); + ~func_fma_steering (); + + int get_fpu_balance (); + void remove_forest (fma_forest *); + bool put_node (fma_node *); + void update_balance (int); + fma_node *get_fma_node (rtx_insn *); + void analyze_fma_fmul_insn (fma_forest *, du_chain *, du_head_p); + void execute_fma_steering (); + +private: + void dfs (void (*) (fma_forest *), void (*) (fma_forest *, fma_root_node *), + void (*) (fma_forest *, fma_node *), bool); + void analyze (); + void rename_fma_trees (); + + /* Mapping between FMUL or FMADD/FMSUB instructions and the associated + fma_node object. Used when analyzing an instruction that is a root of + a chain to find if such an object was created because this instruction + is also a use in another chain. */ + hash_map *m_insn_fma_head_map; + + /* A list of all the forests in a given function. */ + std::list m_fma_forests; + + /* Balance of FMUL and FMADD/FMSUB instructions between the two FPU + pipelines: + < 0: more instruction dispatched to the first pipeline + == 0: perfect balance + > 0: more instruction dispatched to the second pipeline. */ + int m_fpu_balance; + + /* Identifier for the next forest created. */ + int m_next_forest_id; +}; + +/* Rename the register HEAD->regno in all the insns in the chain HEAD to any + register not in the set UNAVAILABLE. Adapted from rename_chains in + regrename.c. */ + +static bool +rename_single_chain (du_head_p head, HARD_REG_SET *unavailable) +{ + int best_new_reg; + int n_uses = 0; + struct du_chain *tmp; + int reg = head->regno; + enum reg_class super_class = NO_REGS; + + if (head->cannot_rename) + return false; + + if (fixed_regs[reg] || global_regs[reg] + || (frame_pointer_needed && reg == HARD_FRAME_POINTER_REGNUM)) + return false; + + /* Iterate over elements in the chain in order to: + 1. Count number of uses, and narrow the set of registers we can + use for renaming. + 2. Compute the superunion of register classes in this chain. */ + for (tmp = head->first; tmp; tmp = tmp->next_use) + { + if (DEBUG_INSN_P (tmp->insn)) + continue; + n_uses++; + IOR_COMPL_HARD_REG_SET (*unavailable, reg_class_contents[tmp->cl]); + super_class = reg_class_superunion[(int) super_class][(int) tmp->cl]; + } + + if (n_uses < 1) + return false; + + best_new_reg = find_rename_reg (head, super_class, unavailable, reg, + false); + + if (dump_file) + { + fprintf (dump_file, "Register %s in insn %d", reg_names[reg], + INSN_UID (head->first->insn)); + if (head->need_caller_save_reg) + fprintf (dump_file, " crosses a call"); + } + + if (best_new_reg == reg) + { + if (dump_file) + fprintf (dump_file, "; no available better choice\n"); + return false; + } + + if (dump_file) + fprintf (dump_file, ", renamed as %s\n", reg_names[best_new_reg]); + + regrename_do_replace (head, best_new_reg); + df_set_regs_ever_live (best_new_reg, true); + return true; +} + +/* Return whether T is the attribute of a FMADD/FMSUB-like instruction. */ + +static bool +is_fmac_op (enum attr_type t) +{ + return (t == TYPE_FMACS) || (t == TYPE_FMACD) || (t == TYPE_NEON_FP_MLA_S); +} + +/* Return whether T is the attribute of a FMUL instruction. */ + +static bool +is_fmul_op (enum attr_type t) +{ + return (t == TYPE_FMULS) || (t == TYPE_FMULD) || (t == TYPE_NEON_FP_MUL_S); +} + +/* Return whether INSN is an FMUL (if FMUL_OK is true) or FMADD/FMSUB + instruction. */ + +static bool +is_fmul_fmac_insn (rtx_insn *insn, bool fmul_ok) +{ + enum attr_type t; + + if (!NONDEBUG_INSN_P (insn)) + return false; + + if (recog_memoized (insn) < 0) + return false; + + /* Only consider chain(s) this instruction is a root of if this is an FMUL or + FMADD/FMSUB instruction. This allows to avoid browsing chains of all + instructions for FMUL or FMADD/FMSUB in them. */ + t = get_attr_type (insn); + return is_fmac_op (t) || (fmul_ok && is_fmul_op (t)); +} + + +/* + * Class fma_forest method definitions. + */ + +fma_forest::fma_forest (func_fma_steering *fma_steer, fma_root_node *fma_root, + int id) +{ + memset (this, 0, sizeof (*this)); + this->m_globals = fma_steer; + this->m_roots = new std::list; + this->m_roots->push_back (fma_root); + this->m_id = id; +} + +fma_forest::~fma_forest () +{ + delete this->m_roots; +} + +int +fma_forest::get_id () +{ + return this->m_id; +} + +std::list * +fma_forest::get_roots () +{ + return this->m_roots; +} + +func_fma_steering * +fma_forest::get_globals () +{ + return this->m_globals; +} + +int +fma_forest::get_target_parity () +{ + return this->m_target_parity; +} + +/* Act on the creation of NODE by updating statistics in FOREST and adding an + entry for it in the func_fma_steering hashmap. */ + +void fma_forest::fma_node_created (fma_node *node) +{ + bool created = !this->m_globals->put_node (node); + + gcc_assert (created); + this->m_nb_nodes++; +} + +/* Merge REF_FOREST and OTHER_FOREST together, making REF_FOREST the canonical + fma_forest object to represent both. */ + +void +fma_forest::merge_forest (fma_forest *other_forest) +{ + std::list *other_roots; + std::list::iterator other_root_iter; + + if (this == other_forest) + return; + + other_roots = other_forest->m_roots; + + /* Update root nodes' pointer to forest. */ + for (other_root_iter = other_roots->begin (); + other_root_iter != other_roots->end (); other_root_iter++) + (*other_root_iter)->set_forest (this); + + /* Remove other_forest from the list of forests and move its tree roots in + the list of tree roots of ref_forest. */ + this->m_globals->remove_forest (other_forest); + this->m_roots->splice (this->m_roots->begin (), *other_roots); + delete other_forest; + + this->m_nb_nodes += other_forest->m_nb_nodes; +} + +/* Dump information about the forest FOREST. */ + +void +fma_forest::dump_info () +{ + gcc_assert (dump_file); + + fprintf (dump_file, "Forest #%d has %d nodes\n", this->m_id, + this->m_nb_nodes); +} + +/* Wrapper around fma_forest::dump_info for use as parameter of function + pointer type in func_fma_steering::dfs. */ + +static void +dump_forest_info (fma_forest *forest) +{ + forest->dump_info (); +} + +/* Dispatch forest to the least utilized pipeline. */ + +void +fma_forest::dispatch () +{ + this->m_target_parity = this->m_roots->front ()->get_parity (); + int fpu_balance = this->m_globals->get_fpu_balance (); + if (fpu_balance != 0) + this->m_target_parity = (fpu_balance < 0); + + if (dump_file) + fprintf (dump_file, "Target parity for forest #%d: %s\n", this->m_id, + this->m_target_parity ? "odd" : "even"); +} + +/* Wrapper around fma_forest::dispatch for use as parameter of function pointer + type in func_fma_steering::dfs. */ + +static void +dispatch_forest (fma_forest *forest) +{ + forest->dispatch (); +} + +fma_node::fma_node (fma_node *parent, du_chain *chain) +{ + memset (this, 0, sizeof (*this)); + this->m_parent = parent; + this->m_children = new std::list; + this->m_insn = chain->insn; + /* root_p () cannot be used to check for root before root is set. */ + if (this->m_parent == this) + this->m_root = static_cast (parent); + else + { + this->m_root = parent->m_root; + this->get_forest ()->fma_node_created (this); + } +} + +fma_node::~fma_node () +{ + delete this->m_children; +} + +std::list * +fma_node::get_children () +{ + return this->m_children; +} + +rtx_insn * +fma_node::get_insn () +{ + return this->m_insn; +} + +void +fma_node::set_head (du_head *head) +{ + gcc_assert (!this->m_head); + this->m_head = head; +} + +/* Add a child to this node in the list of children. */ + +void +fma_node::add_child (fma_node *child) +{ + this->m_children->push_back (child); +} + +/* Return the parity of the destination register of the instruction represented + by this node. */ + +int +fma_node::get_parity () +{ + return this->m_head->regno % 2; +} + +/* Get the actual forest associated with a non root node as the one the node + points to might have been merged into another one. In that case the pointer + in the root nodes are updated so we return the forest pointer of a root node + pointed to by the initial forest. Despite being a oneliner, this method is + defined here as it references a method from fma_root_node. */ + +fma_forest * +fma_node::get_forest () +{ + return this->m_root->get_forest (); +} + +/* Return whether a node is a root node. */ + +bool +fma_node::root_p () +{ + return this->m_root == this; +} + +/* Dump information about the children of node FMA_NODE in forest FOREST. */ + +void +fma_node::dump_info (ATTRIBUTE_UNUSED fma_forest *forest) +{ + struct du_chain *chain; + std::list::iterator fma_child; + + gcc_assert (dump_file); + + if (this->get_children ()->empty ()) + return; + + fprintf (dump_file, "Instruction(s)"); + for (chain = this->m_head->first; chain; chain = chain->next_use) + { + if (!is_fmul_fmac_insn (chain->insn, true)) + continue; + + if (chain->loc != &SET_DEST (PATTERN (chain->insn))) + continue; + + fprintf (dump_file, " %d", INSN_UID (chain->insn)); + } + + fprintf (dump_file, " is(are) accumulator dependency of instructions"); + for (fma_child = this->get_children ()->begin (); + fma_child != this->get_children ()->end (); fma_child++) + fprintf (dump_file, " %d", INSN_UID ((*fma_child)->m_insn)); + fprintf (dump_file, "\n"); +} + +/* Wrapper around fma_node::dump_info for use as parameter of function pointer + type in func_fma_steering::dfs. */ + +static void +dump_tree_node_info (fma_forest *forest, fma_node *node) +{ + node->dump_info (forest); +} + +/* Rename the destination register of a single FMUL or FMADD/FMSUB instruction + represented by FMA_NODE to a register that respect the target parity for + FOREST or with same parity of the instruction represented by its parent node + if it has one. */ + +void +fma_node::rename (fma_forest *forest) +{ + int cur_parity, target_parity; + + /* This is alternate root of a chain and thus has no children. It will be + renamed when processing the canonical root for that chain. */ + if (!this->m_head) + return; + + target_parity = forest->get_target_parity (); + if (this->m_parent) + target_parity = this->m_parent->get_parity (); + cur_parity = this->get_parity (); + + /* Rename if parity differs. */ + if (cur_parity != target_parity) + { + rtx_insn *insn = this->m_insn; + HARD_REG_SET unavailable; + enum machine_mode mode; + int reg; + + if (dump_file) + { + unsigned cur_dest_reg = this->m_head->regno; + + fprintf (dump_file, "FMA or FMUL at insn %d but destination " + "register (%s) has different parity from expected to " + "maximize FPU pipeline utilization\n", INSN_UID (insn), + reg_names[cur_dest_reg]); + } + + /* Don't clobber traceback for noreturn functions. */ + CLEAR_HARD_REG_SET (unavailable); + if (frame_pointer_needed) + { + add_to_hard_reg_set (&unavailable, Pmode, FRAME_POINTER_REGNUM); + add_to_hard_reg_set (&unavailable, Pmode, HARD_FRAME_POINTER_REGNUM); + } + + /* Exclude registers with wrong parity. */ + mode = GET_MODE (SET_DEST (PATTERN (insn))); + for (reg = cur_parity; reg < FIRST_PSEUDO_REGISTER; reg += 2) + add_to_hard_reg_set (&unavailable, mode, reg); + + if (!rename_single_chain (this->m_head, &unavailable)) + { + if (dump_file) + fprintf (dump_file, "Destination register of insn %d could not be " + "renamed. Dependent FMA insns will use this parity from " + "there on.\n", INSN_UID (insn)); + } + else + cur_parity = target_parity; + } + + forest->get_globals ()->update_balance (cur_parity); +} + +/* Wrapper around fma_node::dump_info for use as parameter of function pointer + type in func_fma_steering::dfs. */ + +static void +rename_fma_node (fma_forest *forest, fma_node *node) +{ + node->rename (forest); +} + +fma_root_node::fma_root_node (func_fma_steering *globals, du_chain *chain, + int id) : fma_node (this, chain) +{ + this->m_forest = new fma_forest (globals, this, id); + this->m_forest->fma_node_created (this); +} + +fma_forest * +fma_root_node::get_forest () +{ + return this->m_forest; +} + +void +fma_root_node::set_forest (fma_forest *ref_forest) +{ + this->m_forest = ref_forest; +} + +/* Dump information about the roots of forest FOREST. */ + +void +fma_root_node::dump_info (fma_forest *forest) +{ + gcc_assert (dump_file); + + if (this == forest->get_roots ()->front ()) + fprintf (dump_file, "Instruction(s) at root of forest #%d:", + forest->get_id ()); + fprintf (dump_file, " %d", INSN_UID (this->m_insn)); + if (this == forest->get_roots ()->back ()) + fprintf (dump_file, "\n"); +} + +/* Wrapper around fma_root_node::dump_info for use as parameter of function + pointer type in func_fma_steering::dfs. */ + +static void +dump_tree_root_info (fma_forest *forest, fma_root_node *node) +{ + node->dump_info (forest); +} + +func_fma_steering::func_fma_steering () : m_fpu_balance (0) +{ + this->m_insn_fma_head_map = new hash_map; + this->m_fma_forests.clear (); + this->m_next_forest_id = 0; +} + +func_fma_steering::~func_fma_steering () +{ + delete this->m_insn_fma_head_map; +} + +int +func_fma_steering::get_fpu_balance () +{ + return this->m_fpu_balance; +} + +void +func_fma_steering::remove_forest (fma_forest *forest) +{ + this->m_fma_forests.remove (forest); +} + +/* Memorize the mapping of this instruction to its fma_node object and return + whether such a mapping existed. */ + +bool +func_fma_steering::put_node (fma_node *node) +{ + return this->m_insn_fma_head_map->put (node->get_insn (), node); +} + +/* Update the current balance considering a node with the given PARITY. */ + +void +func_fma_steering::update_balance (int parity) +{ + this->m_fpu_balance = parity ? this->m_fpu_balance + 1 + : this->m_fpu_balance - 1; +} + +/* Return whether an fma_node object exists for instruction INSN and, if not, + allocate one in *RET. */ + +fma_node * +func_fma_steering::get_fma_node (rtx_insn *insn) +{ + fma_node **fma_slot; + + fma_slot = this->m_insn_fma_head_map->get (insn); + if (fma_slot) + return *fma_slot; + return NULL; +} + +/* Allocate and initialize fma_node objects for the FMUL or FMADD/FMSUB + instruction in CHAIN->insn and its dependent FMADD/FMSUB instructions, all + part of FOREST. For the children, the associated head is left untouched + (and thus null) as this function will be called again when considering the + chain where they are def. For the parent, the chain is given in HEAD. */ + +void +func_fma_steering::analyze_fma_fmul_insn (fma_forest *ref_forest, + du_chain *chain, du_head_p head) +{ + fma_forest *forest; + fma_node *node = this->get_fma_node (chain->insn); + + /* This is a root node. */ + if (!node) + { + fma_root_node *root_node; + + root_node = new fma_root_node (this, chain, this->m_next_forest_id++); + forest = root_node->get_forest (); + node = root_node; + + /* Until proved otherwise, assume this root is not part of an existing + forest and thus add its forest to the list of forests. */ + this->m_fma_forests.push_back (forest); + } + else + forest = node->get_forest (); + + node->set_head (head); + + /* fma_node is part of a chain with several defs, one of them having already + been processed. The root of that already processed def is the canonical + one and the root of fma_node is added to its forest. No need to process + the children nodes as they were already processed when the other def was + processed. */ + if (ref_forest) + { + ref_forest->merge_forest (forest); + return; + } + + for (chain = head->first; chain; chain = chain->next_use) + { + fma_node *child_fma; + rtx fma_rtx, *accum_rtx_p; + + if (!is_fmul_fmac_insn (chain->insn, false)) + continue; + + /* Get FMA rtx. */ + fma_rtx = SET_SRC (PATTERN (chain->insn)); + /* FMA is negated. */ + if (GET_CODE (fma_rtx) == NEG) + fma_rtx = XEXP (fma_rtx, 0); + /* Get accumulator rtx. */ + accum_rtx_p = &XEXP (fma_rtx, 2); + /* Accumulator is negated. */ + if (!REG_P (*accum_rtx_p)) + accum_rtx_p = &XEXP (*accum_rtx_p, 0); + + /* This du_chain structure is not for the accumulator register. */ + if (accum_rtx_p != chain->loc) + continue; + + /* If object already created, this is a loop carried dependency. We + don't include this object in the children as we want trees for + rename_fma_trees to not be an infinite loop. */ + if (this->get_fma_node (chain->insn)) + continue; + + child_fma = new fma_node (node, chain); + + /* Memorize the mapping of this instruction to its fma_node object + as it will be processed for the chain starting at its destination + register later. */ + + /* Link to siblings. */ + node->add_child (child_fma); + } +} + +/* Perform a depth-first search of the forests of fma_node in + THIS->m_fma_forests, calling PROCESS_FOREST () on each fma_forest object in + THIS->m_fma_forests list, PROCESS_ROOT () on each tree root and + PROCESS_NODE () on each node. If FREE is true, free all std::list in the + same dfs. */ + +void +func_fma_steering::dfs (void (*process_forest) (fma_forest *), + void (*process_root) (fma_forest *, fma_root_node *), + void (*process_node) (fma_forest *, fma_node *), + bool free) +{ + vec to_process; + std::list::iterator forest_iter; + + to_process.create (0); + + /* For each forest. */ + for (forest_iter = this->m_fma_forests.begin (); + forest_iter != this->m_fma_forests.end (); forest_iter++) + { + std::list::iterator root_iter; + + if (process_forest) + process_forest (*forest_iter); + + /* For each tree root in this forest. */ + for (root_iter = (*forest_iter)->get_roots ()->begin (); + root_iter != (*forest_iter)->get_roots ()->end (); root_iter++) + { + if (process_root) + process_root (*forest_iter, *root_iter); + to_process.safe_push (*root_iter); + } + + /* For each tree node in this forest. */ + while (!to_process.is_empty ()) + { + fma_node *node; + std::list::iterator child_iter; + + node = to_process.pop (); + + if (process_node) + process_node (*forest_iter, node); + + /* Absence of children might indicate an alternate root of a *chain*. + It's ok to skip it here as the chain will be renamed when + processing the canonical root for that chain. */ + if (node->get_children ()->empty ()) + continue; + + for (child_iter = node->get_children ()->begin (); + child_iter != node->get_children ()->end (); child_iter++) + to_process.safe_push (*child_iter); + if (free) + { + if (node->root_p ()) + delete static_cast (node); + else + delete node; + } + } + if (free) + delete *forest_iter; + } + + to_process.release (); +} + +/* Build the dependency trees of FMUL and FMADD/FMSUB instructions. */ + +void +func_fma_steering::analyze () +{ + int i, n_blocks, *bb_dfs_preorder; + basic_block bb; + rtx_insn *insn; + + bb_dfs_preorder = XNEWVEC (int, last_basic_block_for_fn (cfun)); + n_blocks = pre_and_rev_post_order_compute (bb_dfs_preorder, NULL, false); + + /* Browse the graph of basic blocks looking for FMUL or FMADD/FMSUB + instructions. */ + for (i = 0; i < n_blocks; i++) + { + bb = BASIC_BLOCK_FOR_FN (cfun, bb_dfs_preorder[i]); + FOR_BB_INSNS (bb, insn) + { + operand_rr_info *dest_op_info; + struct du_chain *chain; + unsigned dest_regno; + fma_forest *forest; + du_head_p head; + int i; + + if (!is_fmul_fmac_insn (insn, true)) + continue; + + /* Search the chain where this instruction is (one of) the root. */ + dest_op_info = insn_rr[INSN_UID (insn)].op_info; + dest_regno = REGNO (SET_DEST (PATTERN (insn))); + for (i = 0; i < dest_op_info->n_chains; i++) + { + /* The register tracked by this chain does not match the + destination register of insn. */ + if (dest_op_info->heads[i]->regno != dest_regno) + continue; + + head = dest_op_info->heads[i]; + /* The chain was merged in another, find the new head. */ + if (!head->first) + head = regrename_chain_from_id (head->id); + + /* Search the chain element for this instruction and, if another + FMUL or FMADD/FMSUB instruction was already processed, note + the forest of its tree. */ + forest = NULL; + for (chain = head->first; chain; chain = chain->next_use) + { + fma_node **fma_slot; + + if (!is_fmul_fmac_insn (chain->insn, true)) + continue; + + /* This is a use, continue. */ + if (chain->loc != &SET_DEST (PATTERN (chain->insn))) + continue; + + if (chain->insn == insn) + break; + + fma_slot = this->m_insn_fma_head_map->get (chain->insn); + if (fma_slot && (*fma_slot)->get_children ()) + forest = (*fma_slot)->get_forest (); + } + if (chain) + break; + } + + /* We didn't find a chain with a def for this instruction. */ + gcc_assert (i < dest_op_info->n_chains); + + this->analyze_fma_fmul_insn (forest, chain, head); + } + } + free (bb_dfs_preorder); + + if (dump_file) + this->dfs (dump_forest_info, dump_tree_root_info, dump_tree_node_info, + false); +} + +/* Perform the renaming of all chains with FMUL or FMADD/FMSUB involved with + the objective of keeping FPU pipeline balanced in term of instructions and + having FMADD/FMSUB with dependencies on previous FMUL or FMADD/FMSUB be + scheduled on the same pipeline. */ + +void +func_fma_steering::rename_fma_trees () +{ + this->dfs (dispatch_forest, NULL, rename_fma_node, true); + + if (dump_file && !this->m_fma_forests.empty ()) + { + fprintf (dump_file, "Function %s has ", current_function_name ()); + if (this->m_fpu_balance == 0) + fprintf (dump_file, "perfect balance of FMUL/FMA chains between the " + "two FPU pipelines\n"); + else if (this->m_fpu_balance > 0) + fprintf (dump_file, "%d more FMUL/FMA chains scheduled on the second " + "FPU pipeline\n", this->m_fpu_balance); + else /* this->m_fpu_balance < 0 */ + fprintf (dump_file, "%d more FMUL/FMA chains scheduled on the first " + "FPU pipeline\n", - this->m_fpu_balance); + } +} + +/* Execute FMA steering pass. */ + +void +func_fma_steering::execute_fma_steering () +{ + df_set_flags (DF_LR_RUN_DCE); + df_note_add_problem (); + df_analyze (); + df_set_flags (DF_DEFER_INSN_RESCAN); + + regrename_init (true); + regrename_analyze (NULL); + this->analyze (); + this->rename_fma_trees (); + regrename_finish (); +} + +const pass_data pass_data_fma_steering = +{ + RTL_PASS, /* type */ + "fma_steering", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_NONE, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_df_finish, /* todo_flags_finish */ +}; + +class pass_fma_steering : public rtl_opt_pass +{ +public: + pass_fma_steering (gcc::context *ctxt) + : rtl_opt_pass (pass_data_fma_steering, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) + { + return optimize >= 2; + } + + virtual unsigned int execute (function *) + { + func_fma_steering *fma_steering = new func_fma_steering; + fma_steering->execute_fma_steering (); + delete fma_steering; + return 0; + } + +}; // class pass_fma_steering + +/* Create a new fma steering pass instance. */ + +static rtl_opt_pass * +make_pass_fma_steering (gcc::context *ctxt) +{ + return new pass_fma_steering (ctxt); +} + +/* Register the FMA steering pass to the pass manager. */ + +void +aarch64_register_fma_steering () +{ + opt_pass *pass_fma_steering = make_pass_fma_steering (g); + + static struct register_pass_info fma_steering_info + = { pass_fma_steering, "rnreg", 1, PASS_POS_INSERT_AFTER }; + + register_pass (&fma_steering_info); +} diff --git a/gcc/config/aarch64/cortex-a57-fma-steering.h b/gcc/config/aarch64/cortex-a57-fma-steering.h new file mode 100644 index 00000000000..e8915acb423 --- /dev/null +++ b/gcc/config/aarch64/cortex-a57-fma-steering.h @@ -0,0 +1,22 @@ +/* This file contains declarations for the FMA steering optimization + pass for Cortex-A57. + Copyright (C) 2015 Free Software Foundation, Inc. + Contributed by ARM Ltd. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +void aarch64_register_fma_steering (void); diff --git a/gcc/config/aarch64/t-aarch64 b/gcc/config/aarch64/t-aarch64 index e410bc919eb..0371203f0a3 100644 --- a/gcc/config/aarch64/t-aarch64 +++ b/gcc/config/aarch64/t-aarch64 @@ -48,6 +48,15 @@ aarch-common.o: $(srcdir)/config/arm/aarch-common.c $(CONFIG_H) $(SYSTEM_H) \ $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ $(srcdir)/config/arm/aarch-common.c +cortex-a57-fma-steering.o: $(srcdir)/config/aarch64/cortex-a57-fma-steering.c \ + $(CONFIG_H) $(SYSTEM_H) $(TM_H) $(REGS_H) insn-config.h $(RTL_BASE_H) \ + dominance.h cfg.h cfganal.h $(BASIC_BLOCK_H) $(INSN_ATTR_H) $(RECOG_H) \ + output.h hash-map.h $(DF_H) $(OBSTACK_H) $(TARGET_H) $(RTL_H) \ + $(CONTEXT_H) $(TREE_PASS_H) regrename.h \ + $(srcdir)/config/aarch64/cortex-a57-fma-steering.h + $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \ + $(srcdir)/config/aarch64/cortex-a57-fma-steering.c + comma=, MULTILIB_OPTIONS = $(subst $(comma),/, $(patsubst %, mabi=%, $(subst $(comma),$(comma)mabi=,$(TM_MULTILIB_CONFIG)))) MULTILIB_DIRNAMES = $(subst $(comma), ,$(TM_MULTILIB_CONFIG))