+2017-07-06  Aaron Sawdey  <acsawdey@linux.vnet.ibm.com>
+
+ * config/rs6000/rs6000.c (union_defs, union_uses, insn_is_load_p,
+ insn_is_store_p, insn_is_swap_p, const_load_sequence_p, v2df_reduction_p,
+ rtx_is_swappable_p, insn_is_swappable_p, chain_contains_only_swaps,
+ mark_swaps_for_removal, swap_const_vector_halves, adjust_subreg_index,
+ permute_load, permute_store, adjust_extract, adjust_splat,
+ adjust_xxpermdi, adjust_concat, adjust_vperm, handle_special_swappables,
+ replace_swap_with_copy, dump_swap_insn_table,
+ alignment_with_canonical_addr, alignment_mask, find_alignment_op,
+ recombine_lvx_pattern, recombine_stvx_pattern,
+ recombine_lvx_stvx_patterns, rs6000_analyze_swaps,
+ make_pass_analyze_swaps): Move all code related to p8 swap optimizations
+ to file rs6000-p8swap.c.
+ * config/rs6000/rs6000-p8swap.c: New file.
+ * config/rs6000/t-rs6000: Add rule to build rs6000-p8swap.o.
+ * config.gcc: Add rs6000-p8swap.o to extra_objs for powerpc*-*-*
+ and rs6000*-*-* targets.
+
2017-07-06  David Malcolm  <dmalcolm@redhat.com>

* Makefile.in (selftest): Remove dependency on s-selftest-c++.
;;
powerpc*-*-*)
cpu_type=rs6000
- extra_objs="rs6000-string.o"
+ extra_objs="rs6000-string.o rs6000-p8swap.o"
extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h x86intrin.h"
extra_headers="${extra_headers} ppu_intrinsics.h spu2vmx.h vec_types.h si2vmx.h"
;;
rs6000*-*-*)
extra_options="${extra_options} g.opt fused-madd.opt rs6000/rs6000-tables.opt"
- extra_objs="rs6000-string.o"
+ extra_objs="rs6000-string.o rs6000-p8swap.o"
;;
sparc*-*-*)
cpu_type=sparc
--- /dev/null
+++ b/gcc/config/rs6000/rs6000-p8swap.c
+/* Subroutines used to remove unnecessary doubleword swaps
+ for p8 little-endian VSX code.
+ Copyright (C) 1991-2017 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3, or (at your
+ option) any later version.
+
+ GCC is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+ License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "tree.h"
+#include "memmodel.h"
+#include "df.h"
+#include "tm_p.h"
+#include "ira.h"
+#include "print-tree.h"
+#include "varasm.h"
+#include "explow.h"
+#include "expr.h"
+#include "output.h"
+#include "tree-pass.h"
+
+/* Analyze vector computations and remove unnecessary doubleword
+ swaps (xxswapdi instructions). This pass is performed only
+ for little-endian VSX code generation.
+
+ For this specific case, loads and stores of 4x32 and 2x64 vectors
+ are inefficient. These are implemented using the lxvd2x and
+ stxvd2x instructions, which invert the order of doublewords in
+ a vector register. Thus the code generation inserts an xxswapdi
+ after each such load, and prior to each such store. (For spill
+ code after register assignment, an additional xxswapdi is inserted
+ following each store in order to return a hard register to its
+ unpermuted value.)
+
+ The extra xxswapdi instructions reduce performance. This can be
+ particularly bad for vectorized code. The purpose of this pass
+ is to reduce the number of xxswapdi instructions required for
+ correctness.
+
+ The primary insight is that much code that operates on vectors
+ does not care about the relative order of elements in a register,
+ so long as the correct memory order is preserved. If we have
+ a computation where all input values are provided by lxvd2x/xxswapdi
+ sequences, all outputs are stored using xxswapdi/stxvd2x sequences,
+ and all intermediate computations are pure SIMD (independent of
+ element order), then all the xxswapdi's associated with the loads
+ and stores may be removed.
+
+ This pass uses some of the infrastructure and logical ideas from
+ the "web" pass in web.c. We create maximal webs of computations
+ fitting the description above using union-find. Each such web is
+ then optimized by removing its unnecessary xxswapdi instructions.
+
+ The pass is placed prior to global optimization so that we can
+ perform the optimization in the safest and simplest way possible;
+ that is, by replacing each xxswapdi insn with a register copy insn.
+ Subsequent forward propagation will remove copies where possible.
+
+ There are some operations sensitive to element order for which we
+ can still allow the operation, provided we modify those operations.
+ These include CONST_VECTORs, for which we must swap the first and
+ second halves of the constant vector; and SUBREGs, for which we
+ must adjust the byte offset to account for the swapped doublewords.
+ A remaining opportunity would be non-immediate-form splats, for
+ which we should adjust the selected lane of the input. We should
+ also make code generation adjustments for sum-across operations,
+ since this is a common vectorizer reduction.
+
+ Because we run prior to the first split, we can see loads and stores
+ here that match *vsx_le_perm_{load,store}_<mode>. These are vanilla
+ vector loads and stores that have not yet been split into a permuting
+ load/store and a swap. (One way this can happen is with a builtin
+ call to vec_vsx_{ld,st}.) We can handle these as well, but rather
+ than deleting a swap, we convert the load/store into a permuting
+ load/store (which effectively removes the swap). */
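+
+/* As a concrete illustration (RTL simplified; register numbers are
+ arbitrary), a little-endian V4SI load on p8 expands to a permuting
+ load followed by a register swap:
+
+ (set (reg:V4SI 100) ;; lxvd2x
+ (vec_select:V4SI (mem:V4SI (reg:DI 3))
+ (parallel [(const_int 2) (const_int 3)
+ (const_int 0) (const_int 1)])))
+ (set (reg:V4SI 101) ;; xxswapdi
+ (vec_select:V4SI (reg:V4SI 100)
+ (parallel [(const_int 2) (const_int 3)
+ (const_int 0) (const_int 1)])))
+
+ Stores are symmetric: an xxswapdi feeds a permuting stxvd2x. When
+ every computation in the web is element-order independent, each such
+ xxswapdi is replaced by a simple register copy. */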
+
+/* Notes on Permutes
+
+ We do not currently handle computations that contain permutes. There
+ is a general transformation that can be performed correctly, but it
+ may introduce more expensive code than it replaces. To handle these
+ would require a cost model to determine when to perform the optimization.
+ This commentary records how this could be done if desired.
+
+ The most general permute is something like this (example for V16QI):
+
+ (vec_select:V16QI (vec_concat:V32QI (op1:V16QI) (op2:V16QI))
+ (parallel [(const_int a0) (const_int a1)
+ ...
+ (const_int a14) (const_int a15)]))
+
+ where a0,...,a15 are in [0,31] and select elements from op1 and op2
+ to produce the result.
+
+ Regardless of mode, we can convert the PARALLEL to a mask of 16
+ byte-element selectors. Let's call this M, with M[i] representing
+ the ith byte-element selector value. Then if we swap doublewords
+ throughout the computation, we can get correct behavior by replacing
+ M with M' as follows:
+
+ M'[i] = { (M[i]+8)%16 : M[i] in [0,15]
+ { ((M[i]+8)%16)+16 : M[i] in [16,31]
+
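+ For example, under this mapping the byte selectors 3, 11, 19, and 27
+ become 11, 3, 27, and 19, respectively: each selector moves across
+ the doubleword boundary but stays within its original source operand.
+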
+ This seems promising at first, since we are just replacing one mask
+ with another. But certain masks are preferable to others. If M
+ is a mask that matches a vmrghh pattern, for example, M' certainly
+ will not. Instead of a single vmrghh, we would generate a load of
+ M' and a vperm. So we would need to know how many xxswapdi's we can
+ remove as a result of this transformation to determine if it's
+ profitable; and preferably the logic would need to be aware of all
+ the special preferable masks.
+
+ Another form of permute is an UNSPEC_VPERM, in which the mask is
+ already in a register. In some cases, this mask may be a constant
+ that we can discover with ud-chains, in which case the above
+ transformation is ok. However, the common usage here is for the
+ mask to be produced by an UNSPEC_LVSL, in which case the mask
+ cannot be known at compile time. In such a case we would have to
+ generate several instructions to compute M' as above at run time,
+ and a cost model is needed again.
+
+ However, when the mask M for an UNSPEC_VPERM is loaded from the
+ constant pool, we can replace M with M' as above at no cost
+ beyond adding a constant pool entry. */
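+
+/* A minimal sketch of the mask adjustment just described, assuming the
+ 16-byte mask is held in a plain array (the helper name is hypothetical;
+ adjust_vperm below performs this same computation on the CONST_VECTOR
+ loaded from the constant pool):
+
+ static void
+ swap_vperm_mask (unsigned char m[16])
+ {
+ for (int i = 0; i < 16; ++i)
+ m[i] = (m[i] < 16
+ ? (m[i] + 8) % 16
+ : ((m[i] + 8) % 16) + 16);
+ } */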
+
+/* This is based on the union-find logic in web.c. web_entry_base is
+ defined in df.h. */
+class swap_web_entry : public web_entry_base
+{
+ public:
+ /* Pointer to the insn. */
+ rtx_insn *insn;
+ /* Set if insn contains a mention of a vector register. All other
+ fields are undefined if this field is unset. */
+ unsigned int is_relevant : 1;
+ /* Set if insn is a load. */
+ unsigned int is_load : 1;
+ /* Set if insn is a store. */
+ unsigned int is_store : 1;
+ /* Set if insn is a doubleword swap. This can either be a register swap
+ or a permuting load or store (test is_load and is_store for this). */
+ unsigned int is_swap : 1;
+ /* Set if the insn has a live-in use of a parameter register. */
+ unsigned int is_live_in : 1;
+ /* Set if the insn has a live-out def of a return register. */
+ unsigned int is_live_out : 1;
+ /* Set if the insn contains a subreg reference of a vector register. */
+ unsigned int contains_subreg : 1;
+ /* Set if the insn contains a 128-bit integer operand. */
+ unsigned int is_128_int : 1;
+ /* Set if this is a call-insn. */
+ unsigned int is_call : 1;
+ /* Set if this insn does not perform a vector operation for which
+ element order matters, or if we know how to fix it up if it does.
+ Undefined if is_swap is set. */
+ unsigned int is_swappable : 1;
+ /* A nonzero value indicates what kind of special handling for this
+ insn is required if doublewords are swapped. Undefined if
+ is_swappable is not set. */
+ unsigned int special_handling : 4;
+ /* Set if the web represented by this entry cannot be optimized. */
+ unsigned int web_not_optimizable : 1;
+ /* Set if this insn should be deleted. */
+ unsigned int will_delete : 1;
+};
+
+enum special_handling_values {
+ SH_NONE = 0,
+ SH_CONST_VECTOR, /* Swap the halves of a constant vector. */
+ SH_SUBREG, /* Adjust the byte offset of a subreg. */
+ SH_NOSWAP_LD, /* Convert a load to a permuting load. */
+ SH_NOSWAP_ST, /* Convert a store to a permuting store. */
+ SH_EXTRACT, /* Change the lane of a vector extract. */
+ SH_SPLAT, /* Change the lane of a direct splat. */
+ SH_XXPERMDI, /* Adjust the lanes of an XXPERMDI. */
+ SH_CONCAT, /* Reverse the inputs of a concatenation. */
+ SH_VPERM /* Adjust a VPERM's constant-pool mask. */
+};
+
+/* Union INSN with all insns containing definitions that reach USE.
+ Detect whether USE is live-in to the current function. */
+static void
+union_defs (swap_web_entry *insn_entry, rtx insn, df_ref use)
+{
+ struct df_link *link = DF_REF_CHAIN (use);
+
+ if (!link)
+ insn_entry[INSN_UID (insn)].is_live_in = 1;
+
+ while (link)
+ {
+ if (DF_REF_IS_ARTIFICIAL (link->ref))
+ insn_entry[INSN_UID (insn)].is_live_in = 1;
+
+ if (DF_REF_INSN_INFO (link->ref))
+ {
+ rtx def_insn = DF_REF_INSN (link->ref);
+ (void)unionfind_union (insn_entry + INSN_UID (insn),
+ insn_entry + INSN_UID (def_insn));
+ }
+
+ link = link->next;
+ }
+}
+
+/* Union INSN with all insns containing uses reached from DEF.
+ Detect whether DEF is live-out from the current function. */
+static void
+union_uses (swap_web_entry *insn_entry, rtx insn, df_ref def)
+{
+ struct df_link *link = DF_REF_CHAIN (def);
+
+ if (!link)
+ insn_entry[INSN_UID (insn)].is_live_out = 1;
+
+ while (link)
+ {
+ /* This could be an eh use or some other artificial use;
+ we treat these all the same (killing the optimization). */
+ if (DF_REF_IS_ARTIFICIAL (link->ref))
+ insn_entry[INSN_UID (insn)].is_live_out = 1;
+
+ if (DF_REF_INSN_INFO (link->ref))
+ {
+ rtx use_insn = DF_REF_INSN (link->ref);
+ (void)unionfind_union (insn_entry + INSN_UID (insn),
+ insn_entry + INSN_UID (use_insn));
+ }
+
+ link = link->next;
+ }
+}
+
+/* Return 1 iff INSN is a load insn, including permuting loads that
+ represent an lxvd2x instruction; else return 0. */
+static unsigned int
+insn_is_load_p (rtx insn)
+{
+ rtx body = PATTERN (insn);
+
+ if (GET_CODE (body) == SET)
+ {
+ if (GET_CODE (SET_SRC (body)) == MEM)
+ return 1;
+
+ if (GET_CODE (SET_SRC (body)) == VEC_SELECT
+ && GET_CODE (XEXP (SET_SRC (body), 0)) == MEM)
+ return 1;
+
+ return 0;
+ }
+
+ if (GET_CODE (body) != PARALLEL)
+ return 0;
+
+ rtx set = XVECEXP (body, 0, 0);
+
+ if (GET_CODE (set) == SET && GET_CODE (SET_SRC (set)) == MEM)
+ return 1;
+
+ return 0;
+}
+
+/* Return 1 iff INSN is a store insn, including permuting stores that
+ represent an stxvd2x instruction; else return 0. */
+static unsigned int
+insn_is_store_p (rtx insn)
+{
+ rtx body = PATTERN (insn);
+ if (GET_CODE (body) == SET && GET_CODE (SET_DEST (body)) == MEM)
+ return 1;
+ if (GET_CODE (body) != PARALLEL)
+ return 0;
+ rtx set = XVECEXP (body, 0, 0);
+ if (GET_CODE (set) == SET && GET_CODE (SET_DEST (set)) == MEM)
+ return 1;
+ return 0;
+}
+
+/* Return 1 iff INSN swaps doublewords. This may be a reg-reg swap,
+ a permuting load, or a permuting store. */
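+/* For example (illustrative RTL; register numbers arbitrary), the
+ V2DI register-swap form recognized here is:
+ (set (reg:V2DI 101)
+ (vec_select:V2DI (reg:V2DI 100)
+ (parallel [(const_int 1) (const_int 0)]))) */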
+static unsigned int
+insn_is_swap_p (rtx insn)
+{
+ rtx body = PATTERN (insn);
+ if (GET_CODE (body) != SET)
+ return 0;
+ rtx rhs = SET_SRC (body);
+ if (GET_CODE (rhs) != VEC_SELECT)
+ return 0;
+ rtx parallel = XEXP (rhs, 1);
+ if (GET_CODE (parallel) != PARALLEL)
+ return 0;
+ unsigned int len = XVECLEN (parallel, 0);
+ if (len != 2 && len != 4 && len != 8 && len != 16)
+ return 0;
+ for (unsigned int i = 0; i < len / 2; ++i)
+ {
+ rtx op = XVECEXP (parallel, 0, i);
+ if (GET_CODE (op) != CONST_INT || INTVAL (op) != len / 2 + i)
+ return 0;
+ }
+ for (unsigned int i = len / 2; i < len; ++i)
+ {
+ rtx op = XVECEXP (parallel, 0, i);
+ if (GET_CODE (op) != CONST_INT || INTVAL (op) != i - len / 2)
+ return 0;
+ }
+ return 1;
+}
+
+/* Return TRUE if insn is a swap fed by a load from the constant pool. */
+static bool
+const_load_sequence_p (swap_web_entry *insn_entry, rtx insn)
+{
+ unsigned uid = INSN_UID (insn);
+ if (!insn_entry[uid].is_swap || insn_entry[uid].is_load)
+ return false;
+
+ const_rtx tocrel_base;
+
+ /* Find the unique use in the swap and locate its def. If the def
+ isn't unique, punt. */
+ struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+ df_ref use;
+ FOR_EACH_INSN_INFO_USE (use, insn_info)
+ {
+ struct df_link *def_link = DF_REF_CHAIN (use);
+ if (!def_link || def_link->next)
+ return false;
+
+ rtx def_insn = DF_REF_INSN (def_link->ref);
+ unsigned uid2 = INSN_UID (def_insn);
+ if (!insn_entry[uid2].is_load || !insn_entry[uid2].is_swap)
+ return false;
+
+ rtx body = PATTERN (def_insn);
+ if (GET_CODE (body) != SET
+ || GET_CODE (SET_SRC (body)) != VEC_SELECT
+ || GET_CODE (XEXP (SET_SRC (body), 0)) != MEM)
+ return false;
+
+ rtx mem = XEXP (SET_SRC (body), 0);
+ rtx base_reg = XEXP (mem, 0);
+
+ df_ref base_use;
+ insn_info = DF_INSN_INFO_GET (def_insn);
+ FOR_EACH_INSN_INFO_USE (base_use, insn_info)
+ {
+ if (!rtx_equal_p (DF_REF_REG (base_use), base_reg))
+ continue;
+
+ struct df_link *base_def_link = DF_REF_CHAIN (base_use);
+ if (!base_def_link || base_def_link->next)
+ return false;
+
+ rtx tocrel_insn = DF_REF_INSN (base_def_link->ref);
+ rtx tocrel_body = PATTERN (tocrel_insn);
+ rtx base, offset;
+ if (GET_CODE (tocrel_body) != SET)
+ return false;
+ /* There is an extra level of indirection for small/large
+ code models. */
+ rtx tocrel_expr = SET_SRC (tocrel_body);
+ if (GET_CODE (tocrel_expr) == MEM)
+ tocrel_expr = XEXP (tocrel_expr, 0);
+ if (!toc_relative_expr_p (tocrel_expr, false, &tocrel_base, NULL))
+ return false;
+ split_const (XVECEXP (tocrel_base, 0, 0), &base, &offset);
+ if (GET_CODE (base) != SYMBOL_REF || !CONSTANT_POOL_ADDRESS_P (base))
+ return false;
+ }
+ }
+ return true;
+}
+
+/* Return TRUE iff OP matches a V2DF reduction pattern. See the
+ definition of vsx_reduc_<VEC_reduc_name>_v2df in vsx.md. */
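+/* The matched shape (simplified from vsx.md) is:
+ (plus:V2DF
+ (vec_concat:V2DF
+ (vec_select:DF (reg:V2DF x) (parallel [(const_int 1)]))
+ (vec_select:DF (reg:V2DF x) (parallel [(const_int 0)])))
+ (reg:V2DF x))
+ with SMIN or SMAX in place of PLUS for min/max reductions. */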
+static bool
+v2df_reduction_p (rtx op)
+{
+ if (GET_MODE (op) != V2DFmode)
+ return false;
+
+ enum rtx_code code = GET_CODE (op);
+ if (code != PLUS && code != SMIN && code != SMAX)
+ return false;
+
+ rtx concat = XEXP (op, 0);
+ if (GET_CODE (concat) != VEC_CONCAT)
+ return false;
+
+ rtx select0 = XEXP (concat, 0);
+ rtx select1 = XEXP (concat, 1);
+ if (GET_CODE (select0) != VEC_SELECT || GET_CODE (select1) != VEC_SELECT)
+ return false;
+
+ rtx reg0 = XEXP (select0, 0);
+ rtx reg1 = XEXP (select1, 0);
+ if (!rtx_equal_p (reg0, reg1) || !REG_P (reg0))
+ return false;
+
+ rtx parallel0 = XEXP (select0, 1);
+ rtx parallel1 = XEXP (select1, 1);
+ if (GET_CODE (parallel0) != PARALLEL || GET_CODE (parallel1) != PARALLEL)
+ return false;
+
+ if (!rtx_equal_p (XVECEXP (parallel0, 0, 0), const1_rtx)
+ || !rtx_equal_p (XVECEXP (parallel1, 0, 0), const0_rtx))
+ return false;
+
+ return true;
+}
+
+/* Return 1 iff OP is an operand that will not be affected by having
+ vector doublewords swapped in memory. */
+static unsigned int
+rtx_is_swappable_p (rtx op, unsigned int *special)
+{
+ enum rtx_code code = GET_CODE (op);
+ int i, j;
+ rtx parallel;
+
+ switch (code)
+ {
+ case LABEL_REF:
+ case SYMBOL_REF:
+ case CLOBBER:
+ case REG:
+ return 1;
+
+ case VEC_CONCAT:
+ case ASM_INPUT:
+ case ASM_OPERANDS:
+ return 0;
+
+ case CONST_VECTOR:
+ {
+ *special = SH_CONST_VECTOR;
+ return 1;
+ }
+
+ case VEC_DUPLICATE:
+ /* Opportunity: If XEXP (op, 0) has the same mode as the result,
+ and XEXP (op, 1) is a PARALLEL with a single QImode const int,
+ it represents a vector splat for which we can do special
+ handling. */
+ if (GET_CODE (XEXP (op, 0)) == CONST_INT)
+ return 1;
+ else if (REG_P (XEXP (op, 0))
+ && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0)))
+ /* This catches V2DF and V2DI splat, at a minimum. */
+ return 1;
+ else if (GET_CODE (XEXP (op, 0)) == TRUNCATE
+ && REG_P (XEXP (XEXP (op, 0), 0))
+ && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0)))
+ /* This catches splat of a truncated value. */
+ return 1;
+ else if (GET_CODE (XEXP (op, 0)) == VEC_SELECT)
+ /* If the duplicated item is from a select, defer to the select
+ processing to see if we can change the lane for the splat. */
+ return rtx_is_swappable_p (XEXP (op, 0), special);
+ else
+ return 0;
+
+ case VEC_SELECT:
+ /* A vec_extract operation is ok if we change the lane. */
+ if (GET_CODE (XEXP (op, 0)) == REG
+ && GET_MODE_INNER (GET_MODE (XEXP (op, 0))) == GET_MODE (op)
+ && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL
+ && XVECLEN (parallel, 0) == 1
+ && GET_CODE (XVECEXP (parallel, 0, 0)) == CONST_INT)
+ {
+ *special = SH_EXTRACT;
+ return 1;
+ }
+ /* An XXPERMDI is ok if we adjust the lanes. Note that if the
+ XXPERMDI is a swap operation, it will be identified by
+ insn_is_swap_p and therefore we won't get here. */
+ else if (GET_CODE (XEXP (op, 0)) == VEC_CONCAT
+ && (GET_MODE (XEXP (op, 0)) == V4DFmode
+ || GET_MODE (XEXP (op, 0)) == V4DImode)
+ && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL
+ && XVECLEN (parallel, 0) == 2
+ && GET_CODE (XVECEXP (parallel, 0, 0)) == CONST_INT
+ && GET_CODE (XVECEXP (parallel, 0, 1)) == CONST_INT)
+ {
+ *special = SH_XXPERMDI;
+ return 1;
+ }
+ else if (v2df_reduction_p (op))
+ return 1;
+ else
+ return 0;
+
+ case UNSPEC:
+ {
+ /* Various operations are unsafe for this optimization, at least
+ without significant additional work. Permutes are obviously
+ problematic, as both the permute control vector and the ordering
+ of the target values are invalidated by doubleword swapping.
+ Vector pack and unpack modify the number of vector lanes.
+ Merge-high/low will not operate correctly on swapped operands.
+ Vector shifts across element boundaries are clearly uncool,
+ as are vector select and concatenate operations. Vector
+ sum-across instructions define one operand with a specific
+ order-dependent element, so additional fixup code would be
+ needed to make those work. Vector set and non-immediate-form
+ vector splat are element-order sensitive. A few of these
+ cases might be workable with special handling if required.
+ Adding cost modeling would be appropriate in some cases. */
+ int val = XINT (op, 1);
+ switch (val)
+ {
+ default:
+ break;
+ case UNSPEC_VMRGH_DIRECT:
+ case UNSPEC_VMRGL_DIRECT:
+ case UNSPEC_VPACK_SIGN_SIGN_SAT:
+ case UNSPEC_VPACK_SIGN_UNS_SAT:
+ case UNSPEC_VPACK_UNS_UNS_MOD:
+ case UNSPEC_VPACK_UNS_UNS_MOD_DIRECT:
+ case UNSPEC_VPACK_UNS_UNS_SAT:
+ case UNSPEC_VPERM:
+ case UNSPEC_VPERM_UNS:
+ case UNSPEC_VPERMHI:
+ case UNSPEC_VPERMSI:
+ case UNSPEC_VPKPX:
+ case UNSPEC_VSLDOI:
+ case UNSPEC_VSLO:
+ case UNSPEC_VSRO:
+ case UNSPEC_VSUM2SWS:
+ case UNSPEC_VSUM4S:
+ case UNSPEC_VSUM4UBS:
+ case UNSPEC_VSUMSWS:
+ case UNSPEC_VSUMSWS_DIRECT:
+ case UNSPEC_VSX_CONCAT:
+ case UNSPEC_VSX_SET:
+ case UNSPEC_VSX_SLDWI:
+ case UNSPEC_VUNPACK_HI_SIGN:
+ case UNSPEC_VUNPACK_HI_SIGN_DIRECT:
+ case UNSPEC_VUNPACK_LO_SIGN:
+ case UNSPEC_VUNPACK_LO_SIGN_DIRECT:
+ case UNSPEC_VUPKHPX:
+ case UNSPEC_VUPKHS_V4SF:
+ case UNSPEC_VUPKHU_V4SF:
+ case UNSPEC_VUPKLPX:
+ case UNSPEC_VUPKLS_V4SF:
+ case UNSPEC_VUPKLU_V4SF:
+ case UNSPEC_VSX_CVDPSPN:
+ case UNSPEC_VSX_CVSPDP:
+ case UNSPEC_VSX_CVSPDPN:
+ case UNSPEC_VSX_EXTRACT:
+ case UNSPEC_VSX_VSLO:
+ case UNSPEC_VSX_VEC_INIT:
+ return 0;
+ case UNSPEC_VSPLT_DIRECT:
+ case UNSPEC_VSX_XXSPLTD:
+ *special = SH_SPLAT;
+ return 1;
+ case UNSPEC_REDUC_PLUS:
+ case UNSPEC_REDUC:
+ return 1;
+ }
+ }
+
+ default:
+ break;
+ }
+
+ const char *fmt = GET_RTX_FORMAT (code);
+ int ok = 1;
+
+ for (i = 0; i < GET_RTX_LENGTH (code); ++i)
+ if (fmt[i] == 'e' || fmt[i] == 'u')
+ {
+ unsigned int special_op = SH_NONE;
+ ok &= rtx_is_swappable_p (XEXP (op, i), &special_op);
+ if (special_op == SH_NONE)
+ continue;
+ /* Ensure we never have two kinds of special handling
+ for the same insn. */
+ if (*special != SH_NONE && *special != special_op)
+ return 0;
+ *special = special_op;
+ }
+ else if (fmt[i] == 'E')
+ for (j = 0; j < XVECLEN (op, i); ++j)
+ {
+ unsigned int special_op = SH_NONE;
+ ok &= rtx_is_swappable_p (XVECEXP (op, i, j), &special_op);
+ if (special_op == SH_NONE)
+ continue;
+ /* Ensure we never have two kinds of special handling
+ for the same insn. */
+ if (*special != SH_NONE && *special != special_op)
+ return 0;
+ *special = special_op;
+ }
+
+ return ok;
+}
+
+/* Return 1 iff INSN is an operand that will not be affected by
+ having vector doublewords swapped in memory (in which case
+ *SPECIAL is unchanged), or that can be modified to be correct
+ if vector doublewords are swapped in memory (in which case
+ *SPECIAL is changed to a value indicating how). */
+static unsigned int
+insn_is_swappable_p (swap_web_entry *insn_entry, rtx insn,
+ unsigned int *special)
+{
+ /* Calls are always bad. */
+ if (GET_CODE (insn) == CALL_INSN)
+ return 0;
+
+ /* Loads and stores seen here are not permuting, but we can still
+ fix them up by converting them to permuting ones. Exceptions:
+ UNSPEC_LVE, UNSPEC_LVX, and UNSPEC_STVX, which have a PARALLEL
+ body instead of a SET; and UNSPEC_STVE, which has an UNSPEC
+ for the SET source. Also we must now make an exception for lvx
+ and stvx when they are not in the UNSPEC_LVX/STVX form (with the
+ explicit "& -16") since this leads to unrecognizable insns. */
+ rtx body = PATTERN (insn);
+ int i = INSN_UID (insn);
+
+ if (insn_entry[i].is_load)
+ {
+ if (GET_CODE (body) == SET)
+ {
+ rtx rhs = SET_SRC (body);
+ /* Even without a swap, the RHS might be a vec_select for, say,
+ a byte-reversing load. */
+ if (GET_CODE (rhs) != MEM)
+ return 0;
+ if (GET_CODE (XEXP (rhs, 0)) == AND)
+ return 0;
+
+ *special = SH_NOSWAP_LD;
+ return 1;
+ }
+ else
+ return 0;
+ }
+
+ if (insn_entry[i].is_store)
+ {
+ if (GET_CODE (body) == SET
+ && GET_CODE (SET_SRC (body)) != UNSPEC)
+ {
+ rtx lhs = SET_DEST (body);
+ /* Even without a swap, the LHS might be a vec_select for, say,
+ a byte-reversing store. */
+ if (GET_CODE (lhs) != MEM)
+ return 0;
+ if (GET_CODE (XEXP (lhs, 0)) == AND)
+ return 0;
+
+ *special = SH_NOSWAP_ST;
+ return 1;
+ }
+ else
+ return 0;
+ }
+
+ /* A convert to single precision can be left as is provided that
+ all of its uses are in xxspltw instructions that splat BE element
+ zero. */
+ if (GET_CODE (body) == SET
+ && GET_CODE (SET_SRC (body)) == UNSPEC
+ && XINT (SET_SRC (body), 1) == UNSPEC_VSX_CVDPSPN)
+ {
+ df_ref def;
+ struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+
+ FOR_EACH_INSN_INFO_DEF (def, insn_info)
+ {
+ struct df_link *link = DF_REF_CHAIN (def);
+ if (!link)
+ return 0;
+
+ for (; link; link = link->next)
+ {
+ rtx use_insn = DF_REF_INSN (link->ref);
+ rtx use_body = PATTERN (use_insn);
+ if (GET_CODE (use_body) != SET
+ || GET_CODE (SET_SRC (use_body)) != UNSPEC
+ || XINT (SET_SRC (use_body), 1) != UNSPEC_VSX_XXSPLTW
+ || XVECEXP (SET_SRC (use_body), 0, 1) != const0_rtx)
+ return 0;
+ }
+ }
+
+ return 1;
+ }
+
+ /* A concatenation of two doublewords is ok if we reverse the
+ order of the inputs. */
+ if (GET_CODE (body) == SET
+ && GET_CODE (SET_SRC (body)) == VEC_CONCAT
+ && (GET_MODE (SET_SRC (body)) == V2DFmode
+ || GET_MODE (SET_SRC (body)) == V2DImode))
+ {
+ *special = SH_CONCAT;
+ return 1;
+ }
+
+ /* V2DF reductions are always swappable. */
+ if (GET_CODE (body) == PARALLEL)
+ {
+ rtx expr = XVECEXP (body, 0, 0);
+ if (GET_CODE (expr) == SET
+ && v2df_reduction_p (SET_SRC (expr)))
+ return 1;
+ }
+
+ /* An UNSPEC_VPERM is ok if the mask operand is loaded from the
+ constant pool. */
+ if (GET_CODE (body) == SET
+ && GET_CODE (SET_SRC (body)) == UNSPEC
+ && XINT (SET_SRC (body), 1) == UNSPEC_VPERM
+ && XVECLEN (SET_SRC (body), 0) == 3
+ && GET_CODE (XVECEXP (SET_SRC (body), 0, 2)) == REG)
+ {
+ rtx mask_reg = XVECEXP (SET_SRC (body), 0, 2);
+ struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+ df_ref use;
+ FOR_EACH_INSN_INFO_USE (use, insn_info)
+ if (rtx_equal_p (DF_REF_REG (use), mask_reg))
+ {
+ struct df_link *def_link = DF_REF_CHAIN (use);
+ /* Punt if multiple definitions for this reg. */
+ if (def_link && !def_link->next
+ && const_load_sequence_p (insn_entry,
+ DF_REF_INSN (def_link->ref)))
+ {
+ *special = SH_VPERM;
+ return 1;
+ }
+ }
+ }
+
+ /* Otherwise check the operands for vector lane violations. */
+ return rtx_is_swappable_p (body, special);
+}
+
+enum chain_purpose { FOR_LOADS, FOR_STORES };
+
+/* Return true if the UD or DU chain headed by LINK is non-empty,
+ and every entry on the chain references an insn that is a
+ register swap. Furthermore, if PURPOSE is FOR_LOADS, each such
+ register swap must have only permuting loads as reaching defs.
+ If PURPOSE is FOR_STORES, each such register swap must have only
+ register swaps or permuting stores as reached uses. */
+static bool
+chain_contains_only_swaps (swap_web_entry *insn_entry, struct df_link *link,
+ enum chain_purpose purpose)
+{
+ if (!link)
+ return false;
+
+ for (; link; link = link->next)
+ {
+ if (!ALTIVEC_OR_VSX_VECTOR_MODE (GET_MODE (DF_REF_REG (link->ref))))
+ continue;
+
+ if (DF_REF_IS_ARTIFICIAL (link->ref))
+ return false;
+
+ rtx reached_insn = DF_REF_INSN (link->ref);
+ unsigned uid = INSN_UID (reached_insn);
+ struct df_insn_info *insn_info = DF_INSN_INFO_GET (reached_insn);
+
+ if (!insn_entry[uid].is_swap || insn_entry[uid].is_load
+ || insn_entry[uid].is_store)
+ return false;
+
+ if (purpose == FOR_LOADS)
+ {
+ df_ref use;
+ FOR_EACH_INSN_INFO_USE (use, insn_info)
+ {
+ struct df_link *swap_link = DF_REF_CHAIN (use);
+
+ while (swap_link)
+ {
+ if (DF_REF_IS_ARTIFICIAL (swap_link->ref))
+ return false;
+
+ rtx swap_def_insn = DF_REF_INSN (swap_link->ref);
+ unsigned uid2 = INSN_UID (swap_def_insn);
+
+ /* Only permuting loads are allowed. */
+ if (!insn_entry[uid2].is_swap || !insn_entry[uid2].is_load)
+ return false;
+
+ swap_link = swap_link->next;
+ }
+ }
+ }
+ else if (purpose == FOR_STORES)
+ {
+ df_ref def;
+ FOR_EACH_INSN_INFO_DEF (def, insn_info)
+ {
+ struct df_link *swap_link = DF_REF_CHAIN (def);
+
+ while (swap_link)
+ {
+ if (DF_REF_IS_ARTIFICIAL (swap_link->ref))
+ return false;
+
+ rtx swap_use_insn = DF_REF_INSN (swap_link->ref);
+ unsigned uid2 = INSN_UID (swap_use_insn);
+
+ /* Permuting stores or register swaps are allowed. */
+ if (!insn_entry[uid2].is_swap || insn_entry[uid2].is_load)
+ return false;
+
+ swap_link = swap_link->next;
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+/* Mark the xxswapdi instructions associated with permuting loads and
+ stores for removal. Note that we only flag them for deletion here,
+ as there is a possibility of a swap being reached from multiple
+ loads, etc. */
+static void
+mark_swaps_for_removal (swap_web_entry *insn_entry, unsigned int i)
+{
+ rtx insn = insn_entry[i].insn;
+ struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+
+ if (insn_entry[i].is_load)
+ {
+ df_ref def;
+ FOR_EACH_INSN_INFO_DEF (def, insn_info)
+ {
+ struct df_link *link = DF_REF_CHAIN (def);
+
+ /* We know by now that these are swaps, so we can delete
+ them confidently. */
+ while (link)
+ {
+ rtx use_insn = DF_REF_INSN (link->ref);
+ insn_entry[INSN_UID (use_insn)].will_delete = 1;
+ link = link->next;
+ }
+ }
+ }
+ else if (insn_entry[i].is_store)
+ {
+ df_ref use;
+ FOR_EACH_INSN_INFO_USE (use, insn_info)
+ {
+ /* Ignore uses for addressability. */
+ machine_mode mode = GET_MODE (DF_REF_REG (use));
+ if (!ALTIVEC_OR_VSX_VECTOR_MODE (mode))
+ continue;
+
+ struct df_link *link = DF_REF_CHAIN (use);
+
+ /* We know by now that these are swaps, so we can delete
+ them confidently. */
+ while (link)
+ {
+ rtx def_insn = DF_REF_INSN (link->ref);
+ insn_entry[INSN_UID (def_insn)].will_delete = 1;
+ link = link->next;
+ }
+ }
+ }
+}
+
+/* OP is either a CONST_VECTOR or an expression containing one.
+ Swap the first half of the vector with the second in the first
+ case. Recurse to find it in the second. */
+static void
+swap_const_vector_halves (rtx op)
+{
+ int i;
+ enum rtx_code code = GET_CODE (op);
+ if (GET_CODE (op) == CONST_VECTOR)
+ {
+ int half_units = GET_MODE_NUNITS (GET_MODE (op)) / 2;
+ for (i = 0; i < half_units; ++i)
+ {
+ rtx temp = CONST_VECTOR_ELT (op, i);
+ CONST_VECTOR_ELT (op, i) = CONST_VECTOR_ELT (op, i + half_units);
+ CONST_VECTOR_ELT (op, i + half_units) = temp;
+ }
+ }
+ else
+ {
+ int j;
+ const char *fmt = GET_RTX_FORMAT (code);
+ for (i = 0; i < GET_RTX_LENGTH (code); ++i)
+ if (fmt[i] == 'e' || fmt[i] == 'u')
+ swap_const_vector_halves (XEXP (op, i));
+ else if (fmt[i] == 'E')
+ for (j = 0; j < XVECLEN (op, i); ++j)
+ swap_const_vector_halves (XVECEXP (op, i, j));
+ }
+}
+
+/* Find all subregs of a vector expression that perform a narrowing,
+ and adjust the subreg index to account for doubleword swapping. */
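+/* For example, with doublewords swapped, a narrowing subreg
+ (subreg:DF (reg:V2DF 100) 0) must become (subreg:DF (reg:V2DF 100) 8),
+ and vice versa. */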
+static void
+adjust_subreg_index (rtx op)
+{
+ enum rtx_code code = GET_CODE (op);
+ if (code == SUBREG
+ && (GET_MODE_SIZE (GET_MODE (op))
+ < GET_MODE_SIZE (GET_MODE (XEXP (op, 0)))))
+ {
+ unsigned int index = SUBREG_BYTE (op);
+ if (index < 8)
+ index += 8;
+ else
+ index -= 8;
+ SUBREG_BYTE (op) = index;
+ }
+
+ const char *fmt = GET_RTX_FORMAT (code);
+ int i, j;
+ for (i = 0; i < GET_RTX_LENGTH (code); ++i)
+ if (fmt[i] == 'e' || fmt[i] == 'u')
+ adjust_subreg_index (XEXP (op, i));
+ else if (fmt[i] == 'E')
+ for (j = 0; j < XVECLEN (op, i); ++j)
+ adjust_subreg_index (XVECEXP (op, i, j));
+}
+
+/* Convert the non-permuting load INSN to a permuting one. */
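+/* For V2DI (illustrative), (set (reg:V2DI R) (mem:V2DI A)) becomes
+ (set (reg:V2DI R)
+ (vec_select:V2DI (mem:V2DI A)
+ (parallel [(const_int 1) (const_int 0)])))
+ which is the form a permuting lxvd2x load takes; permute_store below
+ performs the symmetric rewrite for stores. */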
+static void
+permute_load (rtx_insn *insn)
+{
+ rtx body = PATTERN (insn);
+ rtx mem_op = SET_SRC (body);
+ rtx tgt_reg = SET_DEST (body);
+ machine_mode mode = GET_MODE (tgt_reg);
+ int n_elts = GET_MODE_NUNITS (mode);
+ int half_elts = n_elts / 2;
+ rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts));
+ int i, j;
+ for (i = 0, j = half_elts; i < half_elts; ++i, ++j)
+ XVECEXP (par, 0, i) = GEN_INT (j);
+ for (i = half_elts, j = 0; j < half_elts; ++i, ++j)
+ XVECEXP (par, 0, i) = GEN_INT (j);
+ rtx sel = gen_rtx_VEC_SELECT (mode, mem_op, par);
+ SET_SRC (body) = sel;
+ INSN_CODE (insn) = -1; /* Force re-recognition. */
+ df_insn_rescan (insn);
+
+ if (dump_file)
+ fprintf (dump_file, "Replacing load %d with permuted load\n",
+ INSN_UID (insn));
+}
+
+/* Convert the non-permuting store INSN to a permuting one. */
+static void
+permute_store (rtx_insn *insn)
+{
+ rtx body = PATTERN (insn);
+ rtx src_reg = SET_SRC (body);
+ machine_mode mode = GET_MODE (src_reg);
+ int n_elts = GET_MODE_NUNITS (mode);
+ int half_elts = n_elts / 2;
+ rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts));
+ int i, j;
+ for (i = 0, j = half_elts; i < half_elts; ++i, ++j)
+ XVECEXP (par, 0, i) = GEN_INT (j);
+ for (i = half_elts, j = 0; j < half_elts; ++i, ++j)
+ XVECEXP (par, 0, i) = GEN_INT (j);
+ rtx sel = gen_rtx_VEC_SELECT (mode, src_reg, par);
+ SET_SRC (body) = sel;
+ INSN_CODE (insn) = -1; /* Force re-recognition. */
+ df_insn_rescan (insn);
+
+ if (dump_file)
+ fprintf (dump_file, "Replacing store %d with permuted store\n",
+ INSN_UID (insn));
+}
+
+/* Given INSN that contains a vector extract operation, adjust the index
+ of the extracted lane to account for the doubleword swap. */
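+/* For example, an extract of lane 0 from a V4SI register whose
+ doublewords are swapped must extract lane 2 instead, and vice versa. */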
+static void
+adjust_extract (rtx_insn *insn)
+{
+ rtx pattern = PATTERN (insn);
+ if (GET_CODE (pattern) == PARALLEL)
+ pattern = XVECEXP (pattern, 0, 0);
+ rtx src = SET_SRC (pattern);
+ /* The vec_select may be wrapped in a vec_duplicate for a splat, so
+ account for that. */
+ rtx sel = GET_CODE (src) == VEC_DUPLICATE ? XEXP (src, 0) : src;
+ rtx par = XEXP (sel, 1);
+ int half_elts = GET_MODE_NUNITS (GET_MODE (XEXP (sel, 0))) >> 1;
+ int lane = INTVAL (XVECEXP (par, 0, 0));
+ lane = lane >= half_elts ? lane - half_elts : lane + half_elts;
+ XVECEXP (par, 0, 0) = GEN_INT (lane);
+ INSN_CODE (insn) = -1; /* Force re-recognition. */
+ df_insn_rescan (insn);
+
+ if (dump_file)
+ fprintf (dump_file, "Changing lane for extract %d\n", INSN_UID (insn));
+}
+
+/* Given INSN that contains a vector direct-splat operation, adjust the
+ index of the source lane to account for the doubleword swap. */
+static void
+adjust_splat (rtx_insn *insn)
+{
+ rtx body = PATTERN (insn);
+ rtx unspec = XEXP (body, 1);
+ int half_elts = GET_MODE_NUNITS (GET_MODE (unspec)) >> 1;
+ int lane = INTVAL (XVECEXP (unspec, 0, 1));
+ lane = lane >= half_elts ? lane - half_elts : lane + half_elts;
+ XVECEXP (unspec, 0, 1) = GEN_INT (lane);
+ INSN_CODE (insn) = -1; /* Force re-recognition. */
+ df_insn_rescan (insn);
+
+ if (dump_file)
+ fprintf (dump_file, "Changing lane for splat %d\n", INSN_UID (insn));
+}
+
+/* Given INSN that contains an XXPERMDI operation (that is not a doubleword
+ swap), reverse the order of the source operands and adjust the indices
+ of the source lanes to account for doubleword reversal. */
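+/* For example, lane selectors (0, 2) become (1, 3); in general the new
+ selector pair is (3 - old_lane1, 3 - old_lane0). */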
+static void
+adjust_xxpermdi (rtx_insn *insn)
+{
+ rtx set = PATTERN (insn);
+ rtx select = XEXP (set, 1);
+ rtx concat = XEXP (select, 0);
+ rtx src0 = XEXP (concat, 0);
+ XEXP (concat, 0) = XEXP (concat, 1);
+ XEXP (concat, 1) = src0;
+ rtx parallel = XEXP (select, 1);
+ int lane0 = INTVAL (XVECEXP (parallel, 0, 0));
+ int lane1 = INTVAL (XVECEXP (parallel, 0, 1));
+ int new_lane0 = 3 - lane1;
+ int new_lane1 = 3 - lane0;
+ XVECEXP (parallel, 0, 0) = GEN_INT (new_lane0);
+ XVECEXP (parallel, 0, 1) = GEN_INT (new_lane1);
+ INSN_CODE (insn) = -1; /* Force re-recognition. */
+ df_insn_rescan (insn);
+
+ if (dump_file)
+ fprintf (dump_file, "Changing lanes for xxpermdi %d\n", INSN_UID (insn));
+}
+
+/* Given INSN that contains a VEC_CONCAT operation of two doublewords,
+ reverse the order of those inputs. */
+static void
+adjust_concat (rtx_insn *insn)
+{
+ rtx set = PATTERN (insn);
+ rtx concat = XEXP (set, 1);
+ rtx src0 = XEXP (concat, 0);
+ XEXP (concat, 0) = XEXP (concat, 1);
+ XEXP (concat, 1) = src0;
+ INSN_CODE (insn) = -1; /* Force re-recognition. */
+ df_insn_rescan (insn);
+
+ if (dump_file)
+ fprintf (dump_file, "Reversing inputs for concat %d\n", INSN_UID (insn));
+}
+
+/* Given an UNSPEC_VPERM insn, modify the mask loaded from the
+ constant pool to reflect swapped doublewords. */
+static void
+adjust_vperm (rtx_insn *insn)
+{
+ /* We previously determined that the UNSPEC_VPERM was fed by a
+ swap of a swapping load of a TOC-relative constant pool symbol.
+ Find the MEM in the swapping load and replace it with a MEM for
+ the adjusted mask constant. */
+ rtx set = PATTERN (insn);
+ rtx mask_reg = XVECEXP (SET_SRC (set), 0, 2);
+
+ /* Find the swap. */
+ struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+ df_ref use;
+ rtx_insn *swap_insn = 0;
+ FOR_EACH_INSN_INFO_USE (use, insn_info)
+ if (rtx_equal_p (DF_REF_REG (use), mask_reg))
+ {
+ struct df_link *def_link = DF_REF_CHAIN (use);
+ gcc_assert (def_link && !def_link->next);
+ swap_insn = DF_REF_INSN (def_link->ref);
+ break;
+ }
+ gcc_assert (swap_insn);
+
+ /* Find the load. */
+ insn_info = DF_INSN_INFO_GET (swap_insn);
+ rtx_insn *load_insn = 0;
+ FOR_EACH_INSN_INFO_USE (use, insn_info)
+ {
+ struct df_link *def_link = DF_REF_CHAIN (use);
+ gcc_assert (def_link && !def_link->next);
+ load_insn = DF_REF_INSN (def_link->ref);
+ break;
+ }
+ gcc_assert (load_insn);
+
+ /* Find the TOC-relative symbol access. */
+ insn_info = DF_INSN_INFO_GET (load_insn);
+ rtx_insn *tocrel_insn = 0;
+ FOR_EACH_INSN_INFO_USE (use, insn_info)
+ {
+ struct df_link *def_link = DF_REF_CHAIN (use);
+ gcc_assert (def_link && !def_link->next);
+ tocrel_insn = DF_REF_INSN (def_link->ref);
+ break;
+ }
+ gcc_assert (tocrel_insn);
+
+ /* Find the embedded CONST_VECTOR. We have to call toc_relative_expr_p
+ to set tocrel_base; otherwise it would be unnecessary as we've
+ already established it will return true. */
+ rtx base, offset;
+ const_rtx tocrel_base;
+ rtx tocrel_expr = SET_SRC (PATTERN (tocrel_insn));
+ /* There is an extra level of indirection for small/large code models. */
+ if (GET_CODE (tocrel_expr) == MEM)
+ tocrel_expr = XEXP (tocrel_expr, 0);
+ if (!toc_relative_expr_p (tocrel_expr, false, &tocrel_base, NULL))
+ gcc_unreachable ();
+ split_const (XVECEXP (tocrel_base, 0, 0), &base, &offset);
+ rtx const_vector = get_pool_constant (base);
+ /* With the extra indirection, get_pool_constant will produce the
+ real constant from the reg_equal expression, so get the real
+ constant. */
+ if (GET_CODE (const_vector) == SYMBOL_REF)
+ const_vector = get_pool_constant (const_vector);
+ gcc_assert (GET_CODE (const_vector) == CONST_VECTOR);
+
+ /* Create an adjusted mask from the initial mask. */
+ unsigned int new_mask[16], i, val;
+ for (i = 0; i < 16; ++i)
+ {
+ val = INTVAL (XVECEXP (const_vector, 0, i));
+ if (val < 16)
+ new_mask[i] = (val + 8) % 16;
+ else
+ new_mask[i] = ((val + 8) % 16) + 16;
+ }
+
+ /* Create a new CONST_VECTOR and a MEM that references it. */
+ rtx vals = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
+ for (i = 0; i < 16; ++i)
+ XVECEXP (vals, 0, i) = GEN_INT (new_mask[i]);
+ rtx new_const_vector = gen_rtx_CONST_VECTOR (V16QImode, XVEC (vals, 0));
+ rtx new_mem = force_const_mem (V16QImode, new_const_vector);
+ /* This gives us a MEM whose base operand is a SYMBOL_REF, which we
+ can't recognize. Force the SYMBOL_REF into a register. */
+ if (!REG_P (XEXP (new_mem, 0)))
+ {
+ rtx base_reg = force_reg (Pmode, XEXP (new_mem, 0));
+ XEXP (new_mem, 0) = base_reg;
+ /* Move the newly created insn ahead of the load insn. */
+ rtx_insn *force_insn = get_last_insn ();
+ remove_insn (force_insn);
+ rtx_insn *before_load_insn = PREV_INSN (load_insn);
+ add_insn_after (force_insn, before_load_insn, BLOCK_FOR_INSN (load_insn));
+ df_insn_rescan (before_load_insn);
+ df_insn_rescan (force_insn);
+ }
+
+ /* Replace the MEM in the load instruction and rescan it. */
+ XEXP (SET_SRC (PATTERN (load_insn)), 0) = new_mem;
+ INSN_CODE (load_insn) = -1; /* Force re-recognition. */
+ df_insn_rescan (load_insn);
+
+ if (dump_file)
+ fprintf (dump_file, "Adjusting mask for vperm %d\n", INSN_UID (insn));
+}
+
+/* The insn described by INSN_ENTRY[I] can be swapped, but only
+ with special handling. Take care of that here. */
+static void
+handle_special_swappables (swap_web_entry *insn_entry, unsigned i)
+{
+ rtx_insn *insn = insn_entry[i].insn;
+ rtx body = PATTERN (insn);
+
+ switch (insn_entry[i].special_handling)
+ {
+ default:
+ gcc_unreachable ();
+ case SH_CONST_VECTOR:
+ {
+ /* A CONST_VECTOR will only show up somewhere in the RHS of a SET. */
+ gcc_assert (GET_CODE (body) == SET);
+ rtx rhs = SET_SRC (body);
+ swap_const_vector_halves (rhs);
+ if (dump_file)
+ fprintf (dump_file, "Swapping constant halves in insn %d\n", i);
+ break;
+ }
+ case SH_SUBREG:
+ /* A subreg of the same size is already safe. For subregs that
+ select a smaller portion of a reg, adjust the index for
+ swapped doublewords. */
+ adjust_subreg_index (body);
+ if (dump_file)
+ fprintf (dump_file, "Adjusting subreg in insn %d\n", i);
+ break;
+ case SH_NOSWAP_LD:
+ /* Convert a non-permuting load to a permuting one. */
+ permute_load (insn);
+ break;
+ case SH_NOSWAP_ST:
+ /* Convert a non-permuting store to a permuting one. */
+ permute_store (insn);
+ break;
+ case SH_EXTRACT:
+ /* Change the lane on an extract operation. */
+ adjust_extract (insn);
+ break;
+ case SH_SPLAT:
+ /* Change the lane on a direct-splat operation. */
+ adjust_splat (insn);
+ break;
+ case SH_XXPERMDI:
+ /* Change the lanes on an XXPERMDI operation. */
+ adjust_xxpermdi (insn);
+ break;
+ case SH_CONCAT:
+ /* Reverse the order of a concatenation operation. */
+ adjust_concat (insn);
+ break;
+ case SH_VPERM:
+ /* Change the mask loaded from the constant pool for a VPERM. */
+ adjust_vperm (insn);
+ break;
+ }
+}
+
+/* Find the insn from the Ith table entry, which is known to be a
+ register swap Y = SWAP(X). Replace it with a copy Y = X. */
+static void
+replace_swap_with_copy (swap_web_entry *insn_entry, unsigned i)
+{
+ rtx_insn *insn = insn_entry[i].insn;
+ rtx body = PATTERN (insn);
+ rtx src_reg = XEXP (SET_SRC (body), 0);
+ rtx copy = gen_rtx_SET (SET_DEST (body), src_reg);
+ rtx_insn *new_insn = emit_insn_before (copy, insn);
+ set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn));
+ df_insn_rescan (new_insn);
+
+ if (dump_file)
+ {
+ unsigned int new_uid = INSN_UID (new_insn);
+ fprintf (dump_file, "Replacing swap %d with copy %d\n", i, new_uid);
+ }
+
+ df_insn_delete (insn);
+ remove_insn (insn);
+ insn->set_deleted ();
+}
+
+/* Dump the swap table to DUMP_FILE. */
+static void
+dump_swap_insn_table (swap_web_entry *insn_entry)
+{
+ int e = get_max_uid ();
+ fprintf (dump_file, "\nRelevant insns with their flag settings\n\n");
+
+ for (int i = 0; i < e; ++i)
+ if (insn_entry[i].is_relevant)
+ {
+ swap_web_entry *pred_entry = (swap_web_entry *)insn_entry[i].pred ();
+ fprintf (dump_file, "%6d %6d ", i,
+ pred_entry && pred_entry->insn
+ ? INSN_UID (pred_entry->insn) : 0);
+ if (insn_entry[i].is_load)
+ fputs ("load ", dump_file);
+ if (insn_entry[i].is_store)
+ fputs ("store ", dump_file);
+ if (insn_entry[i].is_swap)
+ fputs ("swap ", dump_file);
+ if (insn_entry[i].is_live_in)
+ fputs ("live-in ", dump_file);
+ if (insn_entry[i].is_live_out)
+ fputs ("live-out ", dump_file);
+ if (insn_entry[i].contains_subreg)
+ fputs ("subreg ", dump_file);
+ if (insn_entry[i].is_128_int)
+ fputs ("int128 ", dump_file);
+ if (insn_entry[i].is_call)
+ fputs ("call ", dump_file);
+ if (insn_entry[i].is_swappable)
+ {
+ fputs ("swappable ", dump_file);
+ if (insn_entry[i].special_handling == SH_CONST_VECTOR)
+ fputs ("special:constvec ", dump_file);
+ else if (insn_entry[i].special_handling == SH_SUBREG)
+ fputs ("special:subreg ", dump_file);
+ else if (insn_entry[i].special_handling == SH_NOSWAP_LD)
+ fputs ("special:load ", dump_file);
+ else if (insn_entry[i].special_handling == SH_NOSWAP_ST)
+ fputs ("special:store ", dump_file);
+ else if (insn_entry[i].special_handling == SH_EXTRACT)
+ fputs ("special:extract ", dump_file);
+ else if (insn_entry[i].special_handling == SH_SPLAT)
+ fputs ("special:splat ", dump_file);
+ else if (insn_entry[i].special_handling == SH_XXPERMDI)
+ fputs ("special:xxpermdi ", dump_file);
+ else if (insn_entry[i].special_handling == SH_CONCAT)
+ fputs ("special:concat ", dump_file);
+ else if (insn_entry[i].special_handling == SH_VPERM)
+ fputs ("special:vperm ", dump_file);
+ }
+ if (insn_entry[i].web_not_optimizable)
+ fputs ("unoptimizable ", dump_file);
+ if (insn_entry[i].will_delete)
+ fputs ("delete ", dump_file);
+ fputs ("\n", dump_file);
+ }
+ fputs ("\n", dump_file);
+}
+
+/* Return RTX with its address canonicalized to (reg) or (+ reg reg).
+ Here RTX is an (& addr (const_int -16)). Always return a new copy
+ to avoid problems with combine. */
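+/* For example, (and:DI (plus:DI (reg:DI 3) (const_int 16)) (const_int -16))
+ is returned as a fresh (and:DI (plus:DI (reg:DI 3) (reg:DI N)) (const_int -16)),
+ where pseudo N holds the constant 16 forced into a register. */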
+static rtx
+alignment_with_canonical_addr (rtx align)
+{
+ rtx canon;
+ rtx addr = XEXP (align, 0);
+
+ if (REG_P (addr))
+ canon = addr;
+
+ else if (GET_CODE (addr) == PLUS)
+ {
+ rtx addrop0 = XEXP (addr, 0);
+ rtx addrop1 = XEXP (addr, 1);
+
+ if (!REG_P (addrop0))
+ addrop0 = force_reg (GET_MODE (addrop0), addrop0);
+
+ if (!REG_P (addrop1))
+ addrop1 = force_reg (GET_MODE (addrop1), addrop1);
+
+ canon = gen_rtx_PLUS (GET_MODE (addr), addrop0, addrop1);
+ }
+
+ else
+ canon = force_reg (GET_MODE (addr), addr);
+
+ return gen_rtx_AND (GET_MODE (align), canon, GEN_INT (-16));
+}
+
+/* Check whether INSN is a SET whose source ANDs an address with an
+ alignment mask of -16, and if so, return a fully-expanded rtx for
+ the masking operation. */
+static rtx
+alignment_mask (rtx_insn *insn)
+{
+ rtx body = PATTERN (insn);
+
+ if (GET_CODE (body) != SET
+ || GET_CODE (SET_SRC (body)) != AND
+ || !REG_P (XEXP (SET_SRC (body), 0)))
+ return 0;
+
+ rtx mask = XEXP (SET_SRC (body), 1);
+
+ if (GET_CODE (mask) == CONST_INT)
+ {
+ if (INTVAL (mask) == -16)
+ return alignment_with_canonical_addr (SET_SRC (body));
+ else
+ return 0;
+ }
+
+ if (!REG_P (mask))
+ return 0;
+
+ struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+ df_ref use;
+ rtx real_mask = 0;
+
+ FOR_EACH_INSN_INFO_USE (use, insn_info)
+ {
+ if (!rtx_equal_p (DF_REF_REG (use), mask))
+ continue;
+
+ struct df_link *def_link = DF_REF_CHAIN (use);
+ if (!def_link || def_link->next)
+ return 0;
+
+ rtx_insn *const_insn = DF_REF_INSN (def_link->ref);
+ rtx const_body = PATTERN (const_insn);
+ if (GET_CODE (const_body) != SET)
+ return 0;
+
+ real_mask = SET_SRC (const_body);
+
+ if (GET_CODE (real_mask) != CONST_INT
+ || INTVAL (real_mask) != -16)
+ return 0;
+ }
+
+ if (real_mask == 0)
+ return 0;
+
+ return alignment_with_canonical_addr (SET_SRC (body));
+}
+
+/* Given INSN that's a load or store based at BASE_REG, look for a
+ feeding computation that aligns its address on a 16-byte boundary. */
+static rtx
+find_alignment_op (rtx_insn *insn, rtx base_reg)
+{
+ df_ref base_use;
+ struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+ rtx and_operation = 0;
+
+ FOR_EACH_INSN_INFO_USE (base_use, insn_info)
+ {
+ if (!rtx_equal_p (DF_REF_REG (base_use), base_reg))
+ continue;
+
+ struct df_link *base_def_link = DF_REF_CHAIN (base_use);
+ if (!base_def_link || base_def_link->next)
+ break;
+
+ /* With stack-protector code enabled, and possibly in other
+ circumstances, there may not be an associated insn for
+ the def. */
+ if (DF_REF_IS_ARTIFICIAL (base_def_link->ref))
+ break;
+
+ rtx_insn *and_insn = DF_REF_INSN (base_def_link->ref);
+ and_operation = alignment_mask (and_insn);
+ if (and_operation != 0)
+ break;
+ }
+
+ return and_operation;
+}
+
+struct del_info { bool replace; rtx_insn *replace_insn; };
+
+/* If INSN is the load for an lvx pattern, put it in canonical form. */
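+/* That is, a swapping load whose base address B comes from an insn of
+ the form (set B (and B' (const_int -16))), and whose lone use is a
+ register swap, is rewritten so the load reads (mem (and B' (const_int -16)))
+ directly and the swap becomes a plain copy; together these match the
+ lvx form. */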
+static void
+recombine_lvx_pattern (rtx_insn *insn, del_info *to_delete)
+{
+ rtx body = PATTERN (insn);
+ gcc_assert (GET_CODE (body) == SET
+ && GET_CODE (SET_SRC (body)) == VEC_SELECT
+ && GET_CODE (XEXP (SET_SRC (body), 0)) == MEM);
+
+ rtx mem = XEXP (SET_SRC (body), 0);
+ rtx base_reg = XEXP (mem, 0);
+
+ rtx and_operation = find_alignment_op (insn, base_reg);
+
+ if (and_operation != 0)
+ {
+ df_ref def;
+ struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+ FOR_EACH_INSN_INFO_DEF (def, insn_info)
+ {
+ struct df_link *link = DF_REF_CHAIN (def);
+ if (!link || link->next)
+ break;
+
+ rtx_insn *swap_insn = DF_REF_INSN (link->ref);
+ if (!insn_is_swap_p (swap_insn)
+ || insn_is_load_p (swap_insn)
+ || insn_is_store_p (swap_insn))
+ break;
+
+ /* Expected lvx pattern found. Change the swap to
+ a copy, and propagate the AND operation into the
+ load. */
+ to_delete[INSN_UID (swap_insn)].replace = true;
+ to_delete[INSN_UID (swap_insn)].replace_insn = swap_insn;
+
+ XEXP (mem, 0) = and_operation;
+ SET_SRC (body) = mem;
+ INSN_CODE (insn) = -1; /* Force re-recognition. */
+ df_insn_rescan (insn);
+
+ if (dump_file)
+ fprintf (dump_file, "lvx opportunity found at %d\n",
+ INSN_UID (insn));
+ }
+ }
+}
+
+/* If INSN is the store for an stvx pattern, put it in canonical form. */
+static void
+recombine_stvx_pattern (rtx_insn *insn, del_info *to_delete)
+{
+ rtx body = PATTERN (insn);
+ gcc_assert (GET_CODE (body) == SET
+ && GET_CODE (SET_DEST (body)) == MEM
+ && GET_CODE (SET_SRC (body)) == VEC_SELECT);
+ rtx mem = SET_DEST (body);
+ rtx base_reg = XEXP (mem, 0);
+
+ rtx and_operation = find_alignment_op (insn, base_reg);
+
+ if (and_operation != 0)
+ {
+ rtx src_reg = XEXP (SET_SRC (body), 0);
+ df_ref src_use;
+ struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+ FOR_EACH_INSN_INFO_USE (src_use, insn_info)
+ {
+ if (!rtx_equal_p (DF_REF_REG (src_use), src_reg))
+ continue;
+
+ struct df_link *link = DF_REF_CHAIN (src_use);
+ if (!link || link->next)
+ break;
+
+ rtx_insn *swap_insn = DF_REF_INSN (link->ref);
+ if (!insn_is_swap_p (swap_insn)
+ || insn_is_load_p (swap_insn)
+ || insn_is_store_p (swap_insn))
+ break;
+
+ /* Expected stvx pattern found. Change the swap to
+ a copy, and propagate the AND operation into the
+ store. */
+ to_delete[INSN_UID (swap_insn)].replace = true;
+ to_delete[INSN_UID (swap_insn)].replace_insn = swap_insn;
+
+ XEXP (mem, 0) = and_operation;
+ SET_SRC (body) = src_reg;
+ INSN_CODE (insn) = -1; /* Force re-recognition. */
+ df_insn_rescan (insn);
+
+ if (dump_file)
+ fprintf (dump_file, "stvx opportunity found at %d\n",
+ INSN_UID (insn));
+ }
+ }
+}
+
+/* Look for patterns created from builtin lvx and stvx calls, and
+ canonicalize them to be properly recognized as such. */
+static void
+recombine_lvx_stvx_patterns (function *fun)
+{
+ int i;
+ basic_block bb;
+ rtx_insn *insn;
+
+ int num_insns = get_max_uid ();
+ del_info *to_delete = XCNEWVEC (del_info, num_insns);
+
+ FOR_ALL_BB_FN (bb, fun)
+ FOR_BB_INSNS (bb, insn)
+ {
+ if (!NONDEBUG_INSN_P (insn))
+ continue;
+
+ if (insn_is_load_p (insn) && insn_is_swap_p (insn))
+ recombine_lvx_pattern (insn, to_delete);
+ else if (insn_is_store_p (insn) && insn_is_swap_p (insn))
+ recombine_stvx_pattern (insn, to_delete);
+ }
+
+ /* Turning swaps into copies is delayed until now, to avoid problems
+ with deleting instructions during the insn walk. */
+ for (i = 0; i < num_insns; i++)
+ if (to_delete[i].replace)
+ {
+ rtx swap_body = PATTERN (to_delete[i].replace_insn);
+ rtx src_reg = XEXP (SET_SRC (swap_body), 0);
+ rtx copy = gen_rtx_SET (SET_DEST (swap_body), src_reg);
+ rtx_insn *new_insn = emit_insn_before (copy,
+ to_delete[i].replace_insn);
+ set_block_for_insn (new_insn,
+ BLOCK_FOR_INSN (to_delete[i].replace_insn));
+ df_insn_rescan (new_insn);
+ df_insn_delete (to_delete[i].replace_insn);
+ remove_insn (to_delete[i].replace_insn);
+ to_delete[i].replace_insn->set_deleted ();
+ }
+
+ free (to_delete);
+}
+
+/* Main entry point for this pass. */
+unsigned int
+rs6000_analyze_swaps (function *fun)
+{
+ swap_web_entry *insn_entry;
+ basic_block bb;
+ rtx_insn *insn, *curr_insn = 0;
+
+ /* Dataflow analysis for use-def chains. */
+ df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
+ df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
+ df_analyze ();
+ df_set_flags (DF_DEFER_INSN_RESCAN);
+
+ /* Pre-pass to recombine lvx and stvx patterns so we don't lose info. */
+ recombine_lvx_stvx_patterns (fun);
+
+ /* Allocate structure to represent webs of insns. */
+ insn_entry = XCNEWVEC (swap_web_entry, get_max_uid ());
+
+ /* Walk the insns to gather basic data. */
+ FOR_ALL_BB_FN (bb, fun)
+ FOR_BB_INSNS_SAFE (bb, insn, curr_insn)
+ {
+ unsigned int uid = INSN_UID (insn);
+ if (NONDEBUG_INSN_P (insn))
+ {
+ insn_entry[uid].insn = insn;
+
+ if (GET_CODE (insn) == CALL_INSN)
+ insn_entry[uid].is_call = 1;
+
+ /* Walk the uses and defs to see if we mention vector regs.
+ Record any constraints on optimization of such mentions. */
+ struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+ df_ref mention;
+ FOR_EACH_INSN_INFO_USE (mention, insn_info)
+ {
+ /* We use DF_REF_REAL_REG here to get inside any subregs. */
+ machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention));
+
+ /* If a use gets its value from a call insn, it will be
+ a hard register and will look like (reg:V4SI 3 3).
+ The df analysis creates two mentions for GPR3 and GPR4,
+ both DImode. We must recognize this and treat it as a
+ vector mention to ensure the call is unioned with this
+ use. */
+ if (mode == DImode && DF_REF_INSN_INFO (mention))
+ {
+ rtx feeder = DF_REF_INSN (mention);
+ /* FIXME: It is pretty hard to get from the df mention
+ to the mode of the use in the insn. We arbitrarily
+ pick a vector mode here, even though the use might
+ be a real DImode. We can be too conservative
+ (create a web larger than necessary) because of
+ this, so consider eventually fixing this. */
+ if (GET_CODE (feeder) == CALL_INSN)
+ mode = V4SImode;
+ }
+
+ if (ALTIVEC_OR_VSX_VECTOR_MODE (mode) || mode == TImode)
+ {
+ insn_entry[uid].is_relevant = 1;
+ if (mode == TImode || mode == V1TImode
+ || FLOAT128_VECTOR_P (mode))
+ insn_entry[uid].is_128_int = 1;
+ if (DF_REF_INSN_INFO (mention))
+ insn_entry[uid].contains_subreg
+ = !rtx_equal_p (DF_REF_REG (mention),
+ DF_REF_REAL_REG (mention));
+ union_defs (insn_entry, insn, mention);
+ }
+ }
+ FOR_EACH_INSN_INFO_DEF (mention, insn_info)
+ {
+ /* We use DF_REF_REAL_REG here to get inside any subregs. */
+ machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention));
+
+ /* If we're loading up a hard vector register for a call,
+ it looks like (set (reg:V4SI 9 9) (...)). The df
+ analysis creates two mentions for GPR9 and GPR10, both
+ DImode. So relying on the mode from the mentions
+ isn't sufficient to ensure we union the call into the
+ web with the parameter setup code. */
+ rtx insn_body = PATTERN (insn);
+ if (mode == DImode && GET_CODE (insn_body) == SET
+ && ALTIVEC_OR_VSX_VECTOR_MODE (GET_MODE (SET_DEST (insn_body))))
+ mode = GET_MODE (SET_DEST (insn_body));
+
+ if (ALTIVEC_OR_VSX_VECTOR_MODE (mode) || mode == TImode)
+ {
+ insn_entry[uid].is_relevant = 1;
+ if (mode == TImode || mode == V1TImode
+ || FLOAT128_VECTOR_P (mode))
+ insn_entry[uid].is_128_int = 1;
+ if (DF_REF_INSN_INFO (mention))
+ insn_entry[uid].contains_subreg
+ = !rtx_equal_p (DF_REF_REG (mention),
+ DF_REF_REAL_REG (mention));
+ /* REG_FUNCTION_VALUE_P is not valid for subregs. */
+ else if (REG_FUNCTION_VALUE_P (DF_REF_REG (mention)))
+ insn_entry[uid].is_live_out = 1;
+ union_uses (insn_entry, insn, mention);
+ }
+ }
+
+ if (insn_entry[uid].is_relevant)
+ {
+ /* Determine if this is a load or store. */
+ insn_entry[uid].is_load = insn_is_load_p (insn);
+ insn_entry[uid].is_store = insn_is_store_p (insn);
+
+ /* Determine if this is a doubleword swap. If not,
+ determine whether it can legally be swapped. */
+ if (insn_is_swap_p (insn))
+ insn_entry[uid].is_swap = 1;
+ else
+ {
+ unsigned int special = SH_NONE;
+ insn_entry[uid].is_swappable
+ = insn_is_swappable_p (insn_entry, insn, &special);
+ if (special != SH_NONE && insn_entry[uid].contains_subreg)
+ insn_entry[uid].is_swappable = 0;
+ else if (special != SH_NONE)
+ insn_entry[uid].special_handling = special;
+ else if (insn_entry[uid].contains_subreg)
+ insn_entry[uid].special_handling = SH_SUBREG;
+ }
+ }
+ }
+ }
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nSwap insn entry table when first built\n");
+ dump_swap_insn_table (insn_entry);
+ }
+
+ /* Record unoptimizable webs. */
+ unsigned e = get_max_uid (), i;
+ for (i = 0; i < e; ++i)
+ {
+ if (!insn_entry[i].is_relevant)
+ continue;
+
+ swap_web_entry *root
+ = (swap_web_entry*)(&insn_entry[i])->unionfind_root ();
+
+ if (insn_entry[i].is_live_in || insn_entry[i].is_live_out
+ || (insn_entry[i].contains_subreg
+ && insn_entry[i].special_handling != SH_SUBREG)
+ || insn_entry[i].is_128_int || insn_entry[i].is_call
+ || !(insn_entry[i].is_swappable || insn_entry[i].is_swap))
+ root->web_not_optimizable = 1;
+
+ /* If we have loads or stores that aren't permuting then the
+ optimization isn't appropriate. */
+ else if ((insn_entry[i].is_load || insn_entry[i].is_store)
+ && !insn_entry[i].is_swap && !insn_entry[i].is_swappable)
+ root->web_not_optimizable = 1;
+
+ /* If we have permuting loads or stores that are not accompanied
+ by a register swap, the optimization isn't appropriate. */
+ else if (insn_entry[i].is_load && insn_entry[i].is_swap)
+ {
+ rtx insn = insn_entry[i].insn;
+ struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+ df_ref def;
+
+ FOR_EACH_INSN_INFO_DEF (def, insn_info)
+ {
+ struct df_link *link = DF_REF_CHAIN (def);
+
+ if (!chain_contains_only_swaps (insn_entry, link, FOR_LOADS))
+ {
+ root->web_not_optimizable = 1;
+ break;
+ }
+ }
+ }
+ else if (insn_entry[i].is_store && insn_entry[i].is_swap)
+ {
+ rtx insn = insn_entry[i].insn;
+ struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
+ df_ref use;
+
+ FOR_EACH_INSN_INFO_USE (use, insn_info)
+ {
+ struct df_link *link = DF_REF_CHAIN (use);
+
+ if (!chain_contains_only_swaps (insn_entry, link, FOR_STORES))
+ {
+ root->web_not_optimizable = 1;
+ break;
+ }
+ }
+ }
+ }
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nSwap insn entry table after web analysis\n");
+ dump_swap_insn_table (insn_entry);
+ }
+
+ /* For each load and store in an optimizable web (which implies
+ the loads and stores are permuting), find the associated
+ register swaps and mark them for removal. Due to various
+ optimizations we may mark the same swap more than once. Also
+ perform special handling for swappable insns that require it. */
+ for (i = 0; i < e; ++i)
+ if ((insn_entry[i].is_load || insn_entry[i].is_store)
+ && insn_entry[i].is_swap)
+ {
+ swap_web_entry* root_entry
+ = (swap_web_entry*)((&insn_entry[i])->unionfind_root ());
+ if (!root_entry->web_not_optimizable)
+ mark_swaps_for_removal (insn_entry, i);
+ }
+ else if (insn_entry[i].is_swappable && insn_entry[i].special_handling)
+ {
+ swap_web_entry* root_entry
+ = (swap_web_entry*)((&insn_entry[i])->unionfind_root ());
+ if (!root_entry->web_not_optimizable)
+ handle_special_swappables (insn_entry, i);
+ }
+
+ /* Now delete the swaps marked for removal. */
+ for (i = 0; i < e; ++i)
+ if (insn_entry[i].will_delete)
+ replace_swap_with_copy (insn_entry, i);
+
+ /* Clean up. */
+ free (insn_entry);
+ return 0;
+}
+
+const pass_data pass_data_analyze_swaps =
+{
+ RTL_PASS, /* type */
+ "swaps", /* name */
+ OPTGROUP_NONE, /* optinfo_flags */
+ TV_NONE, /* tv_id */
+ 0, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ TODO_df_finish, /* todo_flags_finish */
+};
+
+class pass_analyze_swaps : public rtl_opt_pass
+{
+public:
+  pass_analyze_swaps (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_analyze_swaps, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ virtual bool gate (function *)
+ {
+ return (optimize > 0 && !BYTES_BIG_ENDIAN && TARGET_VSX
+ && !TARGET_P9_VECTOR && rs6000_optimize_swaps);
+ }
+
+ virtual unsigned int execute (function *fun)
+ {
+ return rs6000_analyze_swaps (fun);
+ }
+
+ opt_pass *clone ()
+ {
+ return new pass_analyze_swaps (m_ctxt);
+ }
+
+}; // class pass_analyze_swaps
+
+rtl_opt_pass *
+make_pass_analyze_swaps (gcc::context *ctxt)
+{
+ return new pass_analyze_swaps (ctxt);
+}
+
return "";
}
-\f
-/* Analyze vector computations and remove unnecessary doubleword
- swaps (xxswapdi instructions). This pass is performed only
- for little-endian VSX code generation.
-
- For this specific case, loads and stores of 4x32 and 2x64 vectors
-   are inefficient.  These are implemented using the lxvd2x and
-   stxvd2x instructions, which invert the order of doublewords in
- a vector register. Thus the code generation inserts an xxswapdi
- after each such load, and prior to each such store. (For spill
- code after register assignment, an additional xxswapdi is inserted
- following each store in order to return a hard register to its
- unpermuted value.)
-
- The extra xxswapdi instructions reduce performance. This can be
- particularly bad for vectorized code. The purpose of this pass
- is to reduce the number of xxswapdi instructions required for
- correctness.
-
- The primary insight is that much code that operates on vectors
- does not care about the relative order of elements in a register,
- so long as the correct memory order is preserved. If we have
-   a computation where all input values are provided by lxvd2x/xxswapdi
-   sequences, all outputs are stored using xxswapdi/stxvd2x sequences,
- and all intermediate computations are pure SIMD (independent of
- element order), then all the xxswapdi's associated with the loads
- and stores may be removed.
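-
-   As an illustration (a sketch only; register numbers and base
-   addresses are arbitrary), doubling each element of a vector of
-   doubles might be generated as:
-
-     lxvd2x  32,0,8       # load; doublewords arrive reversed
-     xxswapd 32,32        # restore element order
-     xvadddp 32,32,32     # element-order-independent SIMD add
-     xxswapd 32,32        # return to memory order
-     stxvd2x 32,0,9       # store; doublewords written reversed
-
-   Deleting both xxswapd instructions leaves the register holding
-   its doublewords reversed throughout, but the values reaching
-   memory are unchanged, so the computation is still correct.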
-
- This pass uses some of the infrastructure and logical ideas from
- the "web" pass in web.c. We create maximal webs of computations
- fitting the description above using union-find. Each such web is
- then optimized by removing its unnecessary xxswapdi instructions.
-
- The pass is placed prior to global optimization so that we can
- perform the optimization in the safest and simplest way possible;
- that is, by replacing each xxswapdi insn with a register copy insn.
- Subsequent forward propagation will remove copies where possible.
-
- There are some operations sensitive to element order for which we
- can still allow the operation, provided we modify those operations.
- These include CONST_VECTORs, for which we must swap the first and
- second halves of the constant vector; and SUBREGs, for which we
- must adjust the byte offset to account for the swapped doublewords.
- A remaining opportunity would be non-immediate-form splats, for
- which we should adjust the selected lane of the input. We should
- also make code generation adjustments for sum-across operations,
- since this is a common vectorizer reduction.
-
- Because we run prior to the first split, we can see loads and stores
- here that match *vsx_le_perm_{load,store}_<mode>. These are vanilla
- vector loads and stores that have not yet been split into a permuting
- load/store and a swap. (One way this can happen is with a builtin
- call to vec_vsx_{ld,st}.) We can handle these as well, but rather
- than deleting a swap, we convert the load/store into a permuting
- load/store (which effectively removes the swap). */
-
-/* Notes on Permutes
-
- We do not currently handle computations that contain permutes. There
- is a general transformation that can be performed correctly, but it
- may introduce more expensive code than it replaces. To handle these
- would require a cost model to determine when to perform the optimization.
- This commentary records how this could be done if desired.
-
- The most general permute is something like this (example for V16QI):
-
- (vec_select:V16QI (vec_concat:V32QI (op1:V16QI) (op2:V16QI))
- (parallel [(const_int a0) (const_int a1)
- ...
- (const_int a14) (const_int a15)]))
-
- where a0,...,a15 are in [0,31] and select elements from op1 and op2
- to produce in the result.
-
- Regardless of mode, we can convert the PARALLEL to a mask of 16
- byte-element selectors. Let's call this M, with M[i] representing
- the ith byte-element selector value. Then if we swap doublewords
- throughout the computation, we can get correct behavior by replacing
- M with M' as follows:
-
- M'[i] = { (M[i]+8)%16 : M[i] in [0,15]
- { ((M[i]+8)%16)+16 : M[i] in [16,31]
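-
-   For instance, applying this formula to the identity mask
-   M = {0,1,...,15}, which selects op1 unchanged, yields
-   M' = {8,9,...,15,0,1,...,7}, i.e. a doubleword swap of op1.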
-
- This seems promising at first, since we are just replacing one mask
- with another. But certain masks are preferable to others. If M
- is a mask that matches a vmrghh pattern, for example, M' certainly
- will not. Instead of a single vmrghh, we would generate a load of
- M' and a vperm. So we would need to know how many xxswapd's we can
- remove as a result of this transformation to determine if it's
- profitable; and preferably the logic would need to be aware of all
- the special preferable masks.
-
- Another form of permute is an UNSPEC_VPERM, in which the mask is
- already in a register. In some cases, this mask may be a constant
- that we can discover with ud-chains, in which case the above
- transformation is ok. However, the common usage here is for the
- mask to be produced by an UNSPEC_LVSL, in which case the mask
- cannot be known at compile time. In such a case we would have to
- generate several instructions to compute M' as above at run time,
- and a cost model is needed again.
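-
-   (At run time, M' could be computed from M as
-   M'[i] = (M[i] & 16) | ((M[i] + 8) & 15), a sketch requiring a
-   splat-immediate, a byte add, and a few logical operations, which
-   makes the cost concern concrete.)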
-
- However, when the mask M for an UNSPEC_VPERM is loaded from the
- constant pool, we can replace M with M' as above at no cost
- beyond adding a constant pool entry. */
-
-/* This is based on the union-find logic in web.c. web_entry_base is
- defined in df.h. */
-class swap_web_entry : public web_entry_base
-{
- public:
- /* Pointer to the insn. */
- rtx_insn *insn;
- /* Set if insn contains a mention of a vector register. All other
- fields are undefined if this field is unset. */
- unsigned int is_relevant : 1;
- /* Set if insn is a load. */
- unsigned int is_load : 1;
- /* Set if insn is a store. */
- unsigned int is_store : 1;
- /* Set if insn is a doubleword swap. This can either be a register swap
- or a permuting load or store (test is_load and is_store for this). */
- unsigned int is_swap : 1;
- /* Set if the insn has a live-in use of a parameter register. */
- unsigned int is_live_in : 1;
- /* Set if the insn has a live-out def of a return register. */
- unsigned int is_live_out : 1;
- /* Set if the insn contains a subreg reference of a vector register. */
- unsigned int contains_subreg : 1;
- /* Set if the insn contains a 128-bit integer operand. */
- unsigned int is_128_int : 1;
- /* Set if this is a call-insn. */
- unsigned int is_call : 1;
- /* Set if this insn does not perform a vector operation for which
- element order matters, or if we know how to fix it up if it does.
- Undefined if is_swap is set. */
- unsigned int is_swappable : 1;
- /* A nonzero value indicates what kind of special handling for this
- insn is required if doublewords are swapped. Undefined if
- is_swappable is not set. */
- unsigned int special_handling : 4;
- /* Set if the web represented by this entry cannot be optimized. */
- unsigned int web_not_optimizable : 1;
- /* Set if this insn should be deleted. */
- unsigned int will_delete : 1;
-};
-
-enum special_handling_values {
- SH_NONE = 0,
- SH_CONST_VECTOR,
- SH_SUBREG,
- SH_NOSWAP_LD,
- SH_NOSWAP_ST,
- SH_EXTRACT,
- SH_SPLAT,
- SH_XXPERMDI,
- SH_CONCAT,
- SH_VPERM
-};
-
-/* Union INSN with all insns containing definitions that reach USE.
- Detect whether USE is live-in to the current function. */
-static void
-union_defs (swap_web_entry *insn_entry, rtx insn, df_ref use)
-{
- struct df_link *link = DF_REF_CHAIN (use);
-
- if (!link)
- insn_entry[INSN_UID (insn)].is_live_in = 1;
-
- while (link)
- {
- if (DF_REF_IS_ARTIFICIAL (link->ref))
- insn_entry[INSN_UID (insn)].is_live_in = 1;
-
- if (DF_REF_INSN_INFO (link->ref))
- {
- rtx def_insn = DF_REF_INSN (link->ref);
- (void)unionfind_union (insn_entry + INSN_UID (insn),
- insn_entry + INSN_UID (def_insn));
- }
-
- link = link->next;
- }
-}
-
-/* Union INSN with all insns containing uses reached from DEF.
- Detect whether DEF is live-out from the current function. */
-static void
-union_uses (swap_web_entry *insn_entry, rtx insn, df_ref def)
-{
- struct df_link *link = DF_REF_CHAIN (def);
-
- if (!link)
- insn_entry[INSN_UID (insn)].is_live_out = 1;
-
- while (link)
- {
- /* This could be an eh use or some other artificial use;
- we treat these all the same (killing the optimization). */
- if (DF_REF_IS_ARTIFICIAL (link->ref))
- insn_entry[INSN_UID (insn)].is_live_out = 1;
-
- if (DF_REF_INSN_INFO (link->ref))
- {
- rtx use_insn = DF_REF_INSN (link->ref);
- (void)unionfind_union (insn_entry + INSN_UID (insn),
- insn_entry + INSN_UID (use_insn));
- }
-
- link = link->next;
- }
-}
-
-/* Return 1 iff INSN is a load insn, including permuting loads that
-   represent an lxvd2x instruction; else return 0. */
-static unsigned int
-insn_is_load_p (rtx insn)
-{
- rtx body = PATTERN (insn);
-
- if (GET_CODE (body) == SET)
- {
- if (GET_CODE (SET_SRC (body)) == MEM)
- return 1;
-
- if (GET_CODE (SET_SRC (body)) == VEC_SELECT
- && GET_CODE (XEXP (SET_SRC (body), 0)) == MEM)
- return 1;
-
- return 0;
- }
-
- if (GET_CODE (body) != PARALLEL)
- return 0;
-
- rtx set = XVECEXP (body, 0, 0);
-
- if (GET_CODE (set) == SET && GET_CODE (SET_SRC (set)) == MEM)
- return 1;
-
- return 0;
-}
-
-/* Return 1 iff INSN is a store insn, including permuting stores that
-   represent an stxvd2x instruction; else return 0. */
-static unsigned int
-insn_is_store_p (rtx insn)
-{
- rtx body = PATTERN (insn);
- if (GET_CODE (body) == SET && GET_CODE (SET_DEST (body)) == MEM)
- return 1;
- if (GET_CODE (body) != PARALLEL)
- return 0;
- rtx set = XVECEXP (body, 0, 0);
- if (GET_CODE (set) == SET && GET_CODE (SET_DEST (set)) == MEM)
- return 1;
- return 0;
-}
-
-/* Return 1 iff INSN swaps doublewords. This may be a reg-reg swap,
- a permuting load, or a permuting store. */
-static unsigned int
-insn_is_swap_p (rtx insn)
-{
- rtx body = PATTERN (insn);
- if (GET_CODE (body) != SET)
- return 0;
- rtx rhs = SET_SRC (body);
- if (GET_CODE (rhs) != VEC_SELECT)
- return 0;
- rtx parallel = XEXP (rhs, 1);
- if (GET_CODE (parallel) != PARALLEL)
- return 0;
- unsigned int len = XVECLEN (parallel, 0);
- if (len != 2 && len != 4 && len != 8 && len != 16)
- return 0;
- for (unsigned int i = 0; i < len / 2; ++i)
- {
- rtx op = XVECEXP (parallel, 0, i);
- if (GET_CODE (op) != CONST_INT || INTVAL (op) != len / 2 + i)
- return 0;
- }
- for (unsigned int i = len / 2; i < len; ++i)
- {
- rtx op = XVECEXP (parallel, 0, i);
- if (GET_CODE (op) != CONST_INT || INTVAL (op) != i - len / 2)
- return 0;
- }
- return 1;
-}
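-
-/* As an example of the shape insn_is_swap_p accepts, a doubleword
-   swap of a V4SI register looks like this (the register numbers
-   are arbitrary):
-
-     (set (reg:V4SI 65)
-          (vec_select:V4SI (reg:V4SI 64)
-                           (parallel [(const_int 2) (const_int 3)
-                                      (const_int 0) (const_int 1)])))
-
-   With len == 4, the first two selectors are len/2 + i = {2, 3}
-   and the last two are i - len/2 = {0, 1}, exactly the checks
-   made above.  */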
-
-/* Return TRUE if INSN is a register swap fed by a permuting load
-   from the constant pool. */
-static bool
-const_load_sequence_p (swap_web_entry *insn_entry, rtx insn)
-{
- unsigned uid = INSN_UID (insn);
- if (!insn_entry[uid].is_swap || insn_entry[uid].is_load)
- return false;
-
- const_rtx tocrel_base;
-
- /* Find the unique use in the swap and locate its def. If the def
- isn't unique, punt. */
- struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
- df_ref use;
- FOR_EACH_INSN_INFO_USE (use, insn_info)
- {
- struct df_link *def_link = DF_REF_CHAIN (use);
- if (!def_link || def_link->next)
- return false;
-
- rtx def_insn = DF_REF_INSN (def_link->ref);
- unsigned uid2 = INSN_UID (def_insn);
- if (!insn_entry[uid2].is_load || !insn_entry[uid2].is_swap)
- return false;
-
- rtx body = PATTERN (def_insn);
- if (GET_CODE (body) != SET
- || GET_CODE (SET_SRC (body)) != VEC_SELECT
- || GET_CODE (XEXP (SET_SRC (body), 0)) != MEM)
- return false;
-
- rtx mem = XEXP (SET_SRC (body), 0);
- rtx base_reg = XEXP (mem, 0);
-
- df_ref base_use;
- insn_info = DF_INSN_INFO_GET (def_insn);
- FOR_EACH_INSN_INFO_USE (base_use, insn_info)
- {
- if (!rtx_equal_p (DF_REF_REG (base_use), base_reg))
- continue;
-
- struct df_link *base_def_link = DF_REF_CHAIN (base_use);
- if (!base_def_link || base_def_link->next)
- return false;
-
- rtx tocrel_insn = DF_REF_INSN (base_def_link->ref);
- rtx tocrel_body = PATTERN (tocrel_insn);
- rtx base, offset;
- if (GET_CODE (tocrel_body) != SET)
- return false;
- /* There is an extra level of indirection for small/large
- code models. */
- rtx tocrel_expr = SET_SRC (tocrel_body);
- if (GET_CODE (tocrel_expr) == MEM)
- tocrel_expr = XEXP (tocrel_expr, 0);
- if (!toc_relative_expr_p (tocrel_expr, false, &tocrel_base, NULL))
- return false;
- split_const (XVECEXP (tocrel_base, 0, 0), &base, &offset);
- if (GET_CODE (base) != SYMBOL_REF || !CONSTANT_POOL_ADDRESS_P (base))
- return false;
- }
- }
- return true;
-}
-
-/* Return TRUE iff OP matches a V2DF reduction pattern. See the
- definition of vsx_reduc_<VEC_reduc_name>_v2df in vsx.md. */
-static bool
-v2df_reduction_p (rtx op)
-{
- if (GET_MODE (op) != V2DFmode)
- return false;
-
- enum rtx_code code = GET_CODE (op);
- if (code != PLUS && code != SMIN && code != SMAX)
- return false;
-
- rtx concat = XEXP (op, 0);
- if (GET_CODE (concat) != VEC_CONCAT)
- return false;
-
- rtx select0 = XEXP (concat, 0);
- rtx select1 = XEXP (concat, 1);
- if (GET_CODE (select0) != VEC_SELECT || GET_CODE (select1) != VEC_SELECT)
- return false;
-
- rtx reg0 = XEXP (select0, 0);
- rtx reg1 = XEXP (select1, 0);
- if (!rtx_equal_p (reg0, reg1) || !REG_P (reg0))
- return false;
-
- rtx parallel0 = XEXP (select0, 1);
- rtx parallel1 = XEXP (select1, 1);
- if (GET_CODE (parallel0) != PARALLEL || GET_CODE (parallel1) != PARALLEL)
- return false;
-
- if (!rtx_equal_p (XVECEXP (parallel0, 0, 0), const1_rtx)
- || !rtx_equal_p (XVECEXP (parallel1, 0, 0), const0_rtx))
- return false;
-
- return true;
-}
-
-/* Return 1 iff OP is an operand that will not be affected by having
- vector doublewords swapped in memory. */
-static unsigned int
-rtx_is_swappable_p (rtx op, unsigned int *special)
-{
- enum rtx_code code = GET_CODE (op);
- int i, j;
- rtx parallel;
-
- switch (code)
- {
- case LABEL_REF:
- case SYMBOL_REF:
- case CLOBBER:
- case REG:
- return 1;
-
- case VEC_CONCAT:
- case ASM_INPUT:
- case ASM_OPERANDS:
- return 0;
-
- case CONST_VECTOR:
- {
- *special = SH_CONST_VECTOR;
- return 1;
- }
-
- case VEC_DUPLICATE:
- /* Opportunity: If XEXP (op, 0) has the same mode as the result,
- and XEXP (op, 1) is a PARALLEL with a single QImode const int,
- it represents a vector splat for which we can do special
- handling. */
- if (GET_CODE (XEXP (op, 0)) == CONST_INT)
- return 1;
- else if (REG_P (XEXP (op, 0))
- && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0)))
- /* This catches V2DF and V2DI splat, at a minimum. */
- return 1;
- else if (GET_CODE (XEXP (op, 0)) == TRUNCATE
- && REG_P (XEXP (XEXP (op, 0), 0))
- && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0)))
- /* This catches splat of a truncated value. */
- return 1;
- else if (GET_CODE (XEXP (op, 0)) == VEC_SELECT)
- /* If the duplicated item is from a select, defer to the select
- processing to see if we can change the lane for the splat. */
- return rtx_is_swappable_p (XEXP (op, 0), special);
- else
- return 0;
-
- case VEC_SELECT:
- /* A vec_extract operation is ok if we change the lane. */
- if (GET_CODE (XEXP (op, 0)) == REG
- && GET_MODE_INNER (GET_MODE (XEXP (op, 0))) == GET_MODE (op)
- && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL
- && XVECLEN (parallel, 0) == 1
- && GET_CODE (XVECEXP (parallel, 0, 0)) == CONST_INT)
- {
- *special = SH_EXTRACT;
- return 1;
- }
- /* An XXPERMDI is ok if we adjust the lanes. Note that if the
- XXPERMDI is a swap operation, it will be identified by
- insn_is_swap_p and therefore we won't get here. */
- else if (GET_CODE (XEXP (op, 0)) == VEC_CONCAT
- && (GET_MODE (XEXP (op, 0)) == V4DFmode
- || GET_MODE (XEXP (op, 0)) == V4DImode)
- && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL
- && XVECLEN (parallel, 0) == 2
- && GET_CODE (XVECEXP (parallel, 0, 0)) == CONST_INT
- && GET_CODE (XVECEXP (parallel, 0, 1)) == CONST_INT)
- {
- *special = SH_XXPERMDI;
- return 1;
- }
- else if (v2df_reduction_p (op))
- return 1;
- else
- return 0;
-
- case UNSPEC:
- {
- /* Various operations are unsafe for this optimization, at least
- without significant additional work. Permutes are obviously
- problematic, as both the permute control vector and the ordering
- of the target values are invalidated by doubleword swapping.
- Vector pack and unpack modify the number of vector lanes.
- Merge-high/low will not operate correctly on swapped operands.
- Vector shifts across element boundaries are clearly uncool,
- as are vector select and concatenate operations. Vector
- sum-across instructions define one operand with a specific
- order-dependent element, so additional fixup code would be
- needed to make those work. Vector set and non-immediate-form
- vector splat are element-order sensitive. A few of these
- cases might be workable with special handling if required.
- Adding cost modeling would be appropriate in some cases. */
- int val = XINT (op, 1);
- switch (val)
- {
- default:
- break;
- case UNSPEC_VMRGH_DIRECT:
- case UNSPEC_VMRGL_DIRECT:
- case UNSPEC_VPACK_SIGN_SIGN_SAT:
- case UNSPEC_VPACK_SIGN_UNS_SAT:
- case UNSPEC_VPACK_UNS_UNS_MOD:
- case UNSPEC_VPACK_UNS_UNS_MOD_DIRECT:
- case UNSPEC_VPACK_UNS_UNS_SAT:
- case UNSPEC_VPERM:
- case UNSPEC_VPERM_UNS:
- case UNSPEC_VPERMHI:
- case UNSPEC_VPERMSI:
- case UNSPEC_VPKPX:
- case UNSPEC_VSLDOI:
- case UNSPEC_VSLO:
- case UNSPEC_VSRO:
- case UNSPEC_VSUM2SWS:
- case UNSPEC_VSUM4S:
- case UNSPEC_VSUM4UBS:
- case UNSPEC_VSUMSWS:
- case UNSPEC_VSUMSWS_DIRECT:
- case UNSPEC_VSX_CONCAT:
- case UNSPEC_VSX_SET:
- case UNSPEC_VSX_SLDWI:
- case UNSPEC_VUNPACK_HI_SIGN:
- case UNSPEC_VUNPACK_HI_SIGN_DIRECT:
- case UNSPEC_VUNPACK_LO_SIGN:
- case UNSPEC_VUNPACK_LO_SIGN_DIRECT:
- case UNSPEC_VUPKHPX:
- case UNSPEC_VUPKHS_V4SF:
- case UNSPEC_VUPKHU_V4SF:
- case UNSPEC_VUPKLPX:
- case UNSPEC_VUPKLS_V4SF:
- case UNSPEC_VUPKLU_V4SF:
- case UNSPEC_VSX_CVDPSPN:
- case UNSPEC_VSX_CVSPDP:
- case UNSPEC_VSX_CVSPDPN:
- case UNSPEC_VSX_EXTRACT:
- case UNSPEC_VSX_VSLO:
- case UNSPEC_VSX_VEC_INIT:
- return 0;
- case UNSPEC_VSPLT_DIRECT:
- case UNSPEC_VSX_XXSPLTD:
- *special = SH_SPLAT;
- return 1;
- case UNSPEC_REDUC_PLUS:
- case UNSPEC_REDUC:
- return 1;
- }
- }
-
- default:
- break;
- }
-
- const char *fmt = GET_RTX_FORMAT (code);
- int ok = 1;
-
- for (i = 0; i < GET_RTX_LENGTH (code); ++i)
- if (fmt[i] == 'e' || fmt[i] == 'u')
- {
- unsigned int special_op = SH_NONE;
- ok &= rtx_is_swappable_p (XEXP (op, i), &special_op);
- if (special_op == SH_NONE)
- continue;
- /* Ensure we never have two kinds of special handling
- for the same insn. */
- if (*special != SH_NONE && *special != special_op)
- return 0;
- *special = special_op;
- }
- else if (fmt[i] == 'E')
- for (j = 0; j < XVECLEN (op, i); ++j)
- {
- unsigned int special_op = SH_NONE;
- ok &= rtx_is_swappable_p (XVECEXP (op, i, j), &special_op);
- if (special_op == SH_NONE)
- continue;
- /* Ensure we never have two kinds of special handling
- for the same insn. */
- if (*special != SH_NONE && *special != special_op)
- return 0;
- *special = special_op;
- }
-
- return ok;
-}
-
-/* Return 1 iff INSN is an insn that will not be affected by
- having vector doublewords swapped in memory (in which case
- *SPECIAL is unchanged), or that can be modified to be correct
- if vector doublewords are swapped in memory (in which case
- *SPECIAL is changed to a value indicating how). */
-static unsigned int
-insn_is_swappable_p (swap_web_entry *insn_entry, rtx insn,
- unsigned int *special)
-{
- /* Calls are always bad. */
- if (GET_CODE (insn) == CALL_INSN)
- return 0;
-
- /* Loads and stores seen here are not permuting, but we can still
- fix them up by converting them to permuting ones. Exceptions:
- UNSPEC_LVE, UNSPEC_LVX, and UNSPEC_STVX, which have a PARALLEL
- body instead of a SET; and UNSPEC_STVE, which has an UNSPEC
- for the SET source. Also we must now make an exception for lvx
- and stvx when they are not in the UNSPEC_LVX/STVX form (with the
- explicit "& -16") since this leads to unrecognizable insns. */
- rtx body = PATTERN (insn);
- int i = INSN_UID (insn);
-
- if (insn_entry[i].is_load)
- {
- if (GET_CODE (body) == SET)
- {
- rtx rhs = SET_SRC (body);
- /* Even without a swap, the RHS might be a vec_select for, say,
- a byte-reversing load. */
- if (GET_CODE (rhs) != MEM)
- return 0;
- if (GET_CODE (XEXP (rhs, 0)) == AND)
- return 0;
-
- *special = SH_NOSWAP_LD;
- return 1;
- }
- else
- return 0;
- }
-
- if (insn_entry[i].is_store)
- {
- if (GET_CODE (body) == SET
- && GET_CODE (SET_SRC (body)) != UNSPEC)
- {
- rtx lhs = SET_DEST (body);
- /* Even without a swap, the LHS might be a vec_select for, say,
- a byte-reversing store. */
- if (GET_CODE (lhs) != MEM)
- return 0;
- if (GET_CODE (XEXP (lhs, 0)) == AND)
- return 0;
-
- *special = SH_NOSWAP_ST;
- return 1;
- }
- else
- return 0;
- }
-
- /* A convert to single precision can be left as is provided that
- all of its uses are in xxspltw instructions that splat BE element
- zero. */
- if (GET_CODE (body) == SET
- && GET_CODE (SET_SRC (body)) == UNSPEC
- && XINT (SET_SRC (body), 1) == UNSPEC_VSX_CVDPSPN)
- {
- df_ref def;
- struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
-
- FOR_EACH_INSN_INFO_DEF (def, insn_info)
- {
- struct df_link *link = DF_REF_CHAIN (def);
- if (!link)
- return 0;
-
-	  for (; link; link = link->next)
-	    {
-	      rtx use_insn = DF_REF_INSN (link->ref);
-	      rtx use_body = PATTERN (use_insn);
-	      if (GET_CODE (use_body) != SET
-		  || GET_CODE (SET_SRC (use_body)) != UNSPEC
-		  || XINT (SET_SRC (use_body), 1) != UNSPEC_VSX_XXSPLTW
-		  || XVECEXP (SET_SRC (use_body), 0, 1) != const0_rtx)
-		return 0;
-	    }
- }
-
- return 1;
- }
-
- /* A concatenation of two doublewords is ok if we reverse the
- order of the inputs. */
- if (GET_CODE (body) == SET
- && GET_CODE (SET_SRC (body)) == VEC_CONCAT
- && (GET_MODE (SET_SRC (body)) == V2DFmode
- || GET_MODE (SET_SRC (body)) == V2DImode))
- {
- *special = SH_CONCAT;
- return 1;
- }
-
- /* V2DF reductions are always swappable. */
- if (GET_CODE (body) == PARALLEL)
- {
- rtx expr = XVECEXP (body, 0, 0);
- if (GET_CODE (expr) == SET
- && v2df_reduction_p (SET_SRC (expr)))
- return 1;
- }
-
- /* An UNSPEC_VPERM is ok if the mask operand is loaded from the
- constant pool. */
- if (GET_CODE (body) == SET
- && GET_CODE (SET_SRC (body)) == UNSPEC
- && XINT (SET_SRC (body), 1) == UNSPEC_VPERM
- && XVECLEN (SET_SRC (body), 0) == 3
- && GET_CODE (XVECEXP (SET_SRC (body), 0, 2)) == REG)
- {
- rtx mask_reg = XVECEXP (SET_SRC (body), 0, 2);
- struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
- df_ref use;
- FOR_EACH_INSN_INFO_USE (use, insn_info)
- if (rtx_equal_p (DF_REF_REG (use), mask_reg))
- {
- struct df_link *def_link = DF_REF_CHAIN (use);
- /* Punt if multiple definitions for this reg. */
-	if (def_link && !def_link->next
-	    && const_load_sequence_p (insn_entry,
-				      DF_REF_INSN (def_link->ref)))
- {
- *special = SH_VPERM;
- return 1;
- }
- }
- }
-
- /* Otherwise check the operands for vector lane violations. */
- return rtx_is_swappable_p (body, special);
-}
-
-enum chain_purpose { FOR_LOADS, FOR_STORES };
-
-/* Return true if the UD or DU chain headed by LINK is non-empty,
- and every entry on the chain references an insn that is a
- register swap. Furthermore, if PURPOSE is FOR_LOADS, each such
- register swap must have only permuting loads as reaching defs.
- If PURPOSE is FOR_STORES, each such register swap must have only
- register swaps or permuting stores as reached uses. */
-static bool
-chain_contains_only_swaps (swap_web_entry *insn_entry, struct df_link *link,
- enum chain_purpose purpose)
-{
- if (!link)
- return false;
-
- for (; link; link = link->next)
- {
- if (!ALTIVEC_OR_VSX_VECTOR_MODE (GET_MODE (DF_REF_REG (link->ref))))
- continue;
-
- if (DF_REF_IS_ARTIFICIAL (link->ref))
- return false;
-
- rtx reached_insn = DF_REF_INSN (link->ref);
- unsigned uid = INSN_UID (reached_insn);
- struct df_insn_info *insn_info = DF_INSN_INFO_GET (reached_insn);
-
- if (!insn_entry[uid].is_swap || insn_entry[uid].is_load
- || insn_entry[uid].is_store)
- return false;
-
- if (purpose == FOR_LOADS)
- {
- df_ref use;
- FOR_EACH_INSN_INFO_USE (use, insn_info)
- {
- struct df_link *swap_link = DF_REF_CHAIN (use);
-
- while (swap_link)
- {
-		  if (DF_REF_IS_ARTIFICIAL (swap_link->ref))
- return false;
-
- rtx swap_def_insn = DF_REF_INSN (swap_link->ref);
- unsigned uid2 = INSN_UID (swap_def_insn);
-
- /* Only permuting loads are allowed. */
- if (!insn_entry[uid2].is_swap || !insn_entry[uid2].is_load)
- return false;
-
- swap_link = swap_link->next;
- }
- }
- }
- else if (purpose == FOR_STORES)
- {
- df_ref def;
- FOR_EACH_INSN_INFO_DEF (def, insn_info)
- {
- struct df_link *swap_link = DF_REF_CHAIN (def);
-
- while (swap_link)
- {
-		  if (DF_REF_IS_ARTIFICIAL (swap_link->ref))
- return false;
-
- rtx swap_use_insn = DF_REF_INSN (swap_link->ref);
- unsigned uid2 = INSN_UID (swap_use_insn);
-
- /* Permuting stores or register swaps are allowed. */
- if (!insn_entry[uid2].is_swap || insn_entry[uid2].is_load)
- return false;
-
- swap_link = swap_link->next;
- }
- }
- }
- }
-
- return true;
-}
-
-/* Mark the xxswapdi instructions associated with permuting loads and
- stores for removal. Note that we only flag them for deletion here,
- as there is a possibility of a swap being reached from multiple
- loads, etc. */
-static void
-mark_swaps_for_removal (swap_web_entry *insn_entry, unsigned int i)
-{
- rtx insn = insn_entry[i].insn;
- struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
-
- if (insn_entry[i].is_load)
- {
- df_ref def;
- FOR_EACH_INSN_INFO_DEF (def, insn_info)
- {
- struct df_link *link = DF_REF_CHAIN (def);
-
- /* We know by now that these are swaps, so we can delete
- them confidently. */
- while (link)
- {
- rtx use_insn = DF_REF_INSN (link->ref);
- insn_entry[INSN_UID (use_insn)].will_delete = 1;
- link = link->next;
- }
- }
- }
- else if (insn_entry[i].is_store)
- {
- df_ref use;
- FOR_EACH_INSN_INFO_USE (use, insn_info)
- {
- /* Ignore uses for addressability. */
- machine_mode mode = GET_MODE (DF_REF_REG (use));
- if (!ALTIVEC_OR_VSX_VECTOR_MODE (mode))
- continue;
-
- struct df_link *link = DF_REF_CHAIN (use);
-
- /* We know by now that these are swaps, so we can delete
- them confidently. */
- while (link)
- {
- rtx def_insn = DF_REF_INSN (link->ref);
- insn_entry[INSN_UID (def_insn)].will_delete = 1;
- link = link->next;
- }
- }
- }
-}
-
-/* OP is either a CONST_VECTOR or an expression containing one.
- Swap the first half of the vector with the second in the first
- case. Recurse to find it in the second. */
-static void
-swap_const_vector_halves (rtx op)
-{
- int i;
- enum rtx_code code = GET_CODE (op);
- if (GET_CODE (op) == CONST_VECTOR)
- {
- int half_units = GET_MODE_NUNITS (GET_MODE (op)) / 2;
- for (i = 0; i < half_units; ++i)
- {
- rtx temp = CONST_VECTOR_ELT (op, i);
- CONST_VECTOR_ELT (op, i) = CONST_VECTOR_ELT (op, i + half_units);
- CONST_VECTOR_ELT (op, i + half_units) = temp;
- }
- }
- else
- {
- int j;
- const char *fmt = GET_RTX_FORMAT (code);
- for (i = 0; i < GET_RTX_LENGTH (code); ++i)
- if (fmt[i] == 'e' || fmt[i] == 'u')
- swap_const_vector_halves (XEXP (op, i));
- else if (fmt[i] == 'E')
- for (j = 0; j < XVECLEN (op, i); ++j)
- swap_const_vector_halves (XVECEXP (op, i, j));
- }
-}
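-
-/* For example, with half_units == 2 the V4SI constant
-   { 1, 2, 3, 4 } becomes { 3, 4, 1, 2 }: elements 0 and 1 are
-   exchanged with elements 2 and 3, one full doubleword with the
-   other.  */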
-
-/* Find all subregs of a vector expression that perform a narrowing,
- and adjust the subreg index to account for doubleword swapping. */
-static void
-adjust_subreg_index (rtx op)
-{
- enum rtx_code code = GET_CODE (op);
- if (code == SUBREG
- && (GET_MODE_SIZE (GET_MODE (op))
- < GET_MODE_SIZE (GET_MODE (XEXP (op, 0)))))
- {
- unsigned int index = SUBREG_BYTE (op);
- if (index < 8)
- index += 8;
- else
- index -= 8;
- SUBREG_BYTE (op) = index;
- }
-
- const char *fmt = GET_RTX_FORMAT (code);
- int i,j;
- for (i = 0; i < GET_RTX_LENGTH (code); ++i)
- if (fmt[i] == 'e' || fmt[i] == 'u')
- adjust_subreg_index (XEXP (op, i));
- else if (fmt[i] == 'E')
- for (j = 0; j < XVECLEN (op, i); ++j)
- adjust_subreg_index (XVECEXP (op, i, j));
-}
-
-/* Convert the non-permuting load INSN to a permuting one. */
-static void
-permute_load (rtx_insn *insn)
-{
- rtx body = PATTERN (insn);
- rtx mem_op = SET_SRC (body);
- rtx tgt_reg = SET_DEST (body);
- machine_mode mode = GET_MODE (tgt_reg);
- int n_elts = GET_MODE_NUNITS (mode);
- int half_elts = n_elts / 2;
- rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts));
- int i, j;
- for (i = 0, j = half_elts; i < half_elts; ++i, ++j)
- XVECEXP (par, 0, i) = GEN_INT (j);
- for (i = half_elts, j = 0; j < half_elts; ++i, ++j)
- XVECEXP (par, 0, i) = GEN_INT (j);
- rtx sel = gen_rtx_VEC_SELECT (mode, mem_op, par);
- SET_SRC (body) = sel;
- INSN_CODE (insn) = -1; /* Force re-recognition. */
- df_insn_rescan (insn);
-
- if (dump_file)
- fprintf (dump_file, "Replacing load %d with permuted load\n",
- INSN_UID (insn));
-}
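-
-/* For a V4SI load, the PARALLEL built above is
-   [(const_int 2) (const_int 3) (const_int 0) (const_int 1)], so
-   the resulting vec_select satisfies insn_is_swap_p and the insn
-   is now recognized as a permuting (lxvd2x-style) load.  */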
-
-/* Convert the non-permuting store INSN to a permuting one. */
-static void
-permute_store (rtx_insn *insn)
-{
- rtx body = PATTERN (insn);
- rtx src_reg = SET_SRC (body);
- machine_mode mode = GET_MODE (src_reg);
- int n_elts = GET_MODE_NUNITS (mode);
- int half_elts = n_elts / 2;
- rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts));
- int i, j;
- for (i = 0, j = half_elts; i < half_elts; ++i, ++j)
- XVECEXP (par, 0, i) = GEN_INT (j);
- for (i = half_elts, j = 0; j < half_elts; ++i, ++j)
- XVECEXP (par, 0, i) = GEN_INT (j);
- rtx sel = gen_rtx_VEC_SELECT (mode, src_reg, par);
- SET_SRC (body) = sel;
- INSN_CODE (insn) = -1; /* Force re-recognition. */
- df_insn_rescan (insn);
-
- if (dump_file)
- fprintf (dump_file, "Replacing store %d with permuted store\n",
- INSN_UID (insn));
-}
-
-/* Given INSN that contains a vector extract operation, adjust the index
- of the extracted lane to account for the doubleword swap. */
-static void
-adjust_extract (rtx_insn *insn)
-{
- rtx pattern = PATTERN (insn);
- if (GET_CODE (pattern) == PARALLEL)
- pattern = XVECEXP (pattern, 0, 0);
- rtx src = SET_SRC (pattern);
- /* The vec_select may be wrapped in a vec_duplicate for a splat, so
- account for that. */
- rtx sel = GET_CODE (src) == VEC_DUPLICATE ? XEXP (src, 0) : src;
- rtx par = XEXP (sel, 1);
- int half_elts = GET_MODE_NUNITS (GET_MODE (XEXP (sel, 0))) >> 1;
- int lane = INTVAL (XVECEXP (par, 0, 0));
- lane = lane >= half_elts ? lane - half_elts : lane + half_elts;
- XVECEXP (par, 0, 0) = GEN_INT (lane);
- INSN_CODE (insn) = -1; /* Force re-recognition. */
- df_insn_rescan (insn);
-
- if (dump_file)
- fprintf (dump_file, "Changing lane for extract %d\n", INSN_UID (insn));
-}
-
-/* Given INSN that contains a vector direct-splat operation, adjust the index
- of the source lane to account for the doubleword swap. */
-static void
-adjust_splat (rtx_insn *insn)
-{
- rtx body = PATTERN (insn);
- rtx unspec = XEXP (body, 1);
- int half_elts = GET_MODE_NUNITS (GET_MODE (unspec)) >> 1;
- int lane = INTVAL (XVECEXP (unspec, 0, 1));
- lane = lane >= half_elts ? lane - half_elts : lane + half_elts;
- XVECEXP (unspec, 0, 1) = GEN_INT (lane);
- INSN_CODE (insn) = -1; /* Force re-recognition. */
- df_insn_rescan (insn);
-
- if (dump_file)
- fprintf (dump_file, "Changing lane for splat %d\n", INSN_UID (insn));
-}
-
-/* Given INSN that contains an XXPERMDI operation (that is not a doubleword
- swap), reverse the order of the source operands and adjust the indices
- of the source lanes to account for doubleword reversal. */
-static void
-adjust_xxpermdi (rtx_insn *insn)
-{
- rtx set = PATTERN (insn);
- rtx select = XEXP (set, 1);
- rtx concat = XEXP (select, 0);
- rtx src0 = XEXP (concat, 0);
- XEXP (concat, 0) = XEXP (concat, 1);
- XEXP (concat, 1) = src0;
- rtx parallel = XEXP (select, 1);
- int lane0 = INTVAL (XVECEXP (parallel, 0, 0));
- int lane1 = INTVAL (XVECEXP (parallel, 0, 1));
- int new_lane0 = 3 - lane1;
- int new_lane1 = 3 - lane0;
- XVECEXP (parallel, 0, 0) = GEN_INT (new_lane0);
- XVECEXP (parallel, 0, 1) = GEN_INT (new_lane1);
- INSN_CODE (insn) = -1; /* Force re-recognition. */
- df_insn_rescan (insn);
-
- if (dump_file)
- fprintf (dump_file, "Changing lanes for xxpermdi %d\n", INSN_UID (insn));
-}
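-
-/* A worked instance of the lane arithmetic above: an xxpermdi that
-   selected lanes {0, 2} of (vec_concat op1 op2) now selects lanes
-   {3 - 2, 3 - 0} = {1, 3} of (vec_concat op2 op1).  */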
-
-/* Given INSN that contains a VEC_CONCAT operation of two doublewords,
- reverse the order of those inputs. */
-static void
-adjust_concat (rtx_insn *insn)
-{
- rtx set = PATTERN (insn);
- rtx concat = XEXP (set, 1);
- rtx src0 = XEXP (concat, 0);
- XEXP (concat, 0) = XEXP (concat, 1);
- XEXP (concat, 1) = src0;
- INSN_CODE (insn) = -1; /* Force re-recognition. */
- df_insn_rescan (insn);
-
- if (dump_file)
- fprintf (dump_file, "Reversing inputs for concat %d\n", INSN_UID (insn));
-}
-
-/* Given an UNSPEC_VPERM insn, modify the mask loaded from the
- constant pool to reflect swapped doublewords. */
-static void
-adjust_vperm (rtx_insn *insn)
-{
- /* We previously determined that the UNSPEC_VPERM was fed by a
- swap of a swapping load of a TOC-relative constant pool symbol.
- Find the MEM in the swapping load and replace it with a MEM for
- the adjusted mask constant. */
- rtx set = PATTERN (insn);
- rtx mask_reg = XVECEXP (SET_SRC (set), 0, 2);
-
- /* Find the swap. */
- struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
- df_ref use;
- rtx_insn *swap_insn = 0;
- FOR_EACH_INSN_INFO_USE (use, insn_info)
- if (rtx_equal_p (DF_REF_REG (use), mask_reg))
- {
- struct df_link *def_link = DF_REF_CHAIN (use);
- gcc_assert (def_link && !def_link->next);
- swap_insn = DF_REF_INSN (def_link->ref);
- break;
- }
- gcc_assert (swap_insn);
-
- /* Find the load. */
- insn_info = DF_INSN_INFO_GET (swap_insn);
- rtx_insn *load_insn = 0;
- FOR_EACH_INSN_INFO_USE (use, insn_info)
- {
- struct df_link *def_link = DF_REF_CHAIN (use);
- gcc_assert (def_link && !def_link->next);
- load_insn = DF_REF_INSN (def_link->ref);
- break;
- }
- gcc_assert (load_insn);
-
- /* Find the TOC-relative symbol access. */
- insn_info = DF_INSN_INFO_GET (load_insn);
- rtx_insn *tocrel_insn = 0;
- FOR_EACH_INSN_INFO_USE (use, insn_info)
- {
- struct df_link *def_link = DF_REF_CHAIN (use);
- gcc_assert (def_link && !def_link->next);
- tocrel_insn = DF_REF_INSN (def_link->ref);
- break;
- }
- gcc_assert (tocrel_insn);
-
- /* Find the embedded CONST_VECTOR. We have to call toc_relative_expr_p
- to set tocrel_base; otherwise it would be unnecessary as we've
- already established it will return true. */
- rtx base, offset;
- const_rtx tocrel_base;
- rtx tocrel_expr = SET_SRC (PATTERN (tocrel_insn));
- /* There is an extra level of indirection for small/large code models. */
- if (GET_CODE (tocrel_expr) == MEM)
- tocrel_expr = XEXP (tocrel_expr, 0);
- if (!toc_relative_expr_p (tocrel_expr, false, &tocrel_base, NULL))
- gcc_unreachable ();
- split_const (XVECEXP (tocrel_base, 0, 0), &base, &offset);
- rtx const_vector = get_pool_constant (base);
- /* With the extra indirection, get_pool_constant will produce the
- real constant from the reg_equal expression, so get the real
- constant. */
- if (GET_CODE (const_vector) == SYMBOL_REF)
- const_vector = get_pool_constant (const_vector);
- gcc_assert (GET_CODE (const_vector) == CONST_VECTOR);
-
- /* Create an adjusted mask from the initial mask. */
- unsigned int new_mask[16], i, val;
-  for (i = 0; i < 16; ++i)
-    {
-      val = INTVAL (XVECEXP (const_vector, 0, i));
-      if (val < 16)
-	new_mask[i] = (val + 8) % 16;
-      else
-	new_mask[i] = ((val + 8) % 16) + 16;
-    }
-
- /* Create a new CONST_VECTOR and a MEM that references it. */
- rtx vals = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
- for (i = 0; i < 16; ++i)
- XVECEXP (vals, 0, i) = GEN_INT (new_mask[i]);
- rtx new_const_vector = gen_rtx_CONST_VECTOR (V16QImode, XVEC (vals, 0));
- rtx new_mem = force_const_mem (V16QImode, new_const_vector);
- /* This gives us a MEM whose base operand is a SYMBOL_REF, which we
- can't recognize. Force the SYMBOL_REF into a register. */
-  if (!REG_P (XEXP (new_mem, 0)))
-    {
-      rtx base_reg = force_reg (Pmode, XEXP (new_mem, 0));
-      XEXP (new_mem, 0) = base_reg;
-      /* Move the newly created insn ahead of the load insn.  */
-      rtx_insn *force_insn = get_last_insn ();
-      remove_insn (force_insn);
-      rtx_insn *before_load_insn = PREV_INSN (load_insn);
-      add_insn_after (force_insn, before_load_insn,
-		      BLOCK_FOR_INSN (load_insn));
-      df_insn_rescan (before_load_insn);
-      df_insn_rescan (force_insn);
-    }
-
- /* Replace the MEM in the load instruction and rescan it. */
- XEXP (SET_SRC (PATTERN (load_insn)), 0) = new_mem;
- INSN_CODE (load_insn) = -1; /* Force re-recognition. */
- df_insn_rescan (load_insn);
-
- if (dump_file)
- fprintf (dump_file, "Adjusting mask for vperm %d\n", INSN_UID (insn));
-}
-
-/* The insn described by INSN_ENTRY[I] can be swapped, but only
- with special handling. Take care of that here. */
-static void
-handle_special_swappables (swap_web_entry *insn_entry, unsigned i)
-{
- rtx_insn *insn = insn_entry[i].insn;
- rtx body = PATTERN (insn);
-
- switch (insn_entry[i].special_handling)
- {
- default:
- gcc_unreachable ();
- case SH_CONST_VECTOR:
- {
- /* A CONST_VECTOR will only show up somewhere in the RHS of a SET. */
- gcc_assert (GET_CODE (body) == SET);
- rtx rhs = SET_SRC (body);
- swap_const_vector_halves (rhs);
- if (dump_file)
- fprintf (dump_file, "Swapping constant halves in insn %d\n", i);
- break;
- }
- case SH_SUBREG:
- /* A subreg of the same size is already safe. For subregs that
- select a smaller portion of a reg, adjust the index for
- swapped doublewords. */
- adjust_subreg_index (body);
- if (dump_file)
- fprintf (dump_file, "Adjusting subreg in insn %d\n", i);
- break;
- case SH_NOSWAP_LD:
- /* Convert a non-permuting load to a permuting one. */
- permute_load (insn);
- break;
- case SH_NOSWAP_ST:
- /* Convert a non-permuting store to a permuting one. */
- permute_store (insn);
- break;
- case SH_EXTRACT:
- /* Change the lane on an extract operation. */
- adjust_extract (insn);
- break;
- case SH_SPLAT:
- /* Change the lane on a direct-splat operation. */
- adjust_splat (insn);
- break;
- case SH_XXPERMDI:
- /* Change the lanes on an XXPERMDI operation. */
- adjust_xxpermdi (insn);
- break;
- case SH_CONCAT:
- /* Reverse the order of a concatenation operation. */
- adjust_concat (insn);
- break;
- case SH_VPERM:
- /* Change the mask loaded from the constant pool for a VPERM. */
- adjust_vperm (insn);
- break;
- }
-}
-
-/* Find the insn from the Ith table entry, which is known to be a
- register swap Y = SWAP(X). Replace it with a copy Y = X. */
-static void
-replace_swap_with_copy (swap_web_entry *insn_entry, unsigned i)
-{
- rtx_insn *insn = insn_entry[i].insn;
- rtx body = PATTERN (insn);
- rtx src_reg = XEXP (SET_SRC (body), 0);
- rtx copy = gen_rtx_SET (SET_DEST (body), src_reg);
- rtx_insn *new_insn = emit_insn_before (copy, insn);
- set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn));
- df_insn_rescan (new_insn);
-
- if (dump_file)
- {
- unsigned int new_uid = INSN_UID (new_insn);
- fprintf (dump_file, "Replacing swap %d with copy %d\n", i, new_uid);
- }
-
- df_insn_delete (insn);
- remove_insn (insn);
- insn->set_deleted ();
-}
-
-/* Dump the swap table to DUMP_FILE. */
-static void
-dump_swap_insn_table (swap_web_entry *insn_entry)
-{
- int e = get_max_uid ();
- fprintf (dump_file, "\nRelevant insns with their flag settings\n\n");
-
- for (int i = 0; i < e; ++i)
- if (insn_entry[i].is_relevant)
- {
- swap_web_entry *pred_entry = (swap_web_entry *)insn_entry[i].pred ();
- fprintf (dump_file, "%6d %6d ", i,
- pred_entry && pred_entry->insn
- ? INSN_UID (pred_entry->insn) : 0);
- if (insn_entry[i].is_load)
- fputs ("load ", dump_file);
- if (insn_entry[i].is_store)
- fputs ("store ", dump_file);
- if (insn_entry[i].is_swap)
- fputs ("swap ", dump_file);
- if (insn_entry[i].is_live_in)
- fputs ("live-in ", dump_file);
- if (insn_entry[i].is_live_out)
- fputs ("live-out ", dump_file);
- if (insn_entry[i].contains_subreg)
- fputs ("subreg ", dump_file);
- if (insn_entry[i].is_128_int)
- fputs ("int128 ", dump_file);
- if (insn_entry[i].is_call)
- fputs ("call ", dump_file);
- if (insn_entry[i].is_swappable)
- {
- fputs ("swappable ", dump_file);
- if (insn_entry[i].special_handling == SH_CONST_VECTOR)
- fputs ("special:constvec ", dump_file);
- else if (insn_entry[i].special_handling == SH_SUBREG)
- fputs ("special:subreg ", dump_file);
- else if (insn_entry[i].special_handling == SH_NOSWAP_LD)
- fputs ("special:load ", dump_file);
- else if (insn_entry[i].special_handling == SH_NOSWAP_ST)
- fputs ("special:store ", dump_file);
- else if (insn_entry[i].special_handling == SH_EXTRACT)
- fputs ("special:extract ", dump_file);
- else if (insn_entry[i].special_handling == SH_SPLAT)
- fputs ("special:splat ", dump_file);
- else if (insn_entry[i].special_handling == SH_XXPERMDI)
- fputs ("special:xxpermdi ", dump_file);
- else if (insn_entry[i].special_handling == SH_CONCAT)
- fputs ("special:concat ", dump_file);
- else if (insn_entry[i].special_handling == SH_VPERM)
- fputs ("special:vperm ", dump_file);
- }
- if (insn_entry[i].web_not_optimizable)
- fputs ("unoptimizable ", dump_file);
- if (insn_entry[i].will_delete)
- fputs ("delete ", dump_file);
- fputs ("\n", dump_file);
- }
- fputs ("\n", dump_file);
-}
-
-/* Given ALIGN, an rtx of the form (and addr (const_int -16)), return
-   a new rtx for the same masking operation whose address is
-   canonicalized to (reg) or (plus reg reg).  Always return a new
-   copy to avoid problems with combine. */
-static rtx
-alignment_with_canonical_addr (rtx align)
-{
- rtx canon;
- rtx addr = XEXP (align, 0);
-
- if (REG_P (addr))
- canon = addr;
-
- else if (GET_CODE (addr) == PLUS)
- {
- rtx addrop0 = XEXP (addr, 0);
- rtx addrop1 = XEXP (addr, 1);
-
- if (!REG_P (addrop0))
- addrop0 = force_reg (GET_MODE (addrop0), addrop0);
-
- if (!REG_P (addrop1))
- addrop1 = force_reg (GET_MODE (addrop1), addrop1);
-
- canon = gen_rtx_PLUS (GET_MODE (addr), addrop0, addrop1);
- }
-
- else
- canon = force_reg (GET_MODE (addr), addr);
-
- return gen_rtx_AND (GET_MODE (align), canon, GEN_INT (-16));
-}
-
-/* Check whether an rtx is an alignment mask, and if so, return
- a fully-expanded rtx for the masking operation. */
-static rtx
-alignment_mask (rtx_insn *insn)
-{
- rtx body = PATTERN (insn);
-
- if (GET_CODE (body) != SET
- || GET_CODE (SET_SRC (body)) != AND
- || !REG_P (XEXP (SET_SRC (body), 0)))
- return 0;
-
- rtx mask = XEXP (SET_SRC (body), 1);
-
- if (GET_CODE (mask) == CONST_INT)
- {
- if (INTVAL (mask) == -16)
- return alignment_with_canonical_addr (SET_SRC (body));
- else
- return 0;
- }
-
- if (!REG_P (mask))
- return 0;
-
- struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
- df_ref use;
- rtx real_mask = 0;
-
- FOR_EACH_INSN_INFO_USE (use, insn_info)
- {
- if (!rtx_equal_p (DF_REF_REG (use), mask))
- continue;
-
- struct df_link *def_link = DF_REF_CHAIN (use);
- if (!def_link || def_link->next)
- return 0;
-
- rtx_insn *const_insn = DF_REF_INSN (def_link->ref);
- rtx const_body = PATTERN (const_insn);
- if (GET_CODE (const_body) != SET)
- return 0;
-
- real_mask = SET_SRC (const_body);
-
- if (GET_CODE (real_mask) != CONST_INT
- || INTVAL (real_mask) != -16)
- return 0;
- }
-
- if (real_mask == 0)
- return 0;
-
- return alignment_with_canonical_addr (SET_SRC (body));
-}
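-
-/* For example, given an insn (set (reg:DI 70) (and:DI (reg:DI 69)
-   (const_int -16))) (register numbers arbitrary), alignment_mask
-   returns a fresh copy of (and:DI (reg:DI 69) (const_int -16)).
-   The same result is produced when the -16 mask instead arrives in
-   a register whose unique reaching definition is (const_int -16).  */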
-
-/* Given INSN that's a load or store based at BASE_REG, look for a
- feeding computation that aligns its address on a 16-byte boundary. */
-static rtx
-find_alignment_op (rtx_insn *insn, rtx base_reg)
-{
- df_ref base_use;
- struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
- rtx and_operation = 0;
-
- FOR_EACH_INSN_INFO_USE (base_use, insn_info)
- {
- if (!rtx_equal_p (DF_REF_REG (base_use), base_reg))
- continue;
-
- struct df_link *base_def_link = DF_REF_CHAIN (base_use);
- if (!base_def_link || base_def_link->next)
- break;
-
- /* With stack-protector code enabled, and possibly in other
- circumstances, there may not be an associated insn for
- the def. */
- if (DF_REF_IS_ARTIFICIAL (base_def_link->ref))
- break;
-
- rtx_insn *and_insn = DF_REF_INSN (base_def_link->ref);
- and_operation = alignment_mask (and_insn);
- if (and_operation != 0)
- break;
- }
-
- return and_operation;
-}
-
-struct del_info { bool replace; rtx_insn *replace_insn; };
-
-/* If INSN is the load for an lvx pattern, put it in canonical form. */
-static void
-recombine_lvx_pattern (rtx_insn *insn, del_info *to_delete)
-{
- rtx body = PATTERN (insn);
- gcc_assert (GET_CODE (body) == SET
- && GET_CODE (SET_SRC (body)) == VEC_SELECT
- && GET_CODE (XEXP (SET_SRC (body), 0)) == MEM);
-
- rtx mem = XEXP (SET_SRC (body), 0);
- rtx base_reg = XEXP (mem, 0);
-
- rtx and_operation = find_alignment_op (insn, base_reg);
-
- if (and_operation != 0)
- {
- df_ref def;
- struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
- FOR_EACH_INSN_INFO_DEF (def, insn_info)
- {
- struct df_link *link = DF_REF_CHAIN (def);
- if (!link || link->next)
- break;
-
- rtx_insn *swap_insn = DF_REF_INSN (link->ref);
- if (!insn_is_swap_p (swap_insn)
- || insn_is_load_p (swap_insn)
- || insn_is_store_p (swap_insn))
- break;
-
- /* Expected lvx pattern found. Change the swap to
- a copy, and propagate the AND operation into the
- load. */
- to_delete[INSN_UID (swap_insn)].replace = true;
- to_delete[INSN_UID (swap_insn)].replace_insn = swap_insn;
-
- XEXP (mem, 0) = and_operation;
- SET_SRC (body) = mem;
- INSN_CODE (insn) = -1; /* Force re-recognition. */
- df_insn_rescan (insn);
-
- if (dump_file)
- fprintf (dump_file, "lvx opportunity found at %d\n",
- INSN_UID (insn));
- }
- }
-}
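-
-/* Schematically (register numbers arbitrary, PARALLEL selectors
-   abbreviated), a permuting load
-
-     (set (reg:V4SI A) (vec_select:V4SI (mem (reg B)) [2 3 0 1]))
-
-   whose base register B is defined by an alignment mask
-   (and X (const_int -16)) becomes
-
-     (set (reg:V4SI A) (mem (and X (const_int -16))))
-
-   and the register swap fed by A is later turned into a copy;
-   together these match the canonical lvx form.  */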
-
-/* If INSN is the store for an stvx pattern, put it in canonical form. */
-static void
-recombine_stvx_pattern (rtx_insn *insn, del_info *to_delete)
-{
- rtx body = PATTERN (insn);
- gcc_assert (GET_CODE (body) == SET
- && GET_CODE (SET_DEST (body)) == MEM
- && GET_CODE (SET_SRC (body)) == VEC_SELECT);
- rtx mem = SET_DEST (body);
- rtx base_reg = XEXP (mem, 0);
-
- rtx and_operation = find_alignment_op (insn, base_reg);
-
- if (and_operation != 0)
- {
- rtx src_reg = XEXP (SET_SRC (body), 0);
- df_ref src_use;
- struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
- FOR_EACH_INSN_INFO_USE (src_use, insn_info)
- {
- if (!rtx_equal_p (DF_REF_REG (src_use), src_reg))
- continue;
-
- struct df_link *link = DF_REF_CHAIN (src_use);
- if (!link || link->next)
- break;
-
- rtx_insn *swap_insn = DF_REF_INSN (link->ref);
- if (!insn_is_swap_p (swap_insn)
- || insn_is_load_p (swap_insn)
- || insn_is_store_p (swap_insn))
- break;
-
- /* Expected stvx pattern found. Change the swap to
- a copy, and propagate the AND operation into the
- store. */
- to_delete[INSN_UID (swap_insn)].replace = true;
- to_delete[INSN_UID (swap_insn)].replace_insn = swap_insn;
-
- XEXP (mem, 0) = and_operation;
- SET_SRC (body) = src_reg;
- INSN_CODE (insn) = -1; /* Force re-recognition. */
- df_insn_rescan (insn);
-
- if (dump_file)
- fprintf (dump_file, "stvx opportunity found at %d\n",
- INSN_UID (insn));
- }
- }
-}
-
-/* Look for patterns created from builtin lvx and stvx calls, and
- canonicalize them to be properly recognized as such. */
-static void
-recombine_lvx_stvx_patterns (function *fun)
-{
- int i;
- basic_block bb;
- rtx_insn *insn;
-
- int num_insns = get_max_uid ();
- del_info *to_delete = XCNEWVEC (del_info, num_insns);
-
- FOR_ALL_BB_FN (bb, fun)
- FOR_BB_INSNS (bb, insn)
- {
- if (!NONDEBUG_INSN_P (insn))
- continue;
-
- if (insn_is_load_p (insn) && insn_is_swap_p (insn))
- recombine_lvx_pattern (insn, to_delete);
- else if (insn_is_store_p (insn) && insn_is_swap_p (insn))
- recombine_stvx_pattern (insn, to_delete);
- }
-
- /* Turning swaps into copies is delayed until now, to avoid problems
- with deleting instructions during the insn walk. */
- for (i = 0; i < num_insns; i++)
- if (to_delete[i].replace)
- {
- rtx swap_body = PATTERN (to_delete[i].replace_insn);
- rtx src_reg = XEXP (SET_SRC (swap_body), 0);
- rtx copy = gen_rtx_SET (SET_DEST (swap_body), src_reg);
- rtx_insn *new_insn = emit_insn_before (copy,
- to_delete[i].replace_insn);
- set_block_for_insn (new_insn,
- BLOCK_FOR_INSN (to_delete[i].replace_insn));
- df_insn_rescan (new_insn);
- df_insn_delete (to_delete[i].replace_insn);
- remove_insn (to_delete[i].replace_insn);
- to_delete[i].replace_insn->set_deleted ();
- }
-
- free (to_delete);
-}
-
-/* Main entry point for this pass. */
-unsigned int
-rs6000_analyze_swaps (function *fun)
-{
- swap_web_entry *insn_entry;
- basic_block bb;
- rtx_insn *insn, *curr_insn = 0;
-
- /* Dataflow analysis for use-def chains. */
- df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
- df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
- df_analyze ();
- df_set_flags (DF_DEFER_INSN_RESCAN);
-
- /* Pre-pass to recombine lvx and stvx patterns so we don't lose info. */
- recombine_lvx_stvx_patterns (fun);
-
- /* Allocate structure to represent webs of insns. */
- insn_entry = XCNEWVEC (swap_web_entry, get_max_uid ());
-
- /* Walk the insns to gather basic data. */
- FOR_ALL_BB_FN (bb, fun)
- FOR_BB_INSNS_SAFE (bb, insn, curr_insn)
- {
- unsigned int uid = INSN_UID (insn);
- if (NONDEBUG_INSN_P (insn))
- {
- insn_entry[uid].insn = insn;
-
- if (GET_CODE (insn) == CALL_INSN)
- insn_entry[uid].is_call = 1;
-
- /* Walk the uses and defs to see if we mention vector regs.
- Record any constraints on optimization of such mentions. */
- struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
- df_ref mention;
- FOR_EACH_INSN_INFO_USE (mention, insn_info)
- {
- /* We use DF_REF_REAL_REG here to get inside any subregs. */
- machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention));
-
- /* If a use gets its value from a call insn, it will be
- a hard register and will look like (reg:V4SI 3 3).
- The df analysis creates two mentions for GPR3 and GPR4,
- both DImode. We must recognize this and treat it as a
- vector mention to ensure the call is unioned with this
- use. */
- if (mode == DImode && DF_REF_INSN_INFO (mention))
- {
- rtx feeder = DF_REF_INSN (mention);
- /* FIXME: It is pretty hard to get from the df mention
- to the mode of the use in the insn. We arbitrarily
- pick a vector mode here, even though the use might
- be a real DImode. We can be too conservative
- (create a web larger than necessary) because of
- this, so consider eventually fixing this. */
- if (GET_CODE (feeder) == CALL_INSN)
- mode = V4SImode;
- }
-
- if (ALTIVEC_OR_VSX_VECTOR_MODE (mode) || mode == TImode)
- {
- insn_entry[uid].is_relevant = 1;
- if (mode == TImode || mode == V1TImode
- || FLOAT128_VECTOR_P (mode))
- insn_entry[uid].is_128_int = 1;
- if (DF_REF_INSN_INFO (mention))
- insn_entry[uid].contains_subreg
- = !rtx_equal_p (DF_REF_REG (mention),
- DF_REF_REAL_REG (mention));
- union_defs (insn_entry, insn, mention);
- }
- }
- FOR_EACH_INSN_INFO_DEF (mention, insn_info)
- {
- /* We use DF_REF_REAL_REG here to get inside any subregs. */
- machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention));
-
- /* If we're loading up a hard vector register for a call,
- it looks like (set (reg:V4SI 9 9) (...)). The df
- analysis creates two mentions for GPR9 and GPR10, both
- DImode. So relying on the mode from the mentions
- isn't sufficient to ensure we union the call into the
- web with the parameter setup code. */
-	      rtx insn_pat = PATTERN (insn);
-	      if (mode == DImode && GET_CODE (insn_pat) == SET
-		  && ALTIVEC_OR_VSX_VECTOR_MODE (GET_MODE (SET_DEST (insn_pat))))
-		mode = GET_MODE (SET_DEST (insn_pat));
-
- if (ALTIVEC_OR_VSX_VECTOR_MODE (mode) || mode == TImode)
- {
- insn_entry[uid].is_relevant = 1;
- if (mode == TImode || mode == V1TImode
- || FLOAT128_VECTOR_P (mode))
- insn_entry[uid].is_128_int = 1;
- if (DF_REF_INSN_INFO (mention))
- insn_entry[uid].contains_subreg
- = !rtx_equal_p (DF_REF_REG (mention),
- DF_REF_REAL_REG (mention));
- /* REG_FUNCTION_VALUE_P is not valid for subregs. */
- else if (REG_FUNCTION_VALUE_P (DF_REF_REG (mention)))
- insn_entry[uid].is_live_out = 1;
- union_uses (insn_entry, insn, mention);
- }
- }
-
- if (insn_entry[uid].is_relevant)
- {
- /* Determine if this is a load or store. */
- insn_entry[uid].is_load = insn_is_load_p (insn);
- insn_entry[uid].is_store = insn_is_store_p (insn);
-
- /* Determine if this is a doubleword swap. If not,
- determine whether it can legally be swapped. */
- if (insn_is_swap_p (insn))
- insn_entry[uid].is_swap = 1;
- else
- {
- unsigned int special = SH_NONE;
- insn_entry[uid].is_swappable
- = insn_is_swappable_p (insn_entry, insn, &special);
- if (special != SH_NONE && insn_entry[uid].contains_subreg)
- insn_entry[uid].is_swappable = 0;
- else if (special != SH_NONE)
- insn_entry[uid].special_handling = special;
- else if (insn_entry[uid].contains_subreg)
- insn_entry[uid].special_handling = SH_SUBREG;
- }
- }
- }
- }
-
- if (dump_file)
- {
- fprintf (dump_file, "\nSwap insn entry table when first built\n");
- dump_swap_insn_table (insn_entry);
- }
-
- /* Record unoptimizable webs. */
- unsigned e = get_max_uid (), i;
- for (i = 0; i < e; ++i)
- {
- if (!insn_entry[i].is_relevant)
- continue;
-
- swap_web_entry *root
- = (swap_web_entry*)(&insn_entry[i])->unionfind_root ();
-
- if (insn_entry[i].is_live_in || insn_entry[i].is_live_out
- || (insn_entry[i].contains_subreg
- && insn_entry[i].special_handling != SH_SUBREG)
- || insn_entry[i].is_128_int || insn_entry[i].is_call
- || !(insn_entry[i].is_swappable || insn_entry[i].is_swap))
- root->web_not_optimizable = 1;
-
- /* If we have loads or stores that aren't permuting then the
- optimization isn't appropriate. */
- else if ((insn_entry[i].is_load || insn_entry[i].is_store)
- && !insn_entry[i].is_swap && !insn_entry[i].is_swappable)
- root->web_not_optimizable = 1;
-
- /* If we have permuting loads or stores that are not accompanied
- by a register swap, the optimization isn't appropriate. */
- else if (insn_entry[i].is_load && insn_entry[i].is_swap)
- {
- rtx insn = insn_entry[i].insn;
- struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
- df_ref def;
-
- FOR_EACH_INSN_INFO_DEF (def, insn_info)
- {
- struct df_link *link = DF_REF_CHAIN (def);
-
- if (!chain_contains_only_swaps (insn_entry, link, FOR_LOADS))
- {
- root->web_not_optimizable = 1;
- break;
- }
- }
- }
- else if (insn_entry[i].is_store && insn_entry[i].is_swap)
- {
- rtx insn = insn_entry[i].insn;
- struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
- df_ref use;
-
- FOR_EACH_INSN_INFO_USE (use, insn_info)
- {
- struct df_link *link = DF_REF_CHAIN (use);
-
- if (!chain_contains_only_swaps (insn_entry, link, FOR_STORES))
- {
- root->web_not_optimizable = 1;
- break;
- }
- }
- }
- }
-
- if (dump_file)
- {
- fprintf (dump_file, "\nSwap insn entry table after web analysis\n");
- dump_swap_insn_table (insn_entry);
- }
-
- /* For each load and store in an optimizable web (which implies
- the loads and stores are permuting), find the associated
- register swaps and mark them for removal. Due to various
- optimizations we may mark the same swap more than once. Also
- perform special handling for swappable insns that require it. */
- for (i = 0; i < e; ++i)
- if ((insn_entry[i].is_load || insn_entry[i].is_store)
- && insn_entry[i].is_swap)
- {
- swap_web_entry* root_entry
- = (swap_web_entry*)((&insn_entry[i])->unionfind_root ());
- if (!root_entry->web_not_optimizable)
- mark_swaps_for_removal (insn_entry, i);
- }
- else if (insn_entry[i].is_swappable && insn_entry[i].special_handling)
- {
- swap_web_entry* root_entry
- = (swap_web_entry*)((&insn_entry[i])->unionfind_root ());
- if (!root_entry->web_not_optimizable)
- handle_special_swappables (insn_entry, i);
- }
-
- /* Now delete the swaps marked for removal. */
- for (i = 0; i < e; ++i)
- if (insn_entry[i].will_delete)
- replace_swap_with_copy (insn_entry, i);
-
- /* Clean up. */
- free (insn_entry);
- return 0;
-}
-
-const pass_data pass_data_analyze_swaps =
-{
- RTL_PASS, /* type */
- "swaps", /* name */
- OPTGROUP_NONE, /* optinfo_flags */
- TV_NONE, /* tv_id */
- 0, /* properties_required */
- 0, /* properties_provided */
- 0, /* properties_destroyed */
- 0, /* todo_flags_start */
- TODO_df_finish, /* todo_flags_finish */
-};
-
-class pass_analyze_swaps : public rtl_opt_pass
-{
-public:
-  pass_analyze_swaps (gcc::context *ctxt)
-    : rtl_opt_pass (pass_data_analyze_swaps, ctxt)
- {}
-
- /* opt_pass methods: */
- virtual bool gate (function *)
- {
- return (optimize > 0 && !BYTES_BIG_ENDIAN && TARGET_VSX
- && !TARGET_P9_VECTOR && rs6000_optimize_swaps);
- }
-
- virtual unsigned int execute (function *fun)
- {
- return rs6000_analyze_swaps (fun);
- }
-
- opt_pass *clone ()
- {
- return new pass_analyze_swaps (m_ctxt);
- }
-
-}; // class pass_analyze_swaps
-
-rtl_opt_pass *
-make_pass_analyze_swaps (gcc::context *ctxt)
-{
- return new pass_analyze_swaps (ctxt);
-}
-
#ifdef RS6000_GLIBC_ATOMIC_FENV
/* Function declarations for rs6000_atomic_assign_expand_fenv. */
static tree atomic_hold_decl, atomic_clear_decl, atomic_update_decl;
$(COMPILE) $<
$(POSTCOMPILE)
+rs6000-p8swap.o: $(srcdir)/config/rs6000/rs6000-p8swap.c
+ $(COMPILE) $<
+ $(POSTCOMPILE)
+
$(srcdir)/config/rs6000/rs6000-tables.opt: $(srcdir)/config/rs6000/genopt.sh \
$(srcdir)/config/rs6000/rs6000-cpus.def
$(SHELL) $(srcdir)/config/rs6000/genopt.sh $(srcdir)/config/rs6000 > \