From 0dc6645fc3b10c78c02d3543d344b9b5fba0d0d5 Mon Sep 17 00:00:00 2001 From: Aaron Sawdey Date: Thu, 6 Jul 2017 20:20:48 +0000 Subject: [PATCH] rs6000.c (union_defs, [...]): Move all code related to p8 swap optimizations to file rs6000-p8swap.c. 2017-07-06 Aaron Sawdey * config/rs6000/rs6000.c (union_defs, union_uses, insn_is_load_p, insn_is_store_p, insn_is_swap_p, const_load_sequence_p, v2df_reduction_p, rtx_is_swappable_p, insn_is_swappable_p, chain_contains_only_swaps, mark_swaps_for_removal, swap_const_vector_halves, adjust_subreg_index, permute_load, permute_store, adjust_extract, adjust_splat, adjust_xxpermdi, adjust_concat, adjust_vperm, handle_special_swappables, replace_swap_with_copy, dump_swap_insn_table, alignment_with_canonical_addr, alignment_mask, find_alignment_op, recombine_lvx_pattern, recombine_stvx_pattern, recombine_lvx_stvx_patterns, rs6000_analyze_swaps, make_pass_analyze_swaps): Move all code related to p8 swap optimizations to file rs6000-p8swap.c. * config/rs6000/rs6000-p8swap.c: New file. * config/rs6000/t-rs6000: Add rule to build rs6000-p8swap.o. * config.gcc: Add rs6000-p8swap.o to extra_objs for powerpc*-*-* and rs6000*-*-* targets. From-SVN: r250040 --- gcc/ChangeLog | 19 + gcc/config.gcc | 4 +- gcc/config/rs6000/rs6000-p8swap.c | 1892 +++++++++++++++++++++++++++++ gcc/config/rs6000/rs6000.c | 1856 ---------------------------- gcc/config/rs6000/t-rs6000 | 4 + 5 files changed, 1917 insertions(+), 1858 deletions(-) create mode 100644 gcc/config/rs6000/rs6000-p8swap.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 355572f4257..933d7152283 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,22 @@ +2017-07-06 Aaron Sawdey + + * config/rs6000/rs6000.c (union_defs, union_uses, insn_is_load_p, + insn_is_store_p, insn_is_swap_p, const_load_sequence_p, v2df_reduction_p, + rtx_is_swappable_p, insn_is_swappable_p, chain_contains_only_swaps, + mark_swaps_for_removal, swap_const_vector_halves, adjust_subreg_index, + permute_load, permute_store, adjust_extract, adjust_splat, + adjust_xxpermdi, adjust_concat, adjust_vperm, handle_special_swappables, + replace_swap_with_copy, dump_swap_insn_table, + alignment_with_canonical_addr, alignment_mask, find_alignment_op, + recombine_lvx_pattern, recombine_stvx_pattern, + recombine_lvx_stvx_patterns, rs6000_analyze_swaps, + make_pass_analyze_swaps): Move all code related to p8 swap optimizations + to file rs6000-p8swap.c. + * config/rs6000/rs6000-p8swap.c: New file. + * config/rs6000/t-rs6000: Add rule to build rs6000-p8swap.o. + * config.gcc: Add rs6000-p8swap.o to extra_objs for powerpc*-*-* + and rs6000*-*-* targets. + 2017-07-06 David Malcolm * Makefile.in (selftest): Remove dependency on s-selftest-c++. 
diff --git a/gcc/config.gcc b/gcc/config.gcc index c5ae8cab7d9..4a729507200 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -454,7 +454,7 @@ powerpc*-*-*spe*) ;; powerpc*-*-*) cpu_type=rs6000 - extra_objs="rs6000-string.o" + extra_objs="rs6000-string.o rs6000-p8swap.o" extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h" extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h x86intrin.h" extra_headers="${extra_headers} ppu_intrinsics.h spu2vmx.h vec_types.h si2vmx.h" @@ -472,7 +472,7 @@ riscv*) ;; rs6000*-*-*) extra_options="${extra_options} g.opt fused-madd.opt rs6000/rs6000-tables.opt" - extra_objs="rs6000-string.o" + extra_objs="rs6000-string.o rs6000-p8swap.o" ;; sparc*-*-*) cpu_type=sparc diff --git a/gcc/config/rs6000/rs6000-p8swap.c b/gcc/config/rs6000/rs6000-p8swap.c new file mode 100644 index 00000000000..1557f7f587e --- /dev/null +++ b/gcc/config/rs6000/rs6000-p8swap.c @@ -0,0 +1,1892 @@ +/* Subroutines used to remove unnecessary doubleword swaps + for p8 little-endian VSX code. + Copyright (C) 1991-2017 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + GCC is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + . */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "backend.h" +#include "rtl.h" +#include "tree.h" +#include "memmodel.h" +#include "df.h" +#include "tm_p.h" +#include "ira.h" +#include "print-tree.h" +#include "varasm.h" +#include "explow.h" +#include "expr.h" +#include "output.h" +#include "tree-pass.h" + +/* Analyze vector computations and remove unnecessary doubleword + swaps (xxswapdi instructions). This pass is performed only + for little-endian VSX code generation. + + For this specific case, loads and stores of 4x32 and 2x64 vectors + are inefficient. These are implemented using the lvx2dx and + stvx2dx instructions, which invert the order of doublewords in + a vector register. Thus the code generation inserts an xxswapdi + after each such load, and prior to each such store. (For spill + code after register assignment, an additional xxswapdi is inserted + following each store in order to return a hard register to its + unpermuted value.) + + The extra xxswapdi instructions reduce performance. This can be + particularly bad for vectorized code. The purpose of this pass + is to reduce the number of xxswapdi instructions required for + correctness. + + The primary insight is that much code that operates on vectors + does not care about the relative order of elements in a register, + so long as the correct memory order is preserved. If we have + a computation where all input values are provided by lvxd2x/xxswapdi + sequences, all outputs are stored using xxswapdi/stvxd2x sequences, + and all intermediate computations are pure SIMD (independent of + element order), then all the xxswapdi's associated with the loads + and stores may be removed. + + This pass uses some of the infrastructure and logical ideas from + the "web" pass in web.c. 
We create maximal webs of computations + fitting the description above using union-find. Each such web is + then optimized by removing its unnecessary xxswapdi instructions. + + The pass is placed prior to global optimization so that we can + perform the optimization in the safest and simplest way possible; + that is, by replacing each xxswapdi insn with a register copy insn. + Subsequent forward propagation will remove copies where possible. + + There are some operations sensitive to element order for which we + can still allow the operation, provided we modify those operations. + These include CONST_VECTORs, for which we must swap the first and + second halves of the constant vector; and SUBREGs, for which we + must adjust the byte offset to account for the swapped doublewords. + A remaining opportunity would be non-immediate-form splats, for + which we should adjust the selected lane of the input. We should + also make code generation adjustments for sum-across operations, + since this is a common vectorizer reduction. + + Because we run prior to the first split, we can see loads and stores + here that match *vsx_le_perm_{load,store}_. These are vanilla + vector loads and stores that have not yet been split into a permuting + load/store and a swap. (One way this can happen is with a builtin + call to vec_vsx_{ld,st}.) We can handle these as well, but rather + than deleting a swap, we convert the load/store into a permuting + load/store (which effectively removes the swap). */ + +/* Notes on Permutes + + We do not currently handle computations that contain permutes. There + is a general transformation that can be performed correctly, but it + may introduce more expensive code than it replaces. To handle these + would require a cost model to determine when to perform the optimization. + This commentary records how this could be done if desired. + + The most general permute is something like this (example for V16QI): + + (vec_select:V16QI (vec_concat:V32QI (op1:V16QI) (op2:V16QI)) + (parallel [(const_int a0) (const_int a1) + ... + (const_int a14) (const_int a15)])) + + where a0,...,a15 are in [0,31] and select elements from op1 and op2 + to produce in the result. + + Regardless of mode, we can convert the PARALLEL to a mask of 16 + byte-element selectors. Let's call this M, with M[i] representing + the ith byte-element selector value. Then if we swap doublewords + throughout the computation, we can get correct behavior by replacing + M with M' as follows: + + M'[i] = { (M[i]+8)%16 : M[i] in [0,15] + { ((M[i]+8)%16)+16 : M[i] in [16,31] + + This seems promising at first, since we are just replacing one mask + with another. But certain masks are preferable to others. If M + is a mask that matches a vmrghh pattern, for example, M' certainly + will not. Instead of a single vmrghh, we would generate a load of + M' and a vperm. So we would need to know how many xxswapd's we can + remove as a result of this transformation to determine if it's + profitable; and preferably the logic would need to be aware of all + the special preferable masks. + + Another form of permute is an UNSPEC_VPERM, in which the mask is + already in a register. In some cases, this mask may be a constant + that we can discover with ud-chains, in which case the above + transformation is ok. However, the common usage here is for the + mask to be produced by an UNSPEC_LVSL, in which case the mask + cannot be known at compile time. 
In such a case we would have to + generate several instructions to compute M' as above at run time, + and a cost model is needed again. + + However, when the mask M for an UNSPEC_VPERM is loaded from the + constant pool, we can replace M with M' as above at no cost + beyond adding a constant pool entry. */ + +/* This is based on the union-find logic in web.c. web_entry_base is + defined in df.h. */ +class swap_web_entry : public web_entry_base +{ + public: + /* Pointer to the insn. */ + rtx_insn *insn; + /* Set if insn contains a mention of a vector register. All other + fields are undefined if this field is unset. */ + unsigned int is_relevant : 1; + /* Set if insn is a load. */ + unsigned int is_load : 1; + /* Set if insn is a store. */ + unsigned int is_store : 1; + /* Set if insn is a doubleword swap. This can either be a register swap + or a permuting load or store (test is_load and is_store for this). */ + unsigned int is_swap : 1; + /* Set if the insn has a live-in use of a parameter register. */ + unsigned int is_live_in : 1; + /* Set if the insn has a live-out def of a return register. */ + unsigned int is_live_out : 1; + /* Set if the insn contains a subreg reference of a vector register. */ + unsigned int contains_subreg : 1; + /* Set if the insn contains a 128-bit integer operand. */ + unsigned int is_128_int : 1; + /* Set if this is a call-insn. */ + unsigned int is_call : 1; + /* Set if this insn does not perform a vector operation for which + element order matters, or if we know how to fix it up if it does. + Undefined if is_swap is set. */ + unsigned int is_swappable : 1; + /* A nonzero value indicates what kind of special handling for this + insn is required if doublewords are swapped. Undefined if + is_swappable is not set. */ + unsigned int special_handling : 4; + /* Set if the web represented by this entry cannot be optimized. */ + unsigned int web_not_optimizable : 1; + /* Set if this insn should be deleted. */ + unsigned int will_delete : 1; +}; + +enum special_handling_values { + SH_NONE = 0, + SH_CONST_VECTOR, + SH_SUBREG, + SH_NOSWAP_LD, + SH_NOSWAP_ST, + SH_EXTRACT, + SH_SPLAT, + SH_XXPERMDI, + SH_CONCAT, + SH_VPERM +}; + +/* Union INSN with all insns containing definitions that reach USE. + Detect whether USE is live-in to the current function. */ +static void +union_defs (swap_web_entry *insn_entry, rtx insn, df_ref use) +{ + struct df_link *link = DF_REF_CHAIN (use); + + if (!link) + insn_entry[INSN_UID (insn)].is_live_in = 1; + + while (link) + { + if (DF_REF_IS_ARTIFICIAL (link->ref)) + insn_entry[INSN_UID (insn)].is_live_in = 1; + + if (DF_REF_INSN_INFO (link->ref)) + { + rtx def_insn = DF_REF_INSN (link->ref); + (void)unionfind_union (insn_entry + INSN_UID (insn), + insn_entry + INSN_UID (def_insn)); + } + + link = link->next; + } +} + +/* Union INSN with all insns containing uses reached from DEF. + Detect whether DEF is live-out from the current function. */ +static void +union_uses (swap_web_entry *insn_entry, rtx insn, df_ref def) +{ + struct df_link *link = DF_REF_CHAIN (def); + + if (!link) + insn_entry[INSN_UID (insn)].is_live_out = 1; + + while (link) + { + /* This could be an eh use or some other artificial use; + we treat these all the same (killing the optimization). 
*/ + if (DF_REF_IS_ARTIFICIAL (link->ref)) + insn_entry[INSN_UID (insn)].is_live_out = 1; + + if (DF_REF_INSN_INFO (link->ref)) + { + rtx use_insn = DF_REF_INSN (link->ref); + (void)unionfind_union (insn_entry + INSN_UID (insn), + insn_entry + INSN_UID (use_insn)); + } + + link = link->next; + } +} + +/* Return 1 iff INSN is a load insn, including permuting loads that + represent an lvxd2x instruction; else return 0. */ +static unsigned int +insn_is_load_p (rtx insn) +{ + rtx body = PATTERN (insn); + + if (GET_CODE (body) == SET) + { + if (GET_CODE (SET_SRC (body)) == MEM) + return 1; + + if (GET_CODE (SET_SRC (body)) == VEC_SELECT + && GET_CODE (XEXP (SET_SRC (body), 0)) == MEM) + return 1; + + return 0; + } + + if (GET_CODE (body) != PARALLEL) + return 0; + + rtx set = XVECEXP (body, 0, 0); + + if (GET_CODE (set) == SET && GET_CODE (SET_SRC (set)) == MEM) + return 1; + + return 0; +} + +/* Return 1 iff INSN is a store insn, including permuting stores that + represent an stvxd2x instruction; else return 0. */ +static unsigned int +insn_is_store_p (rtx insn) +{ + rtx body = PATTERN (insn); + if (GET_CODE (body) == SET && GET_CODE (SET_DEST (body)) == MEM) + return 1; + if (GET_CODE (body) != PARALLEL) + return 0; + rtx set = XVECEXP (body, 0, 0); + if (GET_CODE (set) == SET && GET_CODE (SET_DEST (set)) == MEM) + return 1; + return 0; +} + +/* Return 1 iff INSN swaps doublewords. This may be a reg-reg swap, + a permuting load, or a permuting store. */ +static unsigned int +insn_is_swap_p (rtx insn) +{ + rtx body = PATTERN (insn); + if (GET_CODE (body) != SET) + return 0; + rtx rhs = SET_SRC (body); + if (GET_CODE (rhs) != VEC_SELECT) + return 0; + rtx parallel = XEXP (rhs, 1); + if (GET_CODE (parallel) != PARALLEL) + return 0; + unsigned int len = XVECLEN (parallel, 0); + if (len != 2 && len != 4 && len != 8 && len != 16) + return 0; + for (unsigned int i = 0; i < len / 2; ++i) + { + rtx op = XVECEXP (parallel, 0, i); + if (GET_CODE (op) != CONST_INT || INTVAL (op) != len / 2 + i) + return 0; + } + for (unsigned int i = len / 2; i < len; ++i) + { + rtx op = XVECEXP (parallel, 0, i); + if (GET_CODE (op) != CONST_INT || INTVAL (op) != i - len / 2) + return 0; + } + return 1; +} + +/* Return TRUE if insn is a swap fed by a load from the constant pool. */ +static bool +const_load_sequence_p (swap_web_entry *insn_entry, rtx insn) +{ + unsigned uid = INSN_UID (insn); + if (!insn_entry[uid].is_swap || insn_entry[uid].is_load) + return false; + + const_rtx tocrel_base; + + /* Find the unique use in the swap and locate its def. If the def + isn't unique, punt. 
*/ + struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); + df_ref use; + FOR_EACH_INSN_INFO_USE (use, insn_info) + { + struct df_link *def_link = DF_REF_CHAIN (use); + if (!def_link || def_link->next) + return false; + + rtx def_insn = DF_REF_INSN (def_link->ref); + unsigned uid2 = INSN_UID (def_insn); + if (!insn_entry[uid2].is_load || !insn_entry[uid2].is_swap) + return false; + + rtx body = PATTERN (def_insn); + if (GET_CODE (body) != SET + || GET_CODE (SET_SRC (body)) != VEC_SELECT + || GET_CODE (XEXP (SET_SRC (body), 0)) != MEM) + return false; + + rtx mem = XEXP (SET_SRC (body), 0); + rtx base_reg = XEXP (mem, 0); + + df_ref base_use; + insn_info = DF_INSN_INFO_GET (def_insn); + FOR_EACH_INSN_INFO_USE (base_use, insn_info) + { + if (!rtx_equal_p (DF_REF_REG (base_use), base_reg)) + continue; + + struct df_link *base_def_link = DF_REF_CHAIN (base_use); + if (!base_def_link || base_def_link->next) + return false; + + rtx tocrel_insn = DF_REF_INSN (base_def_link->ref); + rtx tocrel_body = PATTERN (tocrel_insn); + rtx base, offset; + if (GET_CODE (tocrel_body) != SET) + return false; + /* There is an extra level of indirection for small/large + code models. */ + rtx tocrel_expr = SET_SRC (tocrel_body); + if (GET_CODE (tocrel_expr) == MEM) + tocrel_expr = XEXP (tocrel_expr, 0); + if (!toc_relative_expr_p (tocrel_expr, false, &tocrel_base, NULL)) + return false; + split_const (XVECEXP (tocrel_base, 0, 0), &base, &offset); + if (GET_CODE (base) != SYMBOL_REF || !CONSTANT_POOL_ADDRESS_P (base)) + return false; + } + } + return true; +} + +/* Return TRUE iff OP matches a V2DF reduction pattern. See the + definition of vsx_reduc__v2df in vsx.md. */ +static bool +v2df_reduction_p (rtx op) +{ + if (GET_MODE (op) != V2DFmode) + return false; + + enum rtx_code code = GET_CODE (op); + if (code != PLUS && code != SMIN && code != SMAX) + return false; + + rtx concat = XEXP (op, 0); + if (GET_CODE (concat) != VEC_CONCAT) + return false; + + rtx select0 = XEXP (concat, 0); + rtx select1 = XEXP (concat, 1); + if (GET_CODE (select0) != VEC_SELECT || GET_CODE (select1) != VEC_SELECT) + return false; + + rtx reg0 = XEXP (select0, 0); + rtx reg1 = XEXP (select1, 0); + if (!rtx_equal_p (reg0, reg1) || !REG_P (reg0)) + return false; + + rtx parallel0 = XEXP (select0, 1); + rtx parallel1 = XEXP (select1, 1); + if (GET_CODE (parallel0) != PARALLEL || GET_CODE (parallel1) != PARALLEL) + return false; + + if (!rtx_equal_p (XVECEXP (parallel0, 0, 0), const1_rtx) + || !rtx_equal_p (XVECEXP (parallel1, 0, 0), const0_rtx)) + return false; + + return true; +} + +/* Return 1 iff OP is an operand that will not be affected by having + vector doublewords swapped in memory. */ +static unsigned int +rtx_is_swappable_p (rtx op, unsigned int *special) +{ + enum rtx_code code = GET_CODE (op); + int i, j; + rtx parallel; + + switch (code) + { + case LABEL_REF: + case SYMBOL_REF: + case CLOBBER: + case REG: + return 1; + + case VEC_CONCAT: + case ASM_INPUT: + case ASM_OPERANDS: + return 0; + + case CONST_VECTOR: + { + *special = SH_CONST_VECTOR; + return 1; + } + + case VEC_DUPLICATE: + /* Opportunity: If XEXP (op, 0) has the same mode as the result, + and XEXP (op, 1) is a PARALLEL with a single QImode const int, + it represents a vector splat for which we can do special + handling. */ + if (GET_CODE (XEXP (op, 0)) == CONST_INT) + return 1; + else if (REG_P (XEXP (op, 0)) + && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0))) + /* This catches V2DF and V2DI splat, at a minimum. 
*/ + return 1; + else if (GET_CODE (XEXP (op, 0)) == TRUNCATE + && REG_P (XEXP (XEXP (op, 0), 0)) + && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0))) + /* This catches splat of a truncated value. */ + return 1; + else if (GET_CODE (XEXP (op, 0)) == VEC_SELECT) + /* If the duplicated item is from a select, defer to the select + processing to see if we can change the lane for the splat. */ + return rtx_is_swappable_p (XEXP (op, 0), special); + else + return 0; + + case VEC_SELECT: + /* A vec_extract operation is ok if we change the lane. */ + if (GET_CODE (XEXP (op, 0)) == REG + && GET_MODE_INNER (GET_MODE (XEXP (op, 0))) == GET_MODE (op) + && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL + && XVECLEN (parallel, 0) == 1 + && GET_CODE (XVECEXP (parallel, 0, 0)) == CONST_INT) + { + *special = SH_EXTRACT; + return 1; + } + /* An XXPERMDI is ok if we adjust the lanes. Note that if the + XXPERMDI is a swap operation, it will be identified by + insn_is_swap_p and therefore we won't get here. */ + else if (GET_CODE (XEXP (op, 0)) == VEC_CONCAT + && (GET_MODE (XEXP (op, 0)) == V4DFmode + || GET_MODE (XEXP (op, 0)) == V4DImode) + && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL + && XVECLEN (parallel, 0) == 2 + && GET_CODE (XVECEXP (parallel, 0, 0)) == CONST_INT + && GET_CODE (XVECEXP (parallel, 0, 1)) == CONST_INT) + { + *special = SH_XXPERMDI; + return 1; + } + else if (v2df_reduction_p (op)) + return 1; + else + return 0; + + case UNSPEC: + { + /* Various operations are unsafe for this optimization, at least + without significant additional work. Permutes are obviously + problematic, as both the permute control vector and the ordering + of the target values are invalidated by doubleword swapping. + Vector pack and unpack modify the number of vector lanes. + Merge-high/low will not operate correctly on swapped operands. + Vector shifts across element boundaries are clearly uncool, + as are vector select and concatenate operations. Vector + sum-across instructions define one operand with a specific + order-dependent element, so additional fixup code would be + needed to make those work. Vector set and non-immediate-form + vector splat are element-order sensitive. A few of these + cases might be workable with special handling if required. + Adding cost modeling would be appropriate in some cases. 
*/ + int val = XINT (op, 1); + switch (val) + { + default: + break; + case UNSPEC_VMRGH_DIRECT: + case UNSPEC_VMRGL_DIRECT: + case UNSPEC_VPACK_SIGN_SIGN_SAT: + case UNSPEC_VPACK_SIGN_UNS_SAT: + case UNSPEC_VPACK_UNS_UNS_MOD: + case UNSPEC_VPACK_UNS_UNS_MOD_DIRECT: + case UNSPEC_VPACK_UNS_UNS_SAT: + case UNSPEC_VPERM: + case UNSPEC_VPERM_UNS: + case UNSPEC_VPERMHI: + case UNSPEC_VPERMSI: + case UNSPEC_VPKPX: + case UNSPEC_VSLDOI: + case UNSPEC_VSLO: + case UNSPEC_VSRO: + case UNSPEC_VSUM2SWS: + case UNSPEC_VSUM4S: + case UNSPEC_VSUM4UBS: + case UNSPEC_VSUMSWS: + case UNSPEC_VSUMSWS_DIRECT: + case UNSPEC_VSX_CONCAT: + case UNSPEC_VSX_SET: + case UNSPEC_VSX_SLDWI: + case UNSPEC_VUNPACK_HI_SIGN: + case UNSPEC_VUNPACK_HI_SIGN_DIRECT: + case UNSPEC_VUNPACK_LO_SIGN: + case UNSPEC_VUNPACK_LO_SIGN_DIRECT: + case UNSPEC_VUPKHPX: + case UNSPEC_VUPKHS_V4SF: + case UNSPEC_VUPKHU_V4SF: + case UNSPEC_VUPKLPX: + case UNSPEC_VUPKLS_V4SF: + case UNSPEC_VUPKLU_V4SF: + case UNSPEC_VSX_CVDPSPN: + case UNSPEC_VSX_CVSPDP: + case UNSPEC_VSX_CVSPDPN: + case UNSPEC_VSX_EXTRACT: + case UNSPEC_VSX_VSLO: + case UNSPEC_VSX_VEC_INIT: + return 0; + case UNSPEC_VSPLT_DIRECT: + case UNSPEC_VSX_XXSPLTD: + *special = SH_SPLAT; + return 1; + case UNSPEC_REDUC_PLUS: + case UNSPEC_REDUC: + return 1; + } + } + + default: + break; + } + + const char *fmt = GET_RTX_FORMAT (code); + int ok = 1; + + for (i = 0; i < GET_RTX_LENGTH (code); ++i) + if (fmt[i] == 'e' || fmt[i] == 'u') + { + unsigned int special_op = SH_NONE; + ok &= rtx_is_swappable_p (XEXP (op, i), &special_op); + if (special_op == SH_NONE) + continue; + /* Ensure we never have two kinds of special handling + for the same insn. */ + if (*special != SH_NONE && *special != special_op) + return 0; + *special = special_op; + } + else if (fmt[i] == 'E') + for (j = 0; j < XVECLEN (op, i); ++j) + { + unsigned int special_op = SH_NONE; + ok &= rtx_is_swappable_p (XVECEXP (op, i, j), &special_op); + if (special_op == SH_NONE) + continue; + /* Ensure we never have two kinds of special handling + for the same insn. */ + if (*special != SH_NONE && *special != special_op) + return 0; + *special = special_op; + } + + return ok; +} + +/* Return 1 iff INSN is an operand that will not be affected by + having vector doublewords swapped in memory (in which case + *SPECIAL is unchanged), or that can be modified to be correct + if vector doublewords are swapped in memory (in which case + *SPECIAL is changed to a value indicating how). */ +static unsigned int +insn_is_swappable_p (swap_web_entry *insn_entry, rtx insn, + unsigned int *special) +{ + /* Calls are always bad. */ + if (GET_CODE (insn) == CALL_INSN) + return 0; + + /* Loads and stores seen here are not permuting, but we can still + fix them up by converting them to permuting ones. Exceptions: + UNSPEC_LVE, UNSPEC_LVX, and UNSPEC_STVX, which have a PARALLEL + body instead of a SET; and UNSPEC_STVE, which has an UNSPEC + for the SET source. Also we must now make an exception for lvx + and stvx when they are not in the UNSPEC_LVX/STVX form (with the + explicit "& -16") since this leads to unrecognizable insns. */ + rtx body = PATTERN (insn); + int i = INSN_UID (insn); + + if (insn_entry[i].is_load) + { + if (GET_CODE (body) == SET) + { + rtx rhs = SET_SRC (body); + /* Even without a swap, the RHS might be a vec_select for, say, + a byte-reversing load. 
*/ + if (GET_CODE (rhs) != MEM) + return 0; + if (GET_CODE (XEXP (rhs, 0)) == AND) + return 0; + + *special = SH_NOSWAP_LD; + return 1; + } + else + return 0; + } + + if (insn_entry[i].is_store) + { + if (GET_CODE (body) == SET + && GET_CODE (SET_SRC (body)) != UNSPEC) + { + rtx lhs = SET_DEST (body); + /* Even without a swap, the LHS might be a vec_select for, say, + a byte-reversing store. */ + if (GET_CODE (lhs) != MEM) + return 0; + if (GET_CODE (XEXP (lhs, 0)) == AND) + return 0; + + *special = SH_NOSWAP_ST; + return 1; + } + else + return 0; + } + + /* A convert to single precision can be left as is provided that + all of its uses are in xxspltw instructions that splat BE element + zero. */ + if (GET_CODE (body) == SET + && GET_CODE (SET_SRC (body)) == UNSPEC + && XINT (SET_SRC (body), 1) == UNSPEC_VSX_CVDPSPN) + { + df_ref def; + struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); + + FOR_EACH_INSN_INFO_DEF (def, insn_info) + { + struct df_link *link = DF_REF_CHAIN (def); + if (!link) + return 0; + + for (; link; link = link->next) { + rtx use_insn = DF_REF_INSN (link->ref); + rtx use_body = PATTERN (use_insn); + if (GET_CODE (use_body) != SET + || GET_CODE (SET_SRC (use_body)) != UNSPEC + || XINT (SET_SRC (use_body), 1) != UNSPEC_VSX_XXSPLTW + || XVECEXP (SET_SRC (use_body), 0, 1) != const0_rtx) + return 0; + } + } + + return 1; + } + + /* A concatenation of two doublewords is ok if we reverse the + order of the inputs. */ + if (GET_CODE (body) == SET + && GET_CODE (SET_SRC (body)) == VEC_CONCAT + && (GET_MODE (SET_SRC (body)) == V2DFmode + || GET_MODE (SET_SRC (body)) == V2DImode)) + { + *special = SH_CONCAT; + return 1; + } + + /* V2DF reductions are always swappable. */ + if (GET_CODE (body) == PARALLEL) + { + rtx expr = XVECEXP (body, 0, 0); + if (GET_CODE (expr) == SET + && v2df_reduction_p (SET_SRC (expr))) + return 1; + } + + /* An UNSPEC_VPERM is ok if the mask operand is loaded from the + constant pool. */ + if (GET_CODE (body) == SET + && GET_CODE (SET_SRC (body)) == UNSPEC + && XINT (SET_SRC (body), 1) == UNSPEC_VPERM + && XVECLEN (SET_SRC (body), 0) == 3 + && GET_CODE (XVECEXP (SET_SRC (body), 0, 2)) == REG) + { + rtx mask_reg = XVECEXP (SET_SRC (body), 0, 2); + struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); + df_ref use; + FOR_EACH_INSN_INFO_USE (use, insn_info) + if (rtx_equal_p (DF_REF_REG (use), mask_reg)) + { + struct df_link *def_link = DF_REF_CHAIN (use); + /* Punt if multiple definitions for this reg. */ + if (def_link && !def_link->next && + const_load_sequence_p (insn_entry, + DF_REF_INSN (def_link->ref))) + { + *special = SH_VPERM; + return 1; + } + } + } + + /* Otherwise check the operands for vector lane violations. */ + return rtx_is_swappable_p (body, special); +} + +enum chain_purpose { FOR_LOADS, FOR_STORES }; + +/* Return true if the UD or DU chain headed by LINK is non-empty, + and every entry on the chain references an insn that is a + register swap. Furthermore, if PURPOSE is FOR_LOADS, each such + register swap must have only permuting loads as reaching defs. + If PURPOSE is FOR_STORES, each such register swap must have only + register swaps or permuting stores as reached uses. 
*/ +static bool +chain_contains_only_swaps (swap_web_entry *insn_entry, struct df_link *link, + enum chain_purpose purpose) +{ + if (!link) + return false; + + for (; link; link = link->next) + { + if (!ALTIVEC_OR_VSX_VECTOR_MODE (GET_MODE (DF_REF_REG (link->ref)))) + continue; + + if (DF_REF_IS_ARTIFICIAL (link->ref)) + return false; + + rtx reached_insn = DF_REF_INSN (link->ref); + unsigned uid = INSN_UID (reached_insn); + struct df_insn_info *insn_info = DF_INSN_INFO_GET (reached_insn); + + if (!insn_entry[uid].is_swap || insn_entry[uid].is_load + || insn_entry[uid].is_store) + return false; + + if (purpose == FOR_LOADS) + { + df_ref use; + FOR_EACH_INSN_INFO_USE (use, insn_info) + { + struct df_link *swap_link = DF_REF_CHAIN (use); + + while (swap_link) + { + if (DF_REF_IS_ARTIFICIAL (link->ref)) + return false; + + rtx swap_def_insn = DF_REF_INSN (swap_link->ref); + unsigned uid2 = INSN_UID (swap_def_insn); + + /* Only permuting loads are allowed. */ + if (!insn_entry[uid2].is_swap || !insn_entry[uid2].is_load) + return false; + + swap_link = swap_link->next; + } + } + } + else if (purpose == FOR_STORES) + { + df_ref def; + FOR_EACH_INSN_INFO_DEF (def, insn_info) + { + struct df_link *swap_link = DF_REF_CHAIN (def); + + while (swap_link) + { + if (DF_REF_IS_ARTIFICIAL (link->ref)) + return false; + + rtx swap_use_insn = DF_REF_INSN (swap_link->ref); + unsigned uid2 = INSN_UID (swap_use_insn); + + /* Permuting stores or register swaps are allowed. */ + if (!insn_entry[uid2].is_swap || insn_entry[uid2].is_load) + return false; + + swap_link = swap_link->next; + } + } + } + } + + return true; +} + +/* Mark the xxswapdi instructions associated with permuting loads and + stores for removal. Note that we only flag them for deletion here, + as there is a possibility of a swap being reached from multiple + loads, etc. */ +static void +mark_swaps_for_removal (swap_web_entry *insn_entry, unsigned int i) +{ + rtx insn = insn_entry[i].insn; + struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); + + if (insn_entry[i].is_load) + { + df_ref def; + FOR_EACH_INSN_INFO_DEF (def, insn_info) + { + struct df_link *link = DF_REF_CHAIN (def); + + /* We know by now that these are swaps, so we can delete + them confidently. */ + while (link) + { + rtx use_insn = DF_REF_INSN (link->ref); + insn_entry[INSN_UID (use_insn)].will_delete = 1; + link = link->next; + } + } + } + else if (insn_entry[i].is_store) + { + df_ref use; + FOR_EACH_INSN_INFO_USE (use, insn_info) + { + /* Ignore uses for addressability. */ + machine_mode mode = GET_MODE (DF_REF_REG (use)); + if (!ALTIVEC_OR_VSX_VECTOR_MODE (mode)) + continue; + + struct df_link *link = DF_REF_CHAIN (use); + + /* We know by now that these are swaps, so we can delete + them confidently. */ + while (link) + { + rtx def_insn = DF_REF_INSN (link->ref); + insn_entry[INSN_UID (def_insn)].will_delete = 1; + link = link->next; + } + } + } +} + +/* OP is either a CONST_VECTOR or an expression containing one. + Swap the first half of the vector with the second in the first + case. Recurse to find it in the second. 
*/ +static void +swap_const_vector_halves (rtx op) +{ + int i; + enum rtx_code code = GET_CODE (op); + if (GET_CODE (op) == CONST_VECTOR) + { + int half_units = GET_MODE_NUNITS (GET_MODE (op)) / 2; + for (i = 0; i < half_units; ++i) + { + rtx temp = CONST_VECTOR_ELT (op, i); + CONST_VECTOR_ELT (op, i) = CONST_VECTOR_ELT (op, i + half_units); + CONST_VECTOR_ELT (op, i + half_units) = temp; + } + } + else + { + int j; + const char *fmt = GET_RTX_FORMAT (code); + for (i = 0; i < GET_RTX_LENGTH (code); ++i) + if (fmt[i] == 'e' || fmt[i] == 'u') + swap_const_vector_halves (XEXP (op, i)); + else if (fmt[i] == 'E') + for (j = 0; j < XVECLEN (op, i); ++j) + swap_const_vector_halves (XVECEXP (op, i, j)); + } +} + +/* Find all subregs of a vector expression that perform a narrowing, + and adjust the subreg index to account for doubleword swapping. */ +static void +adjust_subreg_index (rtx op) +{ + enum rtx_code code = GET_CODE (op); + if (code == SUBREG + && (GET_MODE_SIZE (GET_MODE (op)) + < GET_MODE_SIZE (GET_MODE (XEXP (op, 0))))) + { + unsigned int index = SUBREG_BYTE (op); + if (index < 8) + index += 8; + else + index -= 8; + SUBREG_BYTE (op) = index; + } + + const char *fmt = GET_RTX_FORMAT (code); + int i,j; + for (i = 0; i < GET_RTX_LENGTH (code); ++i) + if (fmt[i] == 'e' || fmt[i] == 'u') + adjust_subreg_index (XEXP (op, i)); + else if (fmt[i] == 'E') + for (j = 0; j < XVECLEN (op, i); ++j) + adjust_subreg_index (XVECEXP (op, i, j)); +} + +/* Convert the non-permuting load INSN to a permuting one. */ +static void +permute_load (rtx_insn *insn) +{ + rtx body = PATTERN (insn); + rtx mem_op = SET_SRC (body); + rtx tgt_reg = SET_DEST (body); + machine_mode mode = GET_MODE (tgt_reg); + int n_elts = GET_MODE_NUNITS (mode); + int half_elts = n_elts / 2; + rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts)); + int i, j; + for (i = 0, j = half_elts; i < half_elts; ++i, ++j) + XVECEXP (par, 0, i) = GEN_INT (j); + for (i = half_elts, j = 0; j < half_elts; ++i, ++j) + XVECEXP (par, 0, i) = GEN_INT (j); + rtx sel = gen_rtx_VEC_SELECT (mode, mem_op, par); + SET_SRC (body) = sel; + INSN_CODE (insn) = -1; /* Force re-recognition. */ + df_insn_rescan (insn); + + if (dump_file) + fprintf (dump_file, "Replacing load %d with permuted load\n", + INSN_UID (insn)); +} + +/* Convert the non-permuting store INSN to a permuting one. */ +static void +permute_store (rtx_insn *insn) +{ + rtx body = PATTERN (insn); + rtx src_reg = SET_SRC (body); + machine_mode mode = GET_MODE (src_reg); + int n_elts = GET_MODE_NUNITS (mode); + int half_elts = n_elts / 2; + rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts)); + int i, j; + for (i = 0, j = half_elts; i < half_elts; ++i, ++j) + XVECEXP (par, 0, i) = GEN_INT (j); + for (i = half_elts, j = 0; j < half_elts; ++i, ++j) + XVECEXP (par, 0, i) = GEN_INT (j); + rtx sel = gen_rtx_VEC_SELECT (mode, src_reg, par); + SET_SRC (body) = sel; + INSN_CODE (insn) = -1; /* Force re-recognition. */ + df_insn_rescan (insn); + + if (dump_file) + fprintf (dump_file, "Replacing store %d with permuted store\n", + INSN_UID (insn)); +} + +/* Given OP that contains a vector extract operation, adjust the index + of the extracted lane to account for the doubleword swap. */ +static void +adjust_extract (rtx_insn *insn) +{ + rtx pattern = PATTERN (insn); + if (GET_CODE (pattern) == PARALLEL) + pattern = XVECEXP (pattern, 0, 0); + rtx src = SET_SRC (pattern); + /* The vec_select may be wrapped in a vec_duplicate for a splat, so + account for that. */ + rtx sel = GET_CODE (src) == VEC_DUPLICATE ? 
XEXP (src, 0) : src; + rtx par = XEXP (sel, 1); + int half_elts = GET_MODE_NUNITS (GET_MODE (XEXP (sel, 0))) >> 1; + int lane = INTVAL (XVECEXP (par, 0, 0)); + lane = lane >= half_elts ? lane - half_elts : lane + half_elts; + XVECEXP (par, 0, 0) = GEN_INT (lane); + INSN_CODE (insn) = -1; /* Force re-recognition. */ + df_insn_rescan (insn); + + if (dump_file) + fprintf (dump_file, "Changing lane for extract %d\n", INSN_UID (insn)); +} + +/* Given OP that contains a vector direct-splat operation, adjust the index + of the source lane to account for the doubleword swap. */ +static void +adjust_splat (rtx_insn *insn) +{ + rtx body = PATTERN (insn); + rtx unspec = XEXP (body, 1); + int half_elts = GET_MODE_NUNITS (GET_MODE (unspec)) >> 1; + int lane = INTVAL (XVECEXP (unspec, 0, 1)); + lane = lane >= half_elts ? lane - half_elts : lane + half_elts; + XVECEXP (unspec, 0, 1) = GEN_INT (lane); + INSN_CODE (insn) = -1; /* Force re-recognition. */ + df_insn_rescan (insn); + + if (dump_file) + fprintf (dump_file, "Changing lane for splat %d\n", INSN_UID (insn)); +} + +/* Given OP that contains an XXPERMDI operation (that is not a doubleword + swap), reverse the order of the source operands and adjust the indices + of the source lanes to account for doubleword reversal. */ +static void +adjust_xxpermdi (rtx_insn *insn) +{ + rtx set = PATTERN (insn); + rtx select = XEXP (set, 1); + rtx concat = XEXP (select, 0); + rtx src0 = XEXP (concat, 0); + XEXP (concat, 0) = XEXP (concat, 1); + XEXP (concat, 1) = src0; + rtx parallel = XEXP (select, 1); + int lane0 = INTVAL (XVECEXP (parallel, 0, 0)); + int lane1 = INTVAL (XVECEXP (parallel, 0, 1)); + int new_lane0 = 3 - lane1; + int new_lane1 = 3 - lane0; + XVECEXP (parallel, 0, 0) = GEN_INT (new_lane0); + XVECEXP (parallel, 0, 1) = GEN_INT (new_lane1); + INSN_CODE (insn) = -1; /* Force re-recognition. */ + df_insn_rescan (insn); + + if (dump_file) + fprintf (dump_file, "Changing lanes for xxpermdi %d\n", INSN_UID (insn)); +} + +/* Given OP that contains a VEC_CONCAT operation of two doublewords, + reverse the order of those inputs. */ +static void +adjust_concat (rtx_insn *insn) +{ + rtx set = PATTERN (insn); + rtx concat = XEXP (set, 1); + rtx src0 = XEXP (concat, 0); + XEXP (concat, 0) = XEXP (concat, 1); + XEXP (concat, 1) = src0; + INSN_CODE (insn) = -1; /* Force re-recognition. */ + df_insn_rescan (insn); + + if (dump_file) + fprintf (dump_file, "Reversing inputs for concat %d\n", INSN_UID (insn)); +} + +/* Given an UNSPEC_VPERM insn, modify the mask loaded from the + constant pool to reflect swapped doublewords. */ +static void +adjust_vperm (rtx_insn *insn) +{ + /* We previously determined that the UNSPEC_VPERM was fed by a + swap of a swapping load of a TOC-relative constant pool symbol. + Find the MEM in the swapping load and replace it with a MEM for + the adjusted mask constant. */ + rtx set = PATTERN (insn); + rtx mask_reg = XVECEXP (SET_SRC (set), 0, 2); + + /* Find the swap. */ + struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); + df_ref use; + rtx_insn *swap_insn = 0; + FOR_EACH_INSN_INFO_USE (use, insn_info) + if (rtx_equal_p (DF_REF_REG (use), mask_reg)) + { + struct df_link *def_link = DF_REF_CHAIN (use); + gcc_assert (def_link && !def_link->next); + swap_insn = DF_REF_INSN (def_link->ref); + break; + } + gcc_assert (swap_insn); + + /* Find the load. 
*/ + insn_info = DF_INSN_INFO_GET (swap_insn); + rtx_insn *load_insn = 0; + FOR_EACH_INSN_INFO_USE (use, insn_info) + { + struct df_link *def_link = DF_REF_CHAIN (use); + gcc_assert (def_link && !def_link->next); + load_insn = DF_REF_INSN (def_link->ref); + break; + } + gcc_assert (load_insn); + + /* Find the TOC-relative symbol access. */ + insn_info = DF_INSN_INFO_GET (load_insn); + rtx_insn *tocrel_insn = 0; + FOR_EACH_INSN_INFO_USE (use, insn_info) + { + struct df_link *def_link = DF_REF_CHAIN (use); + gcc_assert (def_link && !def_link->next); + tocrel_insn = DF_REF_INSN (def_link->ref); + break; + } + gcc_assert (tocrel_insn); + + /* Find the embedded CONST_VECTOR. We have to call toc_relative_expr_p + to set tocrel_base; otherwise it would be unnecessary as we've + already established it will return true. */ + rtx base, offset; + const_rtx tocrel_base; + rtx tocrel_expr = SET_SRC (PATTERN (tocrel_insn)); + /* There is an extra level of indirection for small/large code models. */ + if (GET_CODE (tocrel_expr) == MEM) + tocrel_expr = XEXP (tocrel_expr, 0); + if (!toc_relative_expr_p (tocrel_expr, false, &tocrel_base, NULL)) + gcc_unreachable (); + split_const (XVECEXP (tocrel_base, 0, 0), &base, &offset); + rtx const_vector = get_pool_constant (base); + /* With the extra indirection, get_pool_constant will produce the + real constant from the reg_equal expression, so get the real + constant. */ + if (GET_CODE (const_vector) == SYMBOL_REF) + const_vector = get_pool_constant (const_vector); + gcc_assert (GET_CODE (const_vector) == CONST_VECTOR); + + /* Create an adjusted mask from the initial mask. */ + unsigned int new_mask[16], i, val; + for (i = 0; i < 16; ++i) { + val = INTVAL (XVECEXP (const_vector, 0, i)); + if (val < 16) + new_mask[i] = (val + 8) % 16; + else + new_mask[i] = ((val + 8) % 16) + 16; + } + + /* Create a new CONST_VECTOR and a MEM that references it. */ + rtx vals = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16)); + for (i = 0; i < 16; ++i) + XVECEXP (vals, 0, i) = GEN_INT (new_mask[i]); + rtx new_const_vector = gen_rtx_CONST_VECTOR (V16QImode, XVEC (vals, 0)); + rtx new_mem = force_const_mem (V16QImode, new_const_vector); + /* This gives us a MEM whose base operand is a SYMBOL_REF, which we + can't recognize. Force the SYMBOL_REF into a register. */ + if (!REG_P (XEXP (new_mem, 0))) { + rtx base_reg = force_reg (Pmode, XEXP (new_mem, 0)); + XEXP (new_mem, 0) = base_reg; + /* Move the newly created insn ahead of the load insn. */ + rtx_insn *force_insn = get_last_insn (); + remove_insn (force_insn); + rtx_insn *before_load_insn = PREV_INSN (load_insn); + add_insn_after (force_insn, before_load_insn, BLOCK_FOR_INSN (load_insn)); + df_insn_rescan (before_load_insn); + df_insn_rescan (force_insn); + } + + /* Replace the MEM in the load instruction and rescan it. */ + XEXP (SET_SRC (PATTERN (load_insn)), 0) = new_mem; + INSN_CODE (load_insn) = -1; /* Force re-recognition. */ + df_insn_rescan (load_insn); + + if (dump_file) + fprintf (dump_file, "Adjusting mask for vperm %d\n", INSN_UID (insn)); +} + +/* The insn described by INSN_ENTRY[I] can be swapped, but only + with special handling. Take care of that here. */ +static void +handle_special_swappables (swap_web_entry *insn_entry, unsigned i) +{ + rtx_insn *insn = insn_entry[i].insn; + rtx body = PATTERN (insn); + + switch (insn_entry[i].special_handling) + { + default: + gcc_unreachable (); + case SH_CONST_VECTOR: + { + /* A CONST_VECTOR will only show up somewhere in the RHS of a SET. 
*/ + gcc_assert (GET_CODE (body) == SET); + rtx rhs = SET_SRC (body); + swap_const_vector_halves (rhs); + if (dump_file) + fprintf (dump_file, "Swapping constant halves in insn %d\n", i); + break; + } + case SH_SUBREG: + /* A subreg of the same size is already safe. For subregs that + select a smaller portion of a reg, adjust the index for + swapped doublewords. */ + adjust_subreg_index (body); + if (dump_file) + fprintf (dump_file, "Adjusting subreg in insn %d\n", i); + break; + case SH_NOSWAP_LD: + /* Convert a non-permuting load to a permuting one. */ + permute_load (insn); + break; + case SH_NOSWAP_ST: + /* Convert a non-permuting store to a permuting one. */ + permute_store (insn); + break; + case SH_EXTRACT: + /* Change the lane on an extract operation. */ + adjust_extract (insn); + break; + case SH_SPLAT: + /* Change the lane on a direct-splat operation. */ + adjust_splat (insn); + break; + case SH_XXPERMDI: + /* Change the lanes on an XXPERMDI operation. */ + adjust_xxpermdi (insn); + break; + case SH_CONCAT: + /* Reverse the order of a concatenation operation. */ + adjust_concat (insn); + break; + case SH_VPERM: + /* Change the mask loaded from the constant pool for a VPERM. */ + adjust_vperm (insn); + break; + } +} + +/* Find the insn from the Ith table entry, which is known to be a + register swap Y = SWAP(X). Replace it with a copy Y = X. */ +static void +replace_swap_with_copy (swap_web_entry *insn_entry, unsigned i) +{ + rtx_insn *insn = insn_entry[i].insn; + rtx body = PATTERN (insn); + rtx src_reg = XEXP (SET_SRC (body), 0); + rtx copy = gen_rtx_SET (SET_DEST (body), src_reg); + rtx_insn *new_insn = emit_insn_before (copy, insn); + set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn)); + df_insn_rescan (new_insn); + + if (dump_file) + { + unsigned int new_uid = INSN_UID (new_insn); + fprintf (dump_file, "Replacing swap %d with copy %d\n", i, new_uid); + } + + df_insn_delete (insn); + remove_insn (insn); + insn->set_deleted (); +} + +/* Dump the swap table to DUMP_FILE. */ +static void +dump_swap_insn_table (swap_web_entry *insn_entry) +{ + int e = get_max_uid (); + fprintf (dump_file, "\nRelevant insns with their flag settings\n\n"); + + for (int i = 0; i < e; ++i) + if (insn_entry[i].is_relevant) + { + swap_web_entry *pred_entry = (swap_web_entry *)insn_entry[i].pred (); + fprintf (dump_file, "%6d %6d ", i, + pred_entry && pred_entry->insn + ? 
INSN_UID (pred_entry->insn) : 0); + if (insn_entry[i].is_load) + fputs ("load ", dump_file); + if (insn_entry[i].is_store) + fputs ("store ", dump_file); + if (insn_entry[i].is_swap) + fputs ("swap ", dump_file); + if (insn_entry[i].is_live_in) + fputs ("live-in ", dump_file); + if (insn_entry[i].is_live_out) + fputs ("live-out ", dump_file); + if (insn_entry[i].contains_subreg) + fputs ("subreg ", dump_file); + if (insn_entry[i].is_128_int) + fputs ("int128 ", dump_file); + if (insn_entry[i].is_call) + fputs ("call ", dump_file); + if (insn_entry[i].is_swappable) + { + fputs ("swappable ", dump_file); + if (insn_entry[i].special_handling == SH_CONST_VECTOR) + fputs ("special:constvec ", dump_file); + else if (insn_entry[i].special_handling == SH_SUBREG) + fputs ("special:subreg ", dump_file); + else if (insn_entry[i].special_handling == SH_NOSWAP_LD) + fputs ("special:load ", dump_file); + else if (insn_entry[i].special_handling == SH_NOSWAP_ST) + fputs ("special:store ", dump_file); + else if (insn_entry[i].special_handling == SH_EXTRACT) + fputs ("special:extract ", dump_file); + else if (insn_entry[i].special_handling == SH_SPLAT) + fputs ("special:splat ", dump_file); + else if (insn_entry[i].special_handling == SH_XXPERMDI) + fputs ("special:xxpermdi ", dump_file); + else if (insn_entry[i].special_handling == SH_CONCAT) + fputs ("special:concat ", dump_file); + else if (insn_entry[i].special_handling == SH_VPERM) + fputs ("special:vperm ", dump_file); + } + if (insn_entry[i].web_not_optimizable) + fputs ("unoptimizable ", dump_file); + if (insn_entry[i].will_delete) + fputs ("delete ", dump_file); + fputs ("\n", dump_file); + } + fputs ("\n", dump_file); +} + +/* Return RTX with its address canonicalized to (reg) or (+ reg reg). + Here RTX is an (& addr (const_int -16)). Always return a new copy + to avoid problems with combine. */ +static rtx +alignment_with_canonical_addr (rtx align) +{ + rtx canon; + rtx addr = XEXP (align, 0); + + if (REG_P (addr)) + canon = addr; + + else if (GET_CODE (addr) == PLUS) + { + rtx addrop0 = XEXP (addr, 0); + rtx addrop1 = XEXP (addr, 1); + + if (!REG_P (addrop0)) + addrop0 = force_reg (GET_MODE (addrop0), addrop0); + + if (!REG_P (addrop1)) + addrop1 = force_reg (GET_MODE (addrop1), addrop1); + + canon = gen_rtx_PLUS (GET_MODE (addr), addrop0, addrop1); + } + + else + canon = force_reg (GET_MODE (addr), addr); + + return gen_rtx_AND (GET_MODE (align), canon, GEN_INT (-16)); +} + +/* Check whether an rtx is an alignment mask, and if so, return + a fully-expanded rtx for the masking operation. 
*/ +static rtx +alignment_mask (rtx_insn *insn) +{ + rtx body = PATTERN (insn); + + if (GET_CODE (body) != SET + || GET_CODE (SET_SRC (body)) != AND + || !REG_P (XEXP (SET_SRC (body), 0))) + return 0; + + rtx mask = XEXP (SET_SRC (body), 1); + + if (GET_CODE (mask) == CONST_INT) + { + if (INTVAL (mask) == -16) + return alignment_with_canonical_addr (SET_SRC (body)); + else + return 0; + } + + if (!REG_P (mask)) + return 0; + + struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); + df_ref use; + rtx real_mask = 0; + + FOR_EACH_INSN_INFO_USE (use, insn_info) + { + if (!rtx_equal_p (DF_REF_REG (use), mask)) + continue; + + struct df_link *def_link = DF_REF_CHAIN (use); + if (!def_link || def_link->next) + return 0; + + rtx_insn *const_insn = DF_REF_INSN (def_link->ref); + rtx const_body = PATTERN (const_insn); + if (GET_CODE (const_body) != SET) + return 0; + + real_mask = SET_SRC (const_body); + + if (GET_CODE (real_mask) != CONST_INT + || INTVAL (real_mask) != -16) + return 0; + } + + if (real_mask == 0) + return 0; + + return alignment_with_canonical_addr (SET_SRC (body)); +} + +/* Given INSN that's a load or store based at BASE_REG, look for a + feeding computation that aligns its address on a 16-byte boundary. */ +static rtx +find_alignment_op (rtx_insn *insn, rtx base_reg) +{ + df_ref base_use; + struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); + rtx and_operation = 0; + + FOR_EACH_INSN_INFO_USE (base_use, insn_info) + { + if (!rtx_equal_p (DF_REF_REG (base_use), base_reg)) + continue; + + struct df_link *base_def_link = DF_REF_CHAIN (base_use); + if (!base_def_link || base_def_link->next) + break; + + /* With stack-protector code enabled, and possibly in other + circumstances, there may not be an associated insn for + the def. */ + if (DF_REF_IS_ARTIFICIAL (base_def_link->ref)) + break; + + rtx_insn *and_insn = DF_REF_INSN (base_def_link->ref); + and_operation = alignment_mask (and_insn); + if (and_operation != 0) + break; + } + + return and_operation; +} + +struct del_info { bool replace; rtx_insn *replace_insn; }; + +/* If INSN is the load for an lvx pattern, put it in canonical form. */ +static void +recombine_lvx_pattern (rtx_insn *insn, del_info *to_delete) +{ + rtx body = PATTERN (insn); + gcc_assert (GET_CODE (body) == SET + && GET_CODE (SET_SRC (body)) == VEC_SELECT + && GET_CODE (XEXP (SET_SRC (body), 0)) == MEM); + + rtx mem = XEXP (SET_SRC (body), 0); + rtx base_reg = XEXP (mem, 0); + + rtx and_operation = find_alignment_op (insn, base_reg); + + if (and_operation != 0) + { + df_ref def; + struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); + FOR_EACH_INSN_INFO_DEF (def, insn_info) + { + struct df_link *link = DF_REF_CHAIN (def); + if (!link || link->next) + break; + + rtx_insn *swap_insn = DF_REF_INSN (link->ref); + if (!insn_is_swap_p (swap_insn) + || insn_is_load_p (swap_insn) + || insn_is_store_p (swap_insn)) + break; + + /* Expected lvx pattern found. Change the swap to + a copy, and propagate the AND operation into the + load. */ + to_delete[INSN_UID (swap_insn)].replace = true; + to_delete[INSN_UID (swap_insn)].replace_insn = swap_insn; + + XEXP (mem, 0) = and_operation; + SET_SRC (body) = mem; + INSN_CODE (insn) = -1; /* Force re-recognition. */ + df_insn_rescan (insn); + + if (dump_file) + fprintf (dump_file, "lvx opportunity found at %d\n", + INSN_UID (insn)); + } + } +} + +/* If INSN is the store for an stvx pattern, put it in canonical form. 
*/ +static void +recombine_stvx_pattern (rtx_insn *insn, del_info *to_delete) +{ + rtx body = PATTERN (insn); + gcc_assert (GET_CODE (body) == SET + && GET_CODE (SET_DEST (body)) == MEM + && GET_CODE (SET_SRC (body)) == VEC_SELECT); + rtx mem = SET_DEST (body); + rtx base_reg = XEXP (mem, 0); + + rtx and_operation = find_alignment_op (insn, base_reg); + + if (and_operation != 0) + { + rtx src_reg = XEXP (SET_SRC (body), 0); + df_ref src_use; + struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); + FOR_EACH_INSN_INFO_USE (src_use, insn_info) + { + if (!rtx_equal_p (DF_REF_REG (src_use), src_reg)) + continue; + + struct df_link *link = DF_REF_CHAIN (src_use); + if (!link || link->next) + break; + + rtx_insn *swap_insn = DF_REF_INSN (link->ref); + if (!insn_is_swap_p (swap_insn) + || insn_is_load_p (swap_insn) + || insn_is_store_p (swap_insn)) + break; + + /* Expected stvx pattern found. Change the swap to + a copy, and propagate the AND operation into the + store. */ + to_delete[INSN_UID (swap_insn)].replace = true; + to_delete[INSN_UID (swap_insn)].replace_insn = swap_insn; + + XEXP (mem, 0) = and_operation; + SET_SRC (body) = src_reg; + INSN_CODE (insn) = -1; /* Force re-recognition. */ + df_insn_rescan (insn); + + if (dump_file) + fprintf (dump_file, "stvx opportunity found at %d\n", + INSN_UID (insn)); + } + } +} + +/* Look for patterns created from builtin lvx and stvx calls, and + canonicalize them to be properly recognized as such. */ +static void +recombine_lvx_stvx_patterns (function *fun) +{ + int i; + basic_block bb; + rtx_insn *insn; + + int num_insns = get_max_uid (); + del_info *to_delete = XCNEWVEC (del_info, num_insns); + + FOR_ALL_BB_FN (bb, fun) + FOR_BB_INSNS (bb, insn) + { + if (!NONDEBUG_INSN_P (insn)) + continue; + + if (insn_is_load_p (insn) && insn_is_swap_p (insn)) + recombine_lvx_pattern (insn, to_delete); + else if (insn_is_store_p (insn) && insn_is_swap_p (insn)) + recombine_stvx_pattern (insn, to_delete); + } + + /* Turning swaps into copies is delayed until now, to avoid problems + with deleting instructions during the insn walk. */ + for (i = 0; i < num_insns; i++) + if (to_delete[i].replace) + { + rtx swap_body = PATTERN (to_delete[i].replace_insn); + rtx src_reg = XEXP (SET_SRC (swap_body), 0); + rtx copy = gen_rtx_SET (SET_DEST (swap_body), src_reg); + rtx_insn *new_insn = emit_insn_before (copy, + to_delete[i].replace_insn); + set_block_for_insn (new_insn, + BLOCK_FOR_INSN (to_delete[i].replace_insn)); + df_insn_rescan (new_insn); + df_insn_delete (to_delete[i].replace_insn); + remove_insn (to_delete[i].replace_insn); + to_delete[i].replace_insn->set_deleted (); + } + + free (to_delete); +} + +/* Main entry point for this pass. */ +unsigned int +rs6000_analyze_swaps (function *fun) +{ + swap_web_entry *insn_entry; + basic_block bb; + rtx_insn *insn, *curr_insn = 0; + + /* Dataflow analysis for use-def chains. */ + df_set_flags (DF_RD_PRUNE_DEAD_DEFS); + df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN); + df_analyze (); + df_set_flags (DF_DEFER_INSN_RESCAN); + + /* Pre-pass to recombine lvx and stvx patterns so we don't lose info. */ + recombine_lvx_stvx_patterns (fun); + + /* Allocate structure to represent webs of insns. */ + insn_entry = XCNEWVEC (swap_web_entry, get_max_uid ()); + + /* Walk the insns to gather basic data. 
*/ + FOR_ALL_BB_FN (bb, fun) + FOR_BB_INSNS_SAFE (bb, insn, curr_insn) + { + unsigned int uid = INSN_UID (insn); + if (NONDEBUG_INSN_P (insn)) + { + insn_entry[uid].insn = insn; + + if (GET_CODE (insn) == CALL_INSN) + insn_entry[uid].is_call = 1; + + /* Walk the uses and defs to see if we mention vector regs. + Record any constraints on optimization of such mentions. */ + struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); + df_ref mention; + FOR_EACH_INSN_INFO_USE (mention, insn_info) + { + /* We use DF_REF_REAL_REG here to get inside any subregs. */ + machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention)); + + /* If a use gets its value from a call insn, it will be + a hard register and will look like (reg:V4SI 3 3). + The df analysis creates two mentions for GPR3 and GPR4, + both DImode. We must recognize this and treat it as a + vector mention to ensure the call is unioned with this + use. */ + if (mode == DImode && DF_REF_INSN_INFO (mention)) + { + rtx feeder = DF_REF_INSN (mention); + /* FIXME: It is pretty hard to get from the df mention + to the mode of the use in the insn. We arbitrarily + pick a vector mode here, even though the use might + be a real DImode. We can be too conservative + (create a web larger than necessary) because of + this, so consider eventually fixing this. */ + if (GET_CODE (feeder) == CALL_INSN) + mode = V4SImode; + } + + if (ALTIVEC_OR_VSX_VECTOR_MODE (mode) || mode == TImode) + { + insn_entry[uid].is_relevant = 1; + if (mode == TImode || mode == V1TImode + || FLOAT128_VECTOR_P (mode)) + insn_entry[uid].is_128_int = 1; + if (DF_REF_INSN_INFO (mention)) + insn_entry[uid].contains_subreg + = !rtx_equal_p (DF_REF_REG (mention), + DF_REF_REAL_REG (mention)); + union_defs (insn_entry, insn, mention); + } + } + FOR_EACH_INSN_INFO_DEF (mention, insn_info) + { + /* We use DF_REF_REAL_REG here to get inside any subregs. */ + machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention)); + + /* If we're loading up a hard vector register for a call, + it looks like (set (reg:V4SI 9 9) (...)). The df + analysis creates two mentions for GPR9 and GPR10, both + DImode. So relying on the mode from the mentions + isn't sufficient to ensure we union the call into the + web with the parameter setup code. */ + if (mode == DImode && GET_CODE (insn) == SET + && ALTIVEC_OR_VSX_VECTOR_MODE (GET_MODE (SET_DEST (insn)))) + mode = GET_MODE (SET_DEST (insn)); + + if (ALTIVEC_OR_VSX_VECTOR_MODE (mode) || mode == TImode) + { + insn_entry[uid].is_relevant = 1; + if (mode == TImode || mode == V1TImode + || FLOAT128_VECTOR_P (mode)) + insn_entry[uid].is_128_int = 1; + if (DF_REF_INSN_INFO (mention)) + insn_entry[uid].contains_subreg + = !rtx_equal_p (DF_REF_REG (mention), + DF_REF_REAL_REG (mention)); + /* REG_FUNCTION_VALUE_P is not valid for subregs. */ + else if (REG_FUNCTION_VALUE_P (DF_REF_REG (mention))) + insn_entry[uid].is_live_out = 1; + union_uses (insn_entry, insn, mention); + } + } + + if (insn_entry[uid].is_relevant) + { + /* Determine if this is a load or store. */ + insn_entry[uid].is_load = insn_is_load_p (insn); + insn_entry[uid].is_store = insn_is_store_p (insn); + + /* Determine if this is a doubleword swap. If not, + determine whether it can legally be swapped. 
*/ + if (insn_is_swap_p (insn)) + insn_entry[uid].is_swap = 1; + else + { + unsigned int special = SH_NONE; + insn_entry[uid].is_swappable + = insn_is_swappable_p (insn_entry, insn, &special); + if (special != SH_NONE && insn_entry[uid].contains_subreg) + insn_entry[uid].is_swappable = 0; + else if (special != SH_NONE) + insn_entry[uid].special_handling = special; + else if (insn_entry[uid].contains_subreg) + insn_entry[uid].special_handling = SH_SUBREG; + } + } + } + } + + if (dump_file) + { + fprintf (dump_file, "\nSwap insn entry table when first built\n"); + dump_swap_insn_table (insn_entry); + } + + /* Record unoptimizable webs. */ + unsigned e = get_max_uid (), i; + for (i = 0; i < e; ++i) + { + if (!insn_entry[i].is_relevant) + continue; + + swap_web_entry *root + = (swap_web_entry*)(&insn_entry[i])->unionfind_root (); + + if (insn_entry[i].is_live_in || insn_entry[i].is_live_out + || (insn_entry[i].contains_subreg + && insn_entry[i].special_handling != SH_SUBREG) + || insn_entry[i].is_128_int || insn_entry[i].is_call + || !(insn_entry[i].is_swappable || insn_entry[i].is_swap)) + root->web_not_optimizable = 1; + + /* If we have loads or stores that aren't permuting then the + optimization isn't appropriate. */ + else if ((insn_entry[i].is_load || insn_entry[i].is_store) + && !insn_entry[i].is_swap && !insn_entry[i].is_swappable) + root->web_not_optimizable = 1; + + /* If we have permuting loads or stores that are not accompanied + by a register swap, the optimization isn't appropriate. */ + else if (insn_entry[i].is_load && insn_entry[i].is_swap) + { + rtx insn = insn_entry[i].insn; + struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); + df_ref def; + + FOR_EACH_INSN_INFO_DEF (def, insn_info) + { + struct df_link *link = DF_REF_CHAIN (def); + + if (!chain_contains_only_swaps (insn_entry, link, FOR_LOADS)) + { + root->web_not_optimizable = 1; + break; + } + } + } + else if (insn_entry[i].is_store && insn_entry[i].is_swap) + { + rtx insn = insn_entry[i].insn; + struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); + df_ref use; + + FOR_EACH_INSN_INFO_USE (use, insn_info) + { + struct df_link *link = DF_REF_CHAIN (use); + + if (!chain_contains_only_swaps (insn_entry, link, FOR_STORES)) + { + root->web_not_optimizable = 1; + break; + } + } + } + } + + if (dump_file) + { + fprintf (dump_file, "\nSwap insn entry table after web analysis\n"); + dump_swap_insn_table (insn_entry); + } + + /* For each load and store in an optimizable web (which implies + the loads and stores are permuting), find the associated + register swaps and mark them for removal. Due to various + optimizations we may mark the same swap more than once. Also + perform special handling for swappable insns that require it. */ + for (i = 0; i < e; ++i) + if ((insn_entry[i].is_load || insn_entry[i].is_store) + && insn_entry[i].is_swap) + { + swap_web_entry* root_entry + = (swap_web_entry*)((&insn_entry[i])->unionfind_root ()); + if (!root_entry->web_not_optimizable) + mark_swaps_for_removal (insn_entry, i); + } + else if (insn_entry[i].is_swappable && insn_entry[i].special_handling) + { + swap_web_entry* root_entry + = (swap_web_entry*)((&insn_entry[i])->unionfind_root ()); + if (!root_entry->web_not_optimizable) + handle_special_swappables (insn_entry, i); + } + + /* Now delete the swaps marked for removal. */ + for (i = 0; i < e; ++i) + if (insn_entry[i].will_delete) + replace_swap_with_copy (insn_entry, i); + + /* Clean up. 
*/ + free (insn_entry); + return 0; +} + +const pass_data pass_data_analyze_swaps = +{ + RTL_PASS, /* type */ + "swaps", /* name */ + OPTGROUP_NONE, /* optinfo_flags */ + TV_NONE, /* tv_id */ + 0, /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + TODO_df_finish, /* todo_flags_finish */ +}; + +class pass_analyze_swaps : public rtl_opt_pass +{ +public: + pass_analyze_swaps(gcc::context *ctxt) + : rtl_opt_pass(pass_data_analyze_swaps, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) + { + return (optimize > 0 && !BYTES_BIG_ENDIAN && TARGET_VSX + && !TARGET_P9_VECTOR && rs6000_optimize_swaps); + } + + virtual unsigned int execute (function *fun) + { + return rs6000_analyze_swaps (fun); + } + + opt_pass *clone () + { + return new pass_analyze_swaps (m_ctxt); + } + +}; // class pass_analyze_swaps + +rtl_opt_pass * +make_pass_analyze_swaps (gcc::context *ctxt) +{ + return new pass_analyze_swaps (ctxt); +} + diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index f2680854f33..200f43acde8 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -39030,1862 +39030,6 @@ emit_fusion_p9_store (rtx mem, rtx reg, rtx tmp_reg) return ""; } - -/* Analyze vector computations and remove unnecessary doubleword - swaps (xxswapdi instructions). This pass is performed only - for little-endian VSX code generation. - - For this specific case, loads and stores of 4x32 and 2x64 vectors - are inefficient. These are implemented using the lvx2dx and - stvx2dx instructions, which invert the order of doublewords in - a vector register. Thus the code generation inserts an xxswapdi - after each such load, and prior to each such store. (For spill - code after register assignment, an additional xxswapdi is inserted - following each store in order to return a hard register to its - unpermuted value.) - - The extra xxswapdi instructions reduce performance. This can be - particularly bad for vectorized code. The purpose of this pass - is to reduce the number of xxswapdi instructions required for - correctness. - - The primary insight is that much code that operates on vectors - does not care about the relative order of elements in a register, - so long as the correct memory order is preserved. If we have - a computation where all input values are provided by lvxd2x/xxswapdi - sequences, all outputs are stored using xxswapdi/stvxd2x sequences, - and all intermediate computations are pure SIMD (independent of - element order), then all the xxswapdi's associated with the loads - and stores may be removed. - - This pass uses some of the infrastructure and logical ideas from - the "web" pass in web.c. We create maximal webs of computations - fitting the description above using union-find. Each such web is - then optimized by removing its unnecessary xxswapdi instructions. - - The pass is placed prior to global optimization so that we can - perform the optimization in the safest and simplest way possible; - that is, by replacing each xxswapdi insn with a register copy insn. - Subsequent forward propagation will remove copies where possible. - - There are some operations sensitive to element order for which we - can still allow the operation, provided we modify those operations. - These include CONST_VECTORs, for which we must swap the first and - second halves of the constant vector; and SUBREGs, for which we - must adjust the byte offset to account for the swapped doublewords. 
- A remaining opportunity would be non-immediate-form splats, for - which we should adjust the selected lane of the input. We should - also make code generation adjustments for sum-across operations, - since this is a common vectorizer reduction. - - Because we run prior to the first split, we can see loads and stores - here that match *vsx_le_perm_{load,store}_. These are vanilla - vector loads and stores that have not yet been split into a permuting - load/store and a swap. (One way this can happen is with a builtin - call to vec_vsx_{ld,st}.) We can handle these as well, but rather - than deleting a swap, we convert the load/store into a permuting - load/store (which effectively removes the swap). */ - -/* Notes on Permutes - - We do not currently handle computations that contain permutes. There - is a general transformation that can be performed correctly, but it - may introduce more expensive code than it replaces. To handle these - would require a cost model to determine when to perform the optimization. - This commentary records how this could be done if desired. - - The most general permute is something like this (example for V16QI): - - (vec_select:V16QI (vec_concat:V32QI (op1:V16QI) (op2:V16QI)) - (parallel [(const_int a0) (const_int a1) - ... - (const_int a14) (const_int a15)])) - - where a0,...,a15 are in [0,31] and select elements from op1 and op2 - to produce in the result. - - Regardless of mode, we can convert the PARALLEL to a mask of 16 - byte-element selectors. Let's call this M, with M[i] representing - the ith byte-element selector value. Then if we swap doublewords - throughout the computation, we can get correct behavior by replacing - M with M' as follows: - - M'[i] = { (M[i]+8)%16 : M[i] in [0,15] - { ((M[i]+8)%16)+16 : M[i] in [16,31] - - This seems promising at first, since we are just replacing one mask - with another. But certain masks are preferable to others. If M - is a mask that matches a vmrghh pattern, for example, M' certainly - will not. Instead of a single vmrghh, we would generate a load of - M' and a vperm. So we would need to know how many xxswapd's we can - remove as a result of this transformation to determine if it's - profitable; and preferably the logic would need to be aware of all - the special preferable masks. - - Another form of permute is an UNSPEC_VPERM, in which the mask is - already in a register. In some cases, this mask may be a constant - that we can discover with ud-chains, in which case the above - transformation is ok. However, the common usage here is for the - mask to be produced by an UNSPEC_LVSL, in which case the mask - cannot be known at compile time. In such a case we would have to - generate several instructions to compute M' as above at run time, - and a cost model is needed again. - - However, when the mask M for an UNSPEC_VPERM is loaded from the - constant pool, we can replace M with M' as above at no cost - beyond adding a constant pool entry. */ - -/* This is based on the union-find logic in web.c. web_entry_base is - defined in df.h. */ -class swap_web_entry : public web_entry_base -{ - public: - /* Pointer to the insn. */ - rtx_insn *insn; - /* Set if insn contains a mention of a vector register. All other - fields are undefined if this field is unset. */ - unsigned int is_relevant : 1; - /* Set if insn is a load. */ - unsigned int is_load : 1; - /* Set if insn is a store. */ - unsigned int is_store : 1; - /* Set if insn is a doubleword swap. 
This can either be a register swap - or a permuting load or store (test is_load and is_store for this). */ - unsigned int is_swap : 1; - /* Set if the insn has a live-in use of a parameter register. */ - unsigned int is_live_in : 1; - /* Set if the insn has a live-out def of a return register. */ - unsigned int is_live_out : 1; - /* Set if the insn contains a subreg reference of a vector register. */ - unsigned int contains_subreg : 1; - /* Set if the insn contains a 128-bit integer operand. */ - unsigned int is_128_int : 1; - /* Set if this is a call-insn. */ - unsigned int is_call : 1; - /* Set if this insn does not perform a vector operation for which - element order matters, or if we know how to fix it up if it does. - Undefined if is_swap is set. */ - unsigned int is_swappable : 1; - /* A nonzero value indicates what kind of special handling for this - insn is required if doublewords are swapped. Undefined if - is_swappable is not set. */ - unsigned int special_handling : 4; - /* Set if the web represented by this entry cannot be optimized. */ - unsigned int web_not_optimizable : 1; - /* Set if this insn should be deleted. */ - unsigned int will_delete : 1; -}; - -enum special_handling_values { - SH_NONE = 0, - SH_CONST_VECTOR, - SH_SUBREG, - SH_NOSWAP_LD, - SH_NOSWAP_ST, - SH_EXTRACT, - SH_SPLAT, - SH_XXPERMDI, - SH_CONCAT, - SH_VPERM -}; - -/* Union INSN with all insns containing definitions that reach USE. - Detect whether USE is live-in to the current function. */ -static void -union_defs (swap_web_entry *insn_entry, rtx insn, df_ref use) -{ - struct df_link *link = DF_REF_CHAIN (use); - - if (!link) - insn_entry[INSN_UID (insn)].is_live_in = 1; - - while (link) - { - if (DF_REF_IS_ARTIFICIAL (link->ref)) - insn_entry[INSN_UID (insn)].is_live_in = 1; - - if (DF_REF_INSN_INFO (link->ref)) - { - rtx def_insn = DF_REF_INSN (link->ref); - (void)unionfind_union (insn_entry + INSN_UID (insn), - insn_entry + INSN_UID (def_insn)); - } - - link = link->next; - } -} - -/* Union INSN with all insns containing uses reached from DEF. - Detect whether DEF is live-out from the current function. */ -static void -union_uses (swap_web_entry *insn_entry, rtx insn, df_ref def) -{ - struct df_link *link = DF_REF_CHAIN (def); - - if (!link) - insn_entry[INSN_UID (insn)].is_live_out = 1; - - while (link) - { - /* This could be an eh use or some other artificial use; - we treat these all the same (killing the optimization). */ - if (DF_REF_IS_ARTIFICIAL (link->ref)) - insn_entry[INSN_UID (insn)].is_live_out = 1; - - if (DF_REF_INSN_INFO (link->ref)) - { - rtx use_insn = DF_REF_INSN (link->ref); - (void)unionfind_union (insn_entry + INSN_UID (insn), - insn_entry + INSN_UID (use_insn)); - } - - link = link->next; - } -} - -/* Return 1 iff INSN is a load insn, including permuting loads that - represent an lvxd2x instruction; else return 0. */ -static unsigned int -insn_is_load_p (rtx insn) -{ - rtx body = PATTERN (insn); - - if (GET_CODE (body) == SET) - { - if (GET_CODE (SET_SRC (body)) == MEM) - return 1; - - if (GET_CODE (SET_SRC (body)) == VEC_SELECT - && GET_CODE (XEXP (SET_SRC (body), 0)) == MEM) - return 1; - - return 0; - } - - if (GET_CODE (body) != PARALLEL) - return 0; - - rtx set = XVECEXP (body, 0, 0); - - if (GET_CODE (set) == SET && GET_CODE (SET_SRC (set)) == MEM) - return 1; - - return 0; -} - -/* Return 1 iff INSN is a store insn, including permuting stores that - represent an stvxd2x instruction; else return 0. 
*/ -static unsigned int -insn_is_store_p (rtx insn) -{ - rtx body = PATTERN (insn); - if (GET_CODE (body) == SET && GET_CODE (SET_DEST (body)) == MEM) - return 1; - if (GET_CODE (body) != PARALLEL) - return 0; - rtx set = XVECEXP (body, 0, 0); - if (GET_CODE (set) == SET && GET_CODE (SET_DEST (set)) == MEM) - return 1; - return 0; -} - -/* Return 1 iff INSN swaps doublewords. This may be a reg-reg swap, - a permuting load, or a permuting store. */ -static unsigned int -insn_is_swap_p (rtx insn) -{ - rtx body = PATTERN (insn); - if (GET_CODE (body) != SET) - return 0; - rtx rhs = SET_SRC (body); - if (GET_CODE (rhs) != VEC_SELECT) - return 0; - rtx parallel = XEXP (rhs, 1); - if (GET_CODE (parallel) != PARALLEL) - return 0; - unsigned int len = XVECLEN (parallel, 0); - if (len != 2 && len != 4 && len != 8 && len != 16) - return 0; - for (unsigned int i = 0; i < len / 2; ++i) - { - rtx op = XVECEXP (parallel, 0, i); - if (GET_CODE (op) != CONST_INT || INTVAL (op) != len / 2 + i) - return 0; - } - for (unsigned int i = len / 2; i < len; ++i) - { - rtx op = XVECEXP (parallel, 0, i); - if (GET_CODE (op) != CONST_INT || INTVAL (op) != i - len / 2) - return 0; - } - return 1; -} - -/* Return TRUE if insn is a swap fed by a load from the constant pool. */ -static bool -const_load_sequence_p (swap_web_entry *insn_entry, rtx insn) -{ - unsigned uid = INSN_UID (insn); - if (!insn_entry[uid].is_swap || insn_entry[uid].is_load) - return false; - - const_rtx tocrel_base; - - /* Find the unique use in the swap and locate its def. If the def - isn't unique, punt. */ - struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); - df_ref use; - FOR_EACH_INSN_INFO_USE (use, insn_info) - { - struct df_link *def_link = DF_REF_CHAIN (use); - if (!def_link || def_link->next) - return false; - - rtx def_insn = DF_REF_INSN (def_link->ref); - unsigned uid2 = INSN_UID (def_insn); - if (!insn_entry[uid2].is_load || !insn_entry[uid2].is_swap) - return false; - - rtx body = PATTERN (def_insn); - if (GET_CODE (body) != SET - || GET_CODE (SET_SRC (body)) != VEC_SELECT - || GET_CODE (XEXP (SET_SRC (body), 0)) != MEM) - return false; - - rtx mem = XEXP (SET_SRC (body), 0); - rtx base_reg = XEXP (mem, 0); - - df_ref base_use; - insn_info = DF_INSN_INFO_GET (def_insn); - FOR_EACH_INSN_INFO_USE (base_use, insn_info) - { - if (!rtx_equal_p (DF_REF_REG (base_use), base_reg)) - continue; - - struct df_link *base_def_link = DF_REF_CHAIN (base_use); - if (!base_def_link || base_def_link->next) - return false; - - rtx tocrel_insn = DF_REF_INSN (base_def_link->ref); - rtx tocrel_body = PATTERN (tocrel_insn); - rtx base, offset; - if (GET_CODE (tocrel_body) != SET) - return false; - /* There is an extra level of indirection for small/large - code models. */ - rtx tocrel_expr = SET_SRC (tocrel_body); - if (GET_CODE (tocrel_expr) == MEM) - tocrel_expr = XEXP (tocrel_expr, 0); - if (!toc_relative_expr_p (tocrel_expr, false, &tocrel_base, NULL)) - return false; - split_const (XVECEXP (tocrel_base, 0, 0), &base, &offset); - if (GET_CODE (base) != SYMBOL_REF || !CONSTANT_POOL_ADDRESS_P (base)) - return false; - } - } - return true; -} - -/* Return TRUE iff OP matches a V2DF reduction pattern. See the - definition of vsx_reduc__v2df in vsx.md. 
*/ -static bool -v2df_reduction_p (rtx op) -{ - if (GET_MODE (op) != V2DFmode) - return false; - - enum rtx_code code = GET_CODE (op); - if (code != PLUS && code != SMIN && code != SMAX) - return false; - - rtx concat = XEXP (op, 0); - if (GET_CODE (concat) != VEC_CONCAT) - return false; - - rtx select0 = XEXP (concat, 0); - rtx select1 = XEXP (concat, 1); - if (GET_CODE (select0) != VEC_SELECT || GET_CODE (select1) != VEC_SELECT) - return false; - - rtx reg0 = XEXP (select0, 0); - rtx reg1 = XEXP (select1, 0); - if (!rtx_equal_p (reg0, reg1) || !REG_P (reg0)) - return false; - - rtx parallel0 = XEXP (select0, 1); - rtx parallel1 = XEXP (select1, 1); - if (GET_CODE (parallel0) != PARALLEL || GET_CODE (parallel1) != PARALLEL) - return false; - - if (!rtx_equal_p (XVECEXP (parallel0, 0, 0), const1_rtx) - || !rtx_equal_p (XVECEXP (parallel1, 0, 0), const0_rtx)) - return false; - - return true; -} - -/* Return 1 iff OP is an operand that will not be affected by having - vector doublewords swapped in memory. */ -static unsigned int -rtx_is_swappable_p (rtx op, unsigned int *special) -{ - enum rtx_code code = GET_CODE (op); - int i, j; - rtx parallel; - - switch (code) - { - case LABEL_REF: - case SYMBOL_REF: - case CLOBBER: - case REG: - return 1; - - case VEC_CONCAT: - case ASM_INPUT: - case ASM_OPERANDS: - return 0; - - case CONST_VECTOR: - { - *special = SH_CONST_VECTOR; - return 1; - } - - case VEC_DUPLICATE: - /* Opportunity: If XEXP (op, 0) has the same mode as the result, - and XEXP (op, 1) is a PARALLEL with a single QImode const int, - it represents a vector splat for which we can do special - handling. */ - if (GET_CODE (XEXP (op, 0)) == CONST_INT) - return 1; - else if (REG_P (XEXP (op, 0)) - && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0))) - /* This catches V2DF and V2DI splat, at a minimum. */ - return 1; - else if (GET_CODE (XEXP (op, 0)) == TRUNCATE - && REG_P (XEXP (XEXP (op, 0), 0)) - && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0))) - /* This catches splat of a truncated value. */ - return 1; - else if (GET_CODE (XEXP (op, 0)) == VEC_SELECT) - /* If the duplicated item is from a select, defer to the select - processing to see if we can change the lane for the splat. */ - return rtx_is_swappable_p (XEXP (op, 0), special); - else - return 0; - - case VEC_SELECT: - /* A vec_extract operation is ok if we change the lane. */ - if (GET_CODE (XEXP (op, 0)) == REG - && GET_MODE_INNER (GET_MODE (XEXP (op, 0))) == GET_MODE (op) - && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL - && XVECLEN (parallel, 0) == 1 - && GET_CODE (XVECEXP (parallel, 0, 0)) == CONST_INT) - { - *special = SH_EXTRACT; - return 1; - } - /* An XXPERMDI is ok if we adjust the lanes. Note that if the - XXPERMDI is a swap operation, it will be identified by - insn_is_swap_p and therefore we won't get here. */ - else if (GET_CODE (XEXP (op, 0)) == VEC_CONCAT - && (GET_MODE (XEXP (op, 0)) == V4DFmode - || GET_MODE (XEXP (op, 0)) == V4DImode) - && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL - && XVECLEN (parallel, 0) == 2 - && GET_CODE (XVECEXP (parallel, 0, 0)) == CONST_INT - && GET_CODE (XVECEXP (parallel, 0, 1)) == CONST_INT) - { - *special = SH_XXPERMDI; - return 1; - } - else if (v2df_reduction_p (op)) - return 1; - else - return 0; - - case UNSPEC: - { - /* Various operations are unsafe for this optimization, at least - without significant additional work. 
Permutes are obviously - problematic, as both the permute control vector and the ordering - of the target values are invalidated by doubleword swapping. - Vector pack and unpack modify the number of vector lanes. - Merge-high/low will not operate correctly on swapped operands. - Vector shifts across element boundaries are clearly uncool, - as are vector select and concatenate operations. Vector - sum-across instructions define one operand with a specific - order-dependent element, so additional fixup code would be - needed to make those work. Vector set and non-immediate-form - vector splat are element-order sensitive. A few of these - cases might be workable with special handling if required. - Adding cost modeling would be appropriate in some cases. */ - int val = XINT (op, 1); - switch (val) - { - default: - break; - case UNSPEC_VMRGH_DIRECT: - case UNSPEC_VMRGL_DIRECT: - case UNSPEC_VPACK_SIGN_SIGN_SAT: - case UNSPEC_VPACK_SIGN_UNS_SAT: - case UNSPEC_VPACK_UNS_UNS_MOD: - case UNSPEC_VPACK_UNS_UNS_MOD_DIRECT: - case UNSPEC_VPACK_UNS_UNS_SAT: - case UNSPEC_VPERM: - case UNSPEC_VPERM_UNS: - case UNSPEC_VPERMHI: - case UNSPEC_VPERMSI: - case UNSPEC_VPKPX: - case UNSPEC_VSLDOI: - case UNSPEC_VSLO: - case UNSPEC_VSRO: - case UNSPEC_VSUM2SWS: - case UNSPEC_VSUM4S: - case UNSPEC_VSUM4UBS: - case UNSPEC_VSUMSWS: - case UNSPEC_VSUMSWS_DIRECT: - case UNSPEC_VSX_CONCAT: - case UNSPEC_VSX_SET: - case UNSPEC_VSX_SLDWI: - case UNSPEC_VUNPACK_HI_SIGN: - case UNSPEC_VUNPACK_HI_SIGN_DIRECT: - case UNSPEC_VUNPACK_LO_SIGN: - case UNSPEC_VUNPACK_LO_SIGN_DIRECT: - case UNSPEC_VUPKHPX: - case UNSPEC_VUPKHS_V4SF: - case UNSPEC_VUPKHU_V4SF: - case UNSPEC_VUPKLPX: - case UNSPEC_VUPKLS_V4SF: - case UNSPEC_VUPKLU_V4SF: - case UNSPEC_VSX_CVDPSPN: - case UNSPEC_VSX_CVSPDP: - case UNSPEC_VSX_CVSPDPN: - case UNSPEC_VSX_EXTRACT: - case UNSPEC_VSX_VSLO: - case UNSPEC_VSX_VEC_INIT: - return 0; - case UNSPEC_VSPLT_DIRECT: - case UNSPEC_VSX_XXSPLTD: - *special = SH_SPLAT; - return 1; - case UNSPEC_REDUC_PLUS: - case UNSPEC_REDUC: - return 1; - } - } - - default: - break; - } - - const char *fmt = GET_RTX_FORMAT (code); - int ok = 1; - - for (i = 0; i < GET_RTX_LENGTH (code); ++i) - if (fmt[i] == 'e' || fmt[i] == 'u') - { - unsigned int special_op = SH_NONE; - ok &= rtx_is_swappable_p (XEXP (op, i), &special_op); - if (special_op == SH_NONE) - continue; - /* Ensure we never have two kinds of special handling - for the same insn. */ - if (*special != SH_NONE && *special != special_op) - return 0; - *special = special_op; - } - else if (fmt[i] == 'E') - for (j = 0; j < XVECLEN (op, i); ++j) - { - unsigned int special_op = SH_NONE; - ok &= rtx_is_swappable_p (XVECEXP (op, i, j), &special_op); - if (special_op == SH_NONE) - continue; - /* Ensure we never have two kinds of special handling - for the same insn. */ - if (*special != SH_NONE && *special != special_op) - return 0; - *special = special_op; - } - - return ok; -} - -/* Return 1 iff INSN is an operand that will not be affected by - having vector doublewords swapped in memory (in which case - *SPECIAL is unchanged), or that can be modified to be correct - if vector doublewords are swapped in memory (in which case - *SPECIAL is changed to a value indicating how). */ -static unsigned int -insn_is_swappable_p (swap_web_entry *insn_entry, rtx insn, - unsigned int *special) -{ - /* Calls are always bad. */ - if (GET_CODE (insn) == CALL_INSN) - return 0; - - /* Loads and stores seen here are not permuting, but we can still - fix them up by converting them to permuting ones. 
Exceptions: - UNSPEC_LVE, UNSPEC_LVX, and UNSPEC_STVX, which have a PARALLEL - body instead of a SET; and UNSPEC_STVE, which has an UNSPEC - for the SET source. Also we must now make an exception for lvx - and stvx when they are not in the UNSPEC_LVX/STVX form (with the - explicit "& -16") since this leads to unrecognizable insns. */ - rtx body = PATTERN (insn); - int i = INSN_UID (insn); - - if (insn_entry[i].is_load) - { - if (GET_CODE (body) == SET) - { - rtx rhs = SET_SRC (body); - /* Even without a swap, the RHS might be a vec_select for, say, - a byte-reversing load. */ - if (GET_CODE (rhs) != MEM) - return 0; - if (GET_CODE (XEXP (rhs, 0)) == AND) - return 0; - - *special = SH_NOSWAP_LD; - return 1; - } - else - return 0; - } - - if (insn_entry[i].is_store) - { - if (GET_CODE (body) == SET - && GET_CODE (SET_SRC (body)) != UNSPEC) - { - rtx lhs = SET_DEST (body); - /* Even without a swap, the LHS might be a vec_select for, say, - a byte-reversing store. */ - if (GET_CODE (lhs) != MEM) - return 0; - if (GET_CODE (XEXP (lhs, 0)) == AND) - return 0; - - *special = SH_NOSWAP_ST; - return 1; - } - else - return 0; - } - - /* A convert to single precision can be left as is provided that - all of its uses are in xxspltw instructions that splat BE element - zero. */ - if (GET_CODE (body) == SET - && GET_CODE (SET_SRC (body)) == UNSPEC - && XINT (SET_SRC (body), 1) == UNSPEC_VSX_CVDPSPN) - { - df_ref def; - struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); - - FOR_EACH_INSN_INFO_DEF (def, insn_info) - { - struct df_link *link = DF_REF_CHAIN (def); - if (!link) - return 0; - - for (; link; link = link->next) { - rtx use_insn = DF_REF_INSN (link->ref); - rtx use_body = PATTERN (use_insn); - if (GET_CODE (use_body) != SET - || GET_CODE (SET_SRC (use_body)) != UNSPEC - || XINT (SET_SRC (use_body), 1) != UNSPEC_VSX_XXSPLTW - || XVECEXP (SET_SRC (use_body), 0, 1) != const0_rtx) - return 0; - } - } - - return 1; - } - - /* A concatenation of two doublewords is ok if we reverse the - order of the inputs. */ - if (GET_CODE (body) == SET - && GET_CODE (SET_SRC (body)) == VEC_CONCAT - && (GET_MODE (SET_SRC (body)) == V2DFmode - || GET_MODE (SET_SRC (body)) == V2DImode)) - { - *special = SH_CONCAT; - return 1; - } - - /* V2DF reductions are always swappable. */ - if (GET_CODE (body) == PARALLEL) - { - rtx expr = XVECEXP (body, 0, 0); - if (GET_CODE (expr) == SET - && v2df_reduction_p (SET_SRC (expr))) - return 1; - } - - /* An UNSPEC_VPERM is ok if the mask operand is loaded from the - constant pool. */ - if (GET_CODE (body) == SET - && GET_CODE (SET_SRC (body)) == UNSPEC - && XINT (SET_SRC (body), 1) == UNSPEC_VPERM - && XVECLEN (SET_SRC (body), 0) == 3 - && GET_CODE (XVECEXP (SET_SRC (body), 0, 2)) == REG) - { - rtx mask_reg = XVECEXP (SET_SRC (body), 0, 2); - struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); - df_ref use; - FOR_EACH_INSN_INFO_USE (use, insn_info) - if (rtx_equal_p (DF_REF_REG (use), mask_reg)) - { - struct df_link *def_link = DF_REF_CHAIN (use); - /* Punt if multiple definitions for this reg. */ - if (def_link && !def_link->next && - const_load_sequence_p (insn_entry, - DF_REF_INSN (def_link->ref))) - { - *special = SH_VPERM; - return 1; - } - } - } - - /* Otherwise check the operands for vector lane violations. */ - return rtx_is_swappable_p (body, special); -} - -enum chain_purpose { FOR_LOADS, FOR_STORES }; - -/* Return true if the UD or DU chain headed by LINK is non-empty, - and every entry on the chain references an insn that is a - register swap. 
Furthermore, if PURPOSE is FOR_LOADS, each such - register swap must have only permuting loads as reaching defs. - If PURPOSE is FOR_STORES, each such register swap must have only - register swaps or permuting stores as reached uses. */ -static bool -chain_contains_only_swaps (swap_web_entry *insn_entry, struct df_link *link, - enum chain_purpose purpose) -{ - if (!link) - return false; - - for (; link; link = link->next) - { - if (!ALTIVEC_OR_VSX_VECTOR_MODE (GET_MODE (DF_REF_REG (link->ref)))) - continue; - - if (DF_REF_IS_ARTIFICIAL (link->ref)) - return false; - - rtx reached_insn = DF_REF_INSN (link->ref); - unsigned uid = INSN_UID (reached_insn); - struct df_insn_info *insn_info = DF_INSN_INFO_GET (reached_insn); - - if (!insn_entry[uid].is_swap || insn_entry[uid].is_load - || insn_entry[uid].is_store) - return false; - - if (purpose == FOR_LOADS) - { - df_ref use; - FOR_EACH_INSN_INFO_USE (use, insn_info) - { - struct df_link *swap_link = DF_REF_CHAIN (use); - - while (swap_link) - { - if (DF_REF_IS_ARTIFICIAL (link->ref)) - return false; - - rtx swap_def_insn = DF_REF_INSN (swap_link->ref); - unsigned uid2 = INSN_UID (swap_def_insn); - - /* Only permuting loads are allowed. */ - if (!insn_entry[uid2].is_swap || !insn_entry[uid2].is_load) - return false; - - swap_link = swap_link->next; - } - } - } - else if (purpose == FOR_STORES) - { - df_ref def; - FOR_EACH_INSN_INFO_DEF (def, insn_info) - { - struct df_link *swap_link = DF_REF_CHAIN (def); - - while (swap_link) - { - if (DF_REF_IS_ARTIFICIAL (link->ref)) - return false; - - rtx swap_use_insn = DF_REF_INSN (swap_link->ref); - unsigned uid2 = INSN_UID (swap_use_insn); - - /* Permuting stores or register swaps are allowed. */ - if (!insn_entry[uid2].is_swap || insn_entry[uid2].is_load) - return false; - - swap_link = swap_link->next; - } - } - } - } - - return true; -} - -/* Mark the xxswapdi instructions associated with permuting loads and - stores for removal. Note that we only flag them for deletion here, - as there is a possibility of a swap being reached from multiple - loads, etc. */ -static void -mark_swaps_for_removal (swap_web_entry *insn_entry, unsigned int i) -{ - rtx insn = insn_entry[i].insn; - struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); - - if (insn_entry[i].is_load) - { - df_ref def; - FOR_EACH_INSN_INFO_DEF (def, insn_info) - { - struct df_link *link = DF_REF_CHAIN (def); - - /* We know by now that these are swaps, so we can delete - them confidently. */ - while (link) - { - rtx use_insn = DF_REF_INSN (link->ref); - insn_entry[INSN_UID (use_insn)].will_delete = 1; - link = link->next; - } - } - } - else if (insn_entry[i].is_store) - { - df_ref use; - FOR_EACH_INSN_INFO_USE (use, insn_info) - { - /* Ignore uses for addressability. */ - machine_mode mode = GET_MODE (DF_REF_REG (use)); - if (!ALTIVEC_OR_VSX_VECTOR_MODE (mode)) - continue; - - struct df_link *link = DF_REF_CHAIN (use); - - /* We know by now that these are swaps, so we can delete - them confidently. */ - while (link) - { - rtx def_insn = DF_REF_INSN (link->ref); - insn_entry[INSN_UID (def_insn)].will_delete = 1; - link = link->next; - } - } - } -} - -/* OP is either a CONST_VECTOR or an expression containing one. - Swap the first half of the vector with the second in the first - case. Recurse to find it in the second. 
*/ -static void -swap_const_vector_halves (rtx op) -{ - int i; - enum rtx_code code = GET_CODE (op); - if (GET_CODE (op) == CONST_VECTOR) - { - int half_units = GET_MODE_NUNITS (GET_MODE (op)) / 2; - for (i = 0; i < half_units; ++i) - { - rtx temp = CONST_VECTOR_ELT (op, i); - CONST_VECTOR_ELT (op, i) = CONST_VECTOR_ELT (op, i + half_units); - CONST_VECTOR_ELT (op, i + half_units) = temp; - } - } - else - { - int j; - const char *fmt = GET_RTX_FORMAT (code); - for (i = 0; i < GET_RTX_LENGTH (code); ++i) - if (fmt[i] == 'e' || fmt[i] == 'u') - swap_const_vector_halves (XEXP (op, i)); - else if (fmt[i] == 'E') - for (j = 0; j < XVECLEN (op, i); ++j) - swap_const_vector_halves (XVECEXP (op, i, j)); - } -} - -/* Find all subregs of a vector expression that perform a narrowing, - and adjust the subreg index to account for doubleword swapping. */ -static void -adjust_subreg_index (rtx op) -{ - enum rtx_code code = GET_CODE (op); - if (code == SUBREG - && (GET_MODE_SIZE (GET_MODE (op)) - < GET_MODE_SIZE (GET_MODE (XEXP (op, 0))))) - { - unsigned int index = SUBREG_BYTE (op); - if (index < 8) - index += 8; - else - index -= 8; - SUBREG_BYTE (op) = index; - } - - const char *fmt = GET_RTX_FORMAT (code); - int i,j; - for (i = 0; i < GET_RTX_LENGTH (code); ++i) - if (fmt[i] == 'e' || fmt[i] == 'u') - adjust_subreg_index (XEXP (op, i)); - else if (fmt[i] == 'E') - for (j = 0; j < XVECLEN (op, i); ++j) - adjust_subreg_index (XVECEXP (op, i, j)); -} - -/* Convert the non-permuting load INSN to a permuting one. */ -static void -permute_load (rtx_insn *insn) -{ - rtx body = PATTERN (insn); - rtx mem_op = SET_SRC (body); - rtx tgt_reg = SET_DEST (body); - machine_mode mode = GET_MODE (tgt_reg); - int n_elts = GET_MODE_NUNITS (mode); - int half_elts = n_elts / 2; - rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts)); - int i, j; - for (i = 0, j = half_elts; i < half_elts; ++i, ++j) - XVECEXP (par, 0, i) = GEN_INT (j); - for (i = half_elts, j = 0; j < half_elts; ++i, ++j) - XVECEXP (par, 0, i) = GEN_INT (j); - rtx sel = gen_rtx_VEC_SELECT (mode, mem_op, par); - SET_SRC (body) = sel; - INSN_CODE (insn) = -1; /* Force re-recognition. */ - df_insn_rescan (insn); - - if (dump_file) - fprintf (dump_file, "Replacing load %d with permuted load\n", - INSN_UID (insn)); -} - -/* Convert the non-permuting store INSN to a permuting one. */ -static void -permute_store (rtx_insn *insn) -{ - rtx body = PATTERN (insn); - rtx src_reg = SET_SRC (body); - machine_mode mode = GET_MODE (src_reg); - int n_elts = GET_MODE_NUNITS (mode); - int half_elts = n_elts / 2; - rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts)); - int i, j; - for (i = 0, j = half_elts; i < half_elts; ++i, ++j) - XVECEXP (par, 0, i) = GEN_INT (j); - for (i = half_elts, j = 0; j < half_elts; ++i, ++j) - XVECEXP (par, 0, i) = GEN_INT (j); - rtx sel = gen_rtx_VEC_SELECT (mode, src_reg, par); - SET_SRC (body) = sel; - INSN_CODE (insn) = -1; /* Force re-recognition. */ - df_insn_rescan (insn); - - if (dump_file) - fprintf (dump_file, "Replacing store %d with permuted store\n", - INSN_UID (insn)); -} - -/* Given OP that contains a vector extract operation, adjust the index - of the extracted lane to account for the doubleword swap. */ -static void -adjust_extract (rtx_insn *insn) -{ - rtx pattern = PATTERN (insn); - if (GET_CODE (pattern) == PARALLEL) - pattern = XVECEXP (pattern, 0, 0); - rtx src = SET_SRC (pattern); - /* The vec_select may be wrapped in a vec_duplicate for a splat, so - account for that. */ - rtx sel = GET_CODE (src) == VEC_DUPLICATE ? 
XEXP (src, 0) : src; - rtx par = XEXP (sel, 1); - int half_elts = GET_MODE_NUNITS (GET_MODE (XEXP (sel, 0))) >> 1; - int lane = INTVAL (XVECEXP (par, 0, 0)); - lane = lane >= half_elts ? lane - half_elts : lane + half_elts; - XVECEXP (par, 0, 0) = GEN_INT (lane); - INSN_CODE (insn) = -1; /* Force re-recognition. */ - df_insn_rescan (insn); - - if (dump_file) - fprintf (dump_file, "Changing lane for extract %d\n", INSN_UID (insn)); -} - -/* Given OP that contains a vector direct-splat operation, adjust the index - of the source lane to account for the doubleword swap. */ -static void -adjust_splat (rtx_insn *insn) -{ - rtx body = PATTERN (insn); - rtx unspec = XEXP (body, 1); - int half_elts = GET_MODE_NUNITS (GET_MODE (unspec)) >> 1; - int lane = INTVAL (XVECEXP (unspec, 0, 1)); - lane = lane >= half_elts ? lane - half_elts : lane + half_elts; - XVECEXP (unspec, 0, 1) = GEN_INT (lane); - INSN_CODE (insn) = -1; /* Force re-recognition. */ - df_insn_rescan (insn); - - if (dump_file) - fprintf (dump_file, "Changing lane for splat %d\n", INSN_UID (insn)); -} - -/* Given OP that contains an XXPERMDI operation (that is not a doubleword - swap), reverse the order of the source operands and adjust the indices - of the source lanes to account for doubleword reversal. */ -static void -adjust_xxpermdi (rtx_insn *insn) -{ - rtx set = PATTERN (insn); - rtx select = XEXP (set, 1); - rtx concat = XEXP (select, 0); - rtx src0 = XEXP (concat, 0); - XEXP (concat, 0) = XEXP (concat, 1); - XEXP (concat, 1) = src0; - rtx parallel = XEXP (select, 1); - int lane0 = INTVAL (XVECEXP (parallel, 0, 0)); - int lane1 = INTVAL (XVECEXP (parallel, 0, 1)); - int new_lane0 = 3 - lane1; - int new_lane1 = 3 - lane0; - XVECEXP (parallel, 0, 0) = GEN_INT (new_lane0); - XVECEXP (parallel, 0, 1) = GEN_INT (new_lane1); - INSN_CODE (insn) = -1; /* Force re-recognition. */ - df_insn_rescan (insn); - - if (dump_file) - fprintf (dump_file, "Changing lanes for xxpermdi %d\n", INSN_UID (insn)); -} - -/* Given OP that contains a VEC_CONCAT operation of two doublewords, - reverse the order of those inputs. */ -static void -adjust_concat (rtx_insn *insn) -{ - rtx set = PATTERN (insn); - rtx concat = XEXP (set, 1); - rtx src0 = XEXP (concat, 0); - XEXP (concat, 0) = XEXP (concat, 1); - XEXP (concat, 1) = src0; - INSN_CODE (insn) = -1; /* Force re-recognition. */ - df_insn_rescan (insn); - - if (dump_file) - fprintf (dump_file, "Reversing inputs for concat %d\n", INSN_UID (insn)); -} - -/* Given an UNSPEC_VPERM insn, modify the mask loaded from the - constant pool to reflect swapped doublewords. */ -static void -adjust_vperm (rtx_insn *insn) -{ - /* We previously determined that the UNSPEC_VPERM was fed by a - swap of a swapping load of a TOC-relative constant pool symbol. - Find the MEM in the swapping load and replace it with a MEM for - the adjusted mask constant. */ - rtx set = PATTERN (insn); - rtx mask_reg = XVECEXP (SET_SRC (set), 0, 2); - - /* Find the swap. */ - struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); - df_ref use; - rtx_insn *swap_insn = 0; - FOR_EACH_INSN_INFO_USE (use, insn_info) - if (rtx_equal_p (DF_REF_REG (use), mask_reg)) - { - struct df_link *def_link = DF_REF_CHAIN (use); - gcc_assert (def_link && !def_link->next); - swap_insn = DF_REF_INSN (def_link->ref); - break; - } - gcc_assert (swap_insn); - - /* Find the load. 
*/ - insn_info = DF_INSN_INFO_GET (swap_insn); - rtx_insn *load_insn = 0; - FOR_EACH_INSN_INFO_USE (use, insn_info) - { - struct df_link *def_link = DF_REF_CHAIN (use); - gcc_assert (def_link && !def_link->next); - load_insn = DF_REF_INSN (def_link->ref); - break; - } - gcc_assert (load_insn); - - /* Find the TOC-relative symbol access. */ - insn_info = DF_INSN_INFO_GET (load_insn); - rtx_insn *tocrel_insn = 0; - FOR_EACH_INSN_INFO_USE (use, insn_info) - { - struct df_link *def_link = DF_REF_CHAIN (use); - gcc_assert (def_link && !def_link->next); - tocrel_insn = DF_REF_INSN (def_link->ref); - break; - } - gcc_assert (tocrel_insn); - - /* Find the embedded CONST_VECTOR. We have to call toc_relative_expr_p - to set tocrel_base; otherwise it would be unnecessary as we've - already established it will return true. */ - rtx base, offset; - const_rtx tocrel_base; - rtx tocrel_expr = SET_SRC (PATTERN (tocrel_insn)); - /* There is an extra level of indirection for small/large code models. */ - if (GET_CODE (tocrel_expr) == MEM) - tocrel_expr = XEXP (tocrel_expr, 0); - if (!toc_relative_expr_p (tocrel_expr, false, &tocrel_base, NULL)) - gcc_unreachable (); - split_const (XVECEXP (tocrel_base, 0, 0), &base, &offset); - rtx const_vector = get_pool_constant (base); - /* With the extra indirection, get_pool_constant will produce the - real constant from the reg_equal expression, so get the real - constant. */ - if (GET_CODE (const_vector) == SYMBOL_REF) - const_vector = get_pool_constant (const_vector); - gcc_assert (GET_CODE (const_vector) == CONST_VECTOR); - - /* Create an adjusted mask from the initial mask. */ - unsigned int new_mask[16], i, val; - for (i = 0; i < 16; ++i) { - val = INTVAL (XVECEXP (const_vector, 0, i)); - if (val < 16) - new_mask[i] = (val + 8) % 16; - else - new_mask[i] = ((val + 8) % 16) + 16; - } - - /* Create a new CONST_VECTOR and a MEM that references it. */ - rtx vals = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16)); - for (i = 0; i < 16; ++i) - XVECEXP (vals, 0, i) = GEN_INT (new_mask[i]); - rtx new_const_vector = gen_rtx_CONST_VECTOR (V16QImode, XVEC (vals, 0)); - rtx new_mem = force_const_mem (V16QImode, new_const_vector); - /* This gives us a MEM whose base operand is a SYMBOL_REF, which we - can't recognize. Force the SYMBOL_REF into a register. */ - if (!REG_P (XEXP (new_mem, 0))) { - rtx base_reg = force_reg (Pmode, XEXP (new_mem, 0)); - XEXP (new_mem, 0) = base_reg; - /* Move the newly created insn ahead of the load insn. */ - rtx_insn *force_insn = get_last_insn (); - remove_insn (force_insn); - rtx_insn *before_load_insn = PREV_INSN (load_insn); - add_insn_after (force_insn, before_load_insn, BLOCK_FOR_INSN (load_insn)); - df_insn_rescan (before_load_insn); - df_insn_rescan (force_insn); - } - - /* Replace the MEM in the load instruction and rescan it. */ - XEXP (SET_SRC (PATTERN (load_insn)), 0) = new_mem; - INSN_CODE (load_insn) = -1; /* Force re-recognition. */ - df_insn_rescan (load_insn); - - if (dump_file) - fprintf (dump_file, "Adjusting mask for vperm %d\n", INSN_UID (insn)); -} - -/* The insn described by INSN_ENTRY[I] can be swapped, but only - with special handling. Take care of that here. */ -static void -handle_special_swappables (swap_web_entry *insn_entry, unsigned i) -{ - rtx_insn *insn = insn_entry[i].insn; - rtx body = PATTERN (insn); - - switch (insn_entry[i].special_handling) - { - default: - gcc_unreachable (); - case SH_CONST_VECTOR: - { - /* A CONST_VECTOR will only show up somewhere in the RHS of a SET. 
*/ - gcc_assert (GET_CODE (body) == SET); - rtx rhs = SET_SRC (body); - swap_const_vector_halves (rhs); - if (dump_file) - fprintf (dump_file, "Swapping constant halves in insn %d\n", i); - break; - } - case SH_SUBREG: - /* A subreg of the same size is already safe. For subregs that - select a smaller portion of a reg, adjust the index for - swapped doublewords. */ - adjust_subreg_index (body); - if (dump_file) - fprintf (dump_file, "Adjusting subreg in insn %d\n", i); - break; - case SH_NOSWAP_LD: - /* Convert a non-permuting load to a permuting one. */ - permute_load (insn); - break; - case SH_NOSWAP_ST: - /* Convert a non-permuting store to a permuting one. */ - permute_store (insn); - break; - case SH_EXTRACT: - /* Change the lane on an extract operation. */ - adjust_extract (insn); - break; - case SH_SPLAT: - /* Change the lane on a direct-splat operation. */ - adjust_splat (insn); - break; - case SH_XXPERMDI: - /* Change the lanes on an XXPERMDI operation. */ - adjust_xxpermdi (insn); - break; - case SH_CONCAT: - /* Reverse the order of a concatenation operation. */ - adjust_concat (insn); - break; - case SH_VPERM: - /* Change the mask loaded from the constant pool for a VPERM. */ - adjust_vperm (insn); - break; - } -} - -/* Find the insn from the Ith table entry, which is known to be a - register swap Y = SWAP(X). Replace it with a copy Y = X. */ -static void -replace_swap_with_copy (swap_web_entry *insn_entry, unsigned i) -{ - rtx_insn *insn = insn_entry[i].insn; - rtx body = PATTERN (insn); - rtx src_reg = XEXP (SET_SRC (body), 0); - rtx copy = gen_rtx_SET (SET_DEST (body), src_reg); - rtx_insn *new_insn = emit_insn_before (copy, insn); - set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn)); - df_insn_rescan (new_insn); - - if (dump_file) - { - unsigned int new_uid = INSN_UID (new_insn); - fprintf (dump_file, "Replacing swap %d with copy %d\n", i, new_uid); - } - - df_insn_delete (insn); - remove_insn (insn); - insn->set_deleted (); -} - -/* Dump the swap table to DUMP_FILE. */ -static void -dump_swap_insn_table (swap_web_entry *insn_entry) -{ - int e = get_max_uid (); - fprintf (dump_file, "\nRelevant insns with their flag settings\n\n"); - - for (int i = 0; i < e; ++i) - if (insn_entry[i].is_relevant) - { - swap_web_entry *pred_entry = (swap_web_entry *)insn_entry[i].pred (); - fprintf (dump_file, "%6d %6d ", i, - pred_entry && pred_entry->insn - ? 
INSN_UID (pred_entry->insn) : 0); - if (insn_entry[i].is_load) - fputs ("load ", dump_file); - if (insn_entry[i].is_store) - fputs ("store ", dump_file); - if (insn_entry[i].is_swap) - fputs ("swap ", dump_file); - if (insn_entry[i].is_live_in) - fputs ("live-in ", dump_file); - if (insn_entry[i].is_live_out) - fputs ("live-out ", dump_file); - if (insn_entry[i].contains_subreg) - fputs ("subreg ", dump_file); - if (insn_entry[i].is_128_int) - fputs ("int128 ", dump_file); - if (insn_entry[i].is_call) - fputs ("call ", dump_file); - if (insn_entry[i].is_swappable) - { - fputs ("swappable ", dump_file); - if (insn_entry[i].special_handling == SH_CONST_VECTOR) - fputs ("special:constvec ", dump_file); - else if (insn_entry[i].special_handling == SH_SUBREG) - fputs ("special:subreg ", dump_file); - else if (insn_entry[i].special_handling == SH_NOSWAP_LD) - fputs ("special:load ", dump_file); - else if (insn_entry[i].special_handling == SH_NOSWAP_ST) - fputs ("special:store ", dump_file); - else if (insn_entry[i].special_handling == SH_EXTRACT) - fputs ("special:extract ", dump_file); - else if (insn_entry[i].special_handling == SH_SPLAT) - fputs ("special:splat ", dump_file); - else if (insn_entry[i].special_handling == SH_XXPERMDI) - fputs ("special:xxpermdi ", dump_file); - else if (insn_entry[i].special_handling == SH_CONCAT) - fputs ("special:concat ", dump_file); - else if (insn_entry[i].special_handling == SH_VPERM) - fputs ("special:vperm ", dump_file); - } - if (insn_entry[i].web_not_optimizable) - fputs ("unoptimizable ", dump_file); - if (insn_entry[i].will_delete) - fputs ("delete ", dump_file); - fputs ("\n", dump_file); - } - fputs ("\n", dump_file); -} - -/* Return RTX with its address canonicalized to (reg) or (+ reg reg). - Here RTX is an (& addr (const_int -16)). Always return a new copy - to avoid problems with combine. */ -static rtx -alignment_with_canonical_addr (rtx align) -{ - rtx canon; - rtx addr = XEXP (align, 0); - - if (REG_P (addr)) - canon = addr; - - else if (GET_CODE (addr) == PLUS) - { - rtx addrop0 = XEXP (addr, 0); - rtx addrop1 = XEXP (addr, 1); - - if (!REG_P (addrop0)) - addrop0 = force_reg (GET_MODE (addrop0), addrop0); - - if (!REG_P (addrop1)) - addrop1 = force_reg (GET_MODE (addrop1), addrop1); - - canon = gen_rtx_PLUS (GET_MODE (addr), addrop0, addrop1); - } - - else - canon = force_reg (GET_MODE (addr), addr); - - return gen_rtx_AND (GET_MODE (align), canon, GEN_INT (-16)); -} - -/* Check whether an rtx is an alignment mask, and if so, return - a fully-expanded rtx for the masking operation. 
*/ -static rtx -alignment_mask (rtx_insn *insn) -{ - rtx body = PATTERN (insn); - - if (GET_CODE (body) != SET - || GET_CODE (SET_SRC (body)) != AND - || !REG_P (XEXP (SET_SRC (body), 0))) - return 0; - - rtx mask = XEXP (SET_SRC (body), 1); - - if (GET_CODE (mask) == CONST_INT) - { - if (INTVAL (mask) == -16) - return alignment_with_canonical_addr (SET_SRC (body)); - else - return 0; - } - - if (!REG_P (mask)) - return 0; - - struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); - df_ref use; - rtx real_mask = 0; - - FOR_EACH_INSN_INFO_USE (use, insn_info) - { - if (!rtx_equal_p (DF_REF_REG (use), mask)) - continue; - - struct df_link *def_link = DF_REF_CHAIN (use); - if (!def_link || def_link->next) - return 0; - - rtx_insn *const_insn = DF_REF_INSN (def_link->ref); - rtx const_body = PATTERN (const_insn); - if (GET_CODE (const_body) != SET) - return 0; - - real_mask = SET_SRC (const_body); - - if (GET_CODE (real_mask) != CONST_INT - || INTVAL (real_mask) != -16) - return 0; - } - - if (real_mask == 0) - return 0; - - return alignment_with_canonical_addr (SET_SRC (body)); -} - -/* Given INSN that's a load or store based at BASE_REG, look for a - feeding computation that aligns its address on a 16-byte boundary. */ -static rtx -find_alignment_op (rtx_insn *insn, rtx base_reg) -{ - df_ref base_use; - struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); - rtx and_operation = 0; - - FOR_EACH_INSN_INFO_USE (base_use, insn_info) - { - if (!rtx_equal_p (DF_REF_REG (base_use), base_reg)) - continue; - - struct df_link *base_def_link = DF_REF_CHAIN (base_use); - if (!base_def_link || base_def_link->next) - break; - - /* With stack-protector code enabled, and possibly in other - circumstances, there may not be an associated insn for - the def. */ - if (DF_REF_IS_ARTIFICIAL (base_def_link->ref)) - break; - - rtx_insn *and_insn = DF_REF_INSN (base_def_link->ref); - and_operation = alignment_mask (and_insn); - if (and_operation != 0) - break; - } - - return and_operation; -} - -struct del_info { bool replace; rtx_insn *replace_insn; }; - -/* If INSN is the load for an lvx pattern, put it in canonical form. */ -static void -recombine_lvx_pattern (rtx_insn *insn, del_info *to_delete) -{ - rtx body = PATTERN (insn); - gcc_assert (GET_CODE (body) == SET - && GET_CODE (SET_SRC (body)) == VEC_SELECT - && GET_CODE (XEXP (SET_SRC (body), 0)) == MEM); - - rtx mem = XEXP (SET_SRC (body), 0); - rtx base_reg = XEXP (mem, 0); - - rtx and_operation = find_alignment_op (insn, base_reg); - - if (and_operation != 0) - { - df_ref def; - struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); - FOR_EACH_INSN_INFO_DEF (def, insn_info) - { - struct df_link *link = DF_REF_CHAIN (def); - if (!link || link->next) - break; - - rtx_insn *swap_insn = DF_REF_INSN (link->ref); - if (!insn_is_swap_p (swap_insn) - || insn_is_load_p (swap_insn) - || insn_is_store_p (swap_insn)) - break; - - /* Expected lvx pattern found. Change the swap to - a copy, and propagate the AND operation into the - load. */ - to_delete[INSN_UID (swap_insn)].replace = true; - to_delete[INSN_UID (swap_insn)].replace_insn = swap_insn; - - XEXP (mem, 0) = and_operation; - SET_SRC (body) = mem; - INSN_CODE (insn) = -1; /* Force re-recognition. */ - df_insn_rescan (insn); - - if (dump_file) - fprintf (dump_file, "lvx opportunity found at %d\n", - INSN_UID (insn)); - } - } -} - -/* If INSN is the store for an stvx pattern, put it in canonical form. 
*/ -static void -recombine_stvx_pattern (rtx_insn *insn, del_info *to_delete) -{ - rtx body = PATTERN (insn); - gcc_assert (GET_CODE (body) == SET - && GET_CODE (SET_DEST (body)) == MEM - && GET_CODE (SET_SRC (body)) == VEC_SELECT); - rtx mem = SET_DEST (body); - rtx base_reg = XEXP (mem, 0); - - rtx and_operation = find_alignment_op (insn, base_reg); - - if (and_operation != 0) - { - rtx src_reg = XEXP (SET_SRC (body), 0); - df_ref src_use; - struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); - FOR_EACH_INSN_INFO_USE (src_use, insn_info) - { - if (!rtx_equal_p (DF_REF_REG (src_use), src_reg)) - continue; - - struct df_link *link = DF_REF_CHAIN (src_use); - if (!link || link->next) - break; - - rtx_insn *swap_insn = DF_REF_INSN (link->ref); - if (!insn_is_swap_p (swap_insn) - || insn_is_load_p (swap_insn) - || insn_is_store_p (swap_insn)) - break; - - /* Expected stvx pattern found. Change the swap to - a copy, and propagate the AND operation into the - store. */ - to_delete[INSN_UID (swap_insn)].replace = true; - to_delete[INSN_UID (swap_insn)].replace_insn = swap_insn; - - XEXP (mem, 0) = and_operation; - SET_SRC (body) = src_reg; - INSN_CODE (insn) = -1; /* Force re-recognition. */ - df_insn_rescan (insn); - - if (dump_file) - fprintf (dump_file, "stvx opportunity found at %d\n", - INSN_UID (insn)); - } - } -} - -/* Look for patterns created from builtin lvx and stvx calls, and - canonicalize them to be properly recognized as such. */ -static void -recombine_lvx_stvx_patterns (function *fun) -{ - int i; - basic_block bb; - rtx_insn *insn; - - int num_insns = get_max_uid (); - del_info *to_delete = XCNEWVEC (del_info, num_insns); - - FOR_ALL_BB_FN (bb, fun) - FOR_BB_INSNS (bb, insn) - { - if (!NONDEBUG_INSN_P (insn)) - continue; - - if (insn_is_load_p (insn) && insn_is_swap_p (insn)) - recombine_lvx_pattern (insn, to_delete); - else if (insn_is_store_p (insn) && insn_is_swap_p (insn)) - recombine_stvx_pattern (insn, to_delete); - } - - /* Turning swaps into copies is delayed until now, to avoid problems - with deleting instructions during the insn walk. */ - for (i = 0; i < num_insns; i++) - if (to_delete[i].replace) - { - rtx swap_body = PATTERN (to_delete[i].replace_insn); - rtx src_reg = XEXP (SET_SRC (swap_body), 0); - rtx copy = gen_rtx_SET (SET_DEST (swap_body), src_reg); - rtx_insn *new_insn = emit_insn_before (copy, - to_delete[i].replace_insn); - set_block_for_insn (new_insn, - BLOCK_FOR_INSN (to_delete[i].replace_insn)); - df_insn_rescan (new_insn); - df_insn_delete (to_delete[i].replace_insn); - remove_insn (to_delete[i].replace_insn); - to_delete[i].replace_insn->set_deleted (); - } - - free (to_delete); -} - -/* Main entry point for this pass. */ -unsigned int -rs6000_analyze_swaps (function *fun) -{ - swap_web_entry *insn_entry; - basic_block bb; - rtx_insn *insn, *curr_insn = 0; - - /* Dataflow analysis for use-def chains. */ - df_set_flags (DF_RD_PRUNE_DEAD_DEFS); - df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN); - df_analyze (); - df_set_flags (DF_DEFER_INSN_RESCAN); - - /* Pre-pass to recombine lvx and stvx patterns so we don't lose info. */ - recombine_lvx_stvx_patterns (fun); - - /* Allocate structure to represent webs of insns. */ - insn_entry = XCNEWVEC (swap_web_entry, get_max_uid ()); - - /* Walk the insns to gather basic data. 
*/ - FOR_ALL_BB_FN (bb, fun) - FOR_BB_INSNS_SAFE (bb, insn, curr_insn) - { - unsigned int uid = INSN_UID (insn); - if (NONDEBUG_INSN_P (insn)) - { - insn_entry[uid].insn = insn; - - if (GET_CODE (insn) == CALL_INSN) - insn_entry[uid].is_call = 1; - - /* Walk the uses and defs to see if we mention vector regs. - Record any constraints on optimization of such mentions. */ - struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); - df_ref mention; - FOR_EACH_INSN_INFO_USE (mention, insn_info) - { - /* We use DF_REF_REAL_REG here to get inside any subregs. */ - machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention)); - - /* If a use gets its value from a call insn, it will be - a hard register and will look like (reg:V4SI 3 3). - The df analysis creates two mentions for GPR3 and GPR4, - both DImode. We must recognize this and treat it as a - vector mention to ensure the call is unioned with this - use. */ - if (mode == DImode && DF_REF_INSN_INFO (mention)) - { - rtx feeder = DF_REF_INSN (mention); - /* FIXME: It is pretty hard to get from the df mention - to the mode of the use in the insn. We arbitrarily - pick a vector mode here, even though the use might - be a real DImode. We can be too conservative - (create a web larger than necessary) because of - this, so consider eventually fixing this. */ - if (GET_CODE (feeder) == CALL_INSN) - mode = V4SImode; - } - - if (ALTIVEC_OR_VSX_VECTOR_MODE (mode) || mode == TImode) - { - insn_entry[uid].is_relevant = 1; - if (mode == TImode || mode == V1TImode - || FLOAT128_VECTOR_P (mode)) - insn_entry[uid].is_128_int = 1; - if (DF_REF_INSN_INFO (mention)) - insn_entry[uid].contains_subreg - = !rtx_equal_p (DF_REF_REG (mention), - DF_REF_REAL_REG (mention)); - union_defs (insn_entry, insn, mention); - } - } - FOR_EACH_INSN_INFO_DEF (mention, insn_info) - { - /* We use DF_REF_REAL_REG here to get inside any subregs. */ - machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention)); - - /* If we're loading up a hard vector register for a call, - it looks like (set (reg:V4SI 9 9) (...)). The df - analysis creates two mentions for GPR9 and GPR10, both - DImode. So relying on the mode from the mentions - isn't sufficient to ensure we union the call into the - web with the parameter setup code. */ - if (mode == DImode && GET_CODE (insn) == SET - && ALTIVEC_OR_VSX_VECTOR_MODE (GET_MODE (SET_DEST (insn)))) - mode = GET_MODE (SET_DEST (insn)); - - if (ALTIVEC_OR_VSX_VECTOR_MODE (mode) || mode == TImode) - { - insn_entry[uid].is_relevant = 1; - if (mode == TImode || mode == V1TImode - || FLOAT128_VECTOR_P (mode)) - insn_entry[uid].is_128_int = 1; - if (DF_REF_INSN_INFO (mention)) - insn_entry[uid].contains_subreg - = !rtx_equal_p (DF_REF_REG (mention), - DF_REF_REAL_REG (mention)); - /* REG_FUNCTION_VALUE_P is not valid for subregs. */ - else if (REG_FUNCTION_VALUE_P (DF_REF_REG (mention))) - insn_entry[uid].is_live_out = 1; - union_uses (insn_entry, insn, mention); - } - } - - if (insn_entry[uid].is_relevant) - { - /* Determine if this is a load or store. */ - insn_entry[uid].is_load = insn_is_load_p (insn); - insn_entry[uid].is_store = insn_is_store_p (insn); - - /* Determine if this is a doubleword swap. If not, - determine whether it can legally be swapped. 
*/ - if (insn_is_swap_p (insn)) - insn_entry[uid].is_swap = 1; - else - { - unsigned int special = SH_NONE; - insn_entry[uid].is_swappable - = insn_is_swappable_p (insn_entry, insn, &special); - if (special != SH_NONE && insn_entry[uid].contains_subreg) - insn_entry[uid].is_swappable = 0; - else if (special != SH_NONE) - insn_entry[uid].special_handling = special; - else if (insn_entry[uid].contains_subreg) - insn_entry[uid].special_handling = SH_SUBREG; - } - } - } - } - - if (dump_file) - { - fprintf (dump_file, "\nSwap insn entry table when first built\n"); - dump_swap_insn_table (insn_entry); - } - - /* Record unoptimizable webs. */ - unsigned e = get_max_uid (), i; - for (i = 0; i < e; ++i) - { - if (!insn_entry[i].is_relevant) - continue; - - swap_web_entry *root - = (swap_web_entry*)(&insn_entry[i])->unionfind_root (); - - if (insn_entry[i].is_live_in || insn_entry[i].is_live_out - || (insn_entry[i].contains_subreg - && insn_entry[i].special_handling != SH_SUBREG) - || insn_entry[i].is_128_int || insn_entry[i].is_call - || !(insn_entry[i].is_swappable || insn_entry[i].is_swap)) - root->web_not_optimizable = 1; - - /* If we have loads or stores that aren't permuting then the - optimization isn't appropriate. */ - else if ((insn_entry[i].is_load || insn_entry[i].is_store) - && !insn_entry[i].is_swap && !insn_entry[i].is_swappable) - root->web_not_optimizable = 1; - - /* If we have permuting loads or stores that are not accompanied - by a register swap, the optimization isn't appropriate. */ - else if (insn_entry[i].is_load && insn_entry[i].is_swap) - { - rtx insn = insn_entry[i].insn; - struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); - df_ref def; - - FOR_EACH_INSN_INFO_DEF (def, insn_info) - { - struct df_link *link = DF_REF_CHAIN (def); - - if (!chain_contains_only_swaps (insn_entry, link, FOR_LOADS)) - { - root->web_not_optimizable = 1; - break; - } - } - } - else if (insn_entry[i].is_store && insn_entry[i].is_swap) - { - rtx insn = insn_entry[i].insn; - struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn); - df_ref use; - - FOR_EACH_INSN_INFO_USE (use, insn_info) - { - struct df_link *link = DF_REF_CHAIN (use); - - if (!chain_contains_only_swaps (insn_entry, link, FOR_STORES)) - { - root->web_not_optimizable = 1; - break; - } - } - } - } - - if (dump_file) - { - fprintf (dump_file, "\nSwap insn entry table after web analysis\n"); - dump_swap_insn_table (insn_entry); - } - - /* For each load and store in an optimizable web (which implies - the loads and stores are permuting), find the associated - register swaps and mark them for removal. Due to various - optimizations we may mark the same swap more than once. Also - perform special handling for swappable insns that require it. */ - for (i = 0; i < e; ++i) - if ((insn_entry[i].is_load || insn_entry[i].is_store) - && insn_entry[i].is_swap) - { - swap_web_entry* root_entry - = (swap_web_entry*)((&insn_entry[i])->unionfind_root ()); - if (!root_entry->web_not_optimizable) - mark_swaps_for_removal (insn_entry, i); - } - else if (insn_entry[i].is_swappable && insn_entry[i].special_handling) - { - swap_web_entry* root_entry - = (swap_web_entry*)((&insn_entry[i])->unionfind_root ()); - if (!root_entry->web_not_optimizable) - handle_special_swappables (insn_entry, i); - } - - /* Now delete the swaps marked for removal. */ - for (i = 0; i < e; ++i) - if (insn_entry[i].will_delete) - replace_swap_with_copy (insn_entry, i); - - /* Clean up. 
*/ - free (insn_entry); - return 0; -} - -const pass_data pass_data_analyze_swaps = -{ - RTL_PASS, /* type */ - "swaps", /* name */ - OPTGROUP_NONE, /* optinfo_flags */ - TV_NONE, /* tv_id */ - 0, /* properties_required */ - 0, /* properties_provided */ - 0, /* properties_destroyed */ - 0, /* todo_flags_start */ - TODO_df_finish, /* todo_flags_finish */ -}; - -class pass_analyze_swaps : public rtl_opt_pass -{ -public: - pass_analyze_swaps(gcc::context *ctxt) - : rtl_opt_pass(pass_data_analyze_swaps, ctxt) - {} - - /* opt_pass methods: */ - virtual bool gate (function *) - { - return (optimize > 0 && !BYTES_BIG_ENDIAN && TARGET_VSX - && !TARGET_P9_VECTOR && rs6000_optimize_swaps); - } - - virtual unsigned int execute (function *fun) - { - return rs6000_analyze_swaps (fun); - } - - opt_pass *clone () - { - return new pass_analyze_swaps (m_ctxt); - } - -}; // class pass_analyze_swaps - -rtl_opt_pass * -make_pass_analyze_swaps (gcc::context *ctxt) -{ - return new pass_analyze_swaps (ctxt); -} - #ifdef RS6000_GLIBC_ATOMIC_FENV /* Function declarations for rs6000_atomic_assign_expand_fenv. */ static tree atomic_hold_decl, atomic_clear_decl, atomic_update_decl; diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000 index a47091ad1b4..304f322f435 100644 --- a/gcc/config/rs6000/t-rs6000 +++ b/gcc/config/rs6000/t-rs6000 @@ -30,6 +30,10 @@ rs6000-string.o: $(srcdir)/config/rs6000/rs6000-string.c $(COMPILE) $< $(POSTCOMPILE) +rs6000-p8swap.o: $(srcdir)/config/rs6000/rs6000-p8swap.c + $(COMPILE) $< + $(POSTCOMPILE) + $(srcdir)/config/rs6000/rs6000-tables.opt: $(srcdir)/config/rs6000/genopt.sh \ $(srcdir)/config/rs6000/rs6000-cpus.def $(SHELL) $(srcdir)/config/rs6000/genopt.sh $(srcdir)/config/rs6000 > \ -- 2.30.2
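(Illustration only, not part of the patch.)  A minimal sketch of the kind of input the pass improves, assuming a powerpc64le target where the gate above holds (optimizing, little-endian, VSX, not P9 vector).  The vectorized loop body is a pure SIMD addition, so element order inside the vector registers is irrelevant as long as memory order is preserved; the xxswapdi insns accompanying each lxvd2x/stxvd2x then belong to an optimizable web and are replaced with register copies.  The file name and the dump option below are assumptions for illustration (the dump option follows the usual -fdump-rtl-<pass> convention with the pass name "swaps" defined above).

/* swaps-example.c -- hypothetical test input, not part of this patch.  */
void
vadd (double *restrict a, const double *restrict b,
      const double *restrict c, int n)
{
  /* With -O3 -mcpu=power8 this loop is expected to be vectorized into
     V2DF adds.  The loads and stores care about memory order only, not
     about the order of doublewords within a register, so the swaps
     pass can remove the xxswapdi instructions around them.  */
  for (int i = 0; i < n; i++)
    a[i] = b[i] + c[i];
}

/* Possible invocation (illustrative):
     gcc -O3 -mcpu=power8 -fdump-rtl-swaps -S swaps-example.c
   The swaps dump should contain the insn entry tables printed by
   dump_swap_insn_table above, before and after web analysis.  */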