From 48a31a09839b12127ce7c40d7adc4bd5bf1d3407 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Mon, 26 Aug 2019 10:35:59 +0000 Subject: [PATCH] re PR target/91522 (STV is slow) 2019-08-26 Richard Biener PR target/91522 PR target/91527 * config/i386/i386-features.h (general_scalar_chain::defs_map): New member. (general_scalar_chain::replace_with_subreg): Remove. (general_scalar_chain::replace_with_subreg_in_insn): Likewise. (general_scalar_chain::convert_reg): Adjust signature. * config/i386/i386-features.c (scalar_chain::add_insn): Do not iterate over all defs of a reg. (general_scalar_chain::replace_with_subreg): Remove. (general_scalar_chain::replace_with_subreg_in_insn): Likewise. (general_scalar_chain::make_vector_copies): Populate defs_map, place copy only after defs that are used as vectors in the chain. (general_scalar_chain::convert_reg): Emit a copy for a specific def in a specific instruction. (general_scalar_chain::convert_op): All reg uses are converted here. (general_scalar_chain::convert_insn): Emit copies for scalar uses of defs here. Replace uses with the copies we created. Replace and convert the def. Adjust REG_DEAD notes, remove REG_EQUIV/EQUAL notes. (general_scalar_chain::convert_registers): Only handle copies into the chain here. From-SVN: r274926 --- gcc/ChangeLog | 25 ++ gcc/config/i386/i386-features.c | 419 +++++++++++++------------------- gcc/config/i386/i386-features.h | 5 +- 3 files changed, 200 insertions(+), 249 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index abccd69b1e4..f4e291950aa 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,28 @@ +2019-08-26 Richard Biener + + PR target/91522 + PR target/91527 + * config/i386/i386-features.h (general_scalar_chain::defs_map): + New member. + (general_scalar_chain::replace_with_subreg): Remove. + (general_scalar_chain::replace_with_subreg_in_insn): Likewise. + (general_scalar_chain::convert_reg): Adjust signature. + * config/i386/i386-features.c (scalar_chain::add_insn): Do not + iterate over all defs of a reg. + (general_scalar_chain::replace_with_subreg): Remove. + (general_scalar_chain::replace_with_subreg_in_insn): Likewise. + (general_scalar_chain::make_vector_copies): Populate defs_map, + place copy only after defs that are used as vectors in the chain. + (general_scalar_chain::convert_reg): Emit a copy for a specific + def in a specific instruction. + (general_scalar_chain::convert_op): All reg uses are converted here. + (general_scalar_chain::convert_insn): Emit copies for scalar + uses of defs here. Replace uses with the copies we created. + Replace and convert the def. Adjust REG_DEAD notes, remove + REG_EQUIV/EQUAL notes. + (general_scalar_chain::convert_registers): Only handle copies + into the chain here. + 2019-08-26 Robin Dapp * match.pd: Add (T)(A) + CST -> (T)(A + CST). diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c index fb7ac1b7d10..b70757e3fef 100644 --- a/gcc/config/i386/i386-features.c +++ b/gcc/config/i386/i386-features.c @@ -416,13 +416,9 @@ scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid) iterates over all refs to look for dual-mode regs. Instead this should be done separately for all regs mentioned in the chain once. */ df_ref ref; - df_ref def; for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref)) if (!HARD_REGISTER_P (DF_REF_REG (ref))) - for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref)); - def; - def = DF_REF_NEXT_REG (def)) - analyze_register_chain (candidates, def); + analyze_register_chain (candidates, ref); for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref)) if (!DF_REF_REG_MEM_P (ref)) analyze_register_chain (candidates, ref); @@ -605,42 +601,6 @@ general_scalar_chain::compute_convert_gain () return gain; } -/* Replace REG in X with a V2DI subreg of NEW_REG. */ - -rtx -general_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg) -{ - if (x == reg) - return gen_rtx_SUBREG (vmode, new_reg, 0); - - /* But not in memory addresses. */ - if (MEM_P (x)) - return x; - - const char *fmt = GET_RTX_FORMAT (GET_CODE (x)); - int i, j; - for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--) - { - if (fmt[i] == 'e') - XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg); - else if (fmt[i] == 'E') - for (j = XVECLEN (x, i) - 1; j >= 0; j--) - XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j), - reg, new_reg); - } - - return x; -} - -/* Replace REG in INSN with a V2DI subreg of NEW_REG. */ - -void -general_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn, - rtx reg, rtx new_reg) -{ - replace_with_subreg (single_set (insn), reg, new_reg); -} - /* Insert generated conversion instruction sequence INSNS after instruction AFTER. New BB may be required in case instruction has EH region attached. */ @@ -691,204 +651,147 @@ general_scalar_chain::make_vector_copies (unsigned regno) rtx vreg = gen_reg_rtx (smode); df_ref ref; - for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) - if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) - { - start_sequence (); - if (!TARGET_INTER_UNIT_MOVES_TO_VEC) - { - rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP); - if (smode == DImode && !TARGET_64BIT) - { - emit_move_insn (adjust_address (tmp, SImode, 0), - gen_rtx_SUBREG (SImode, reg, 0)); - emit_move_insn (adjust_address (tmp, SImode, 4), - gen_rtx_SUBREG (SImode, reg, 4)); - } - else - emit_move_insn (copy_rtx (tmp), reg); - emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0), - gen_gpr_to_xmm_move_src (vmode, tmp))); - } - else if (!TARGET_64BIT && smode == DImode) - { - if (TARGET_SSE4_1) - { - emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0), - CONST0_RTX (V4SImode), - gen_rtx_SUBREG (SImode, reg, 0))); - emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0), - gen_rtx_SUBREG (V4SImode, vreg, 0), - gen_rtx_SUBREG (SImode, reg, 4), - GEN_INT (2))); - } - else - { - rtx tmp = gen_reg_rtx (DImode); - emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0), - CONST0_RTX (V4SImode), - gen_rtx_SUBREG (SImode, reg, 0))); - emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0), - CONST0_RTX (V4SImode), - gen_rtx_SUBREG (SImode, reg, 4))); - emit_insn (gen_vec_interleave_lowv4si - (gen_rtx_SUBREG (V4SImode, vreg, 0), - gen_rtx_SUBREG (V4SImode, vreg, 0), - gen_rtx_SUBREG (V4SImode, tmp, 0))); - } - } - else - emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0), - gen_gpr_to_xmm_move_src (vmode, reg))); - rtx_insn *seq = get_insns (); - end_sequence (); - rtx_insn *insn = DF_REF_INSN (ref); - emit_conversion_insns (seq, insn); - - if (dump_file) - fprintf (dump_file, - " Copied r%d to a vector register r%d for insn %d\n", - regno, REGNO (vreg), INSN_UID (insn)); - } - - for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) - if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) - { - rtx_insn *insn = DF_REF_INSN (ref); - replace_with_subreg_in_insn (insn, reg, vreg); - - if (dump_file) - fprintf (dump_file, " Replaced r%d with r%d in insn %d\n", - regno, REGNO (vreg), INSN_UID (insn)); - } -} - -/* Convert all definitions of register REGNO - and fix its uses. Scalar copies may be created - in case register is used in not convertible insn. */ - -void -general_scalar_chain::convert_reg (unsigned regno) -{ - bool scalar_copy = bitmap_bit_p (defs_conv, regno); - rtx reg = regno_reg_rtx[regno]; - rtx scopy = NULL_RTX; - df_ref ref; - bitmap conv; - - conv = BITMAP_ALLOC (NULL); - bitmap_copy (conv, insns); - - if (scalar_copy) - scopy = gen_reg_rtx (smode); + defs_map.put (reg, vreg); + /* For each insn defining REGNO, see if it is defined by an insn + not part of the chain but with uses in insns part of the chain + and insert a copy in that case. */ for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) { - rtx_insn *insn = DF_REF_INSN (ref); - rtx def_set = single_set (insn); - rtx src = SET_SRC (def_set); - rtx reg = DF_REF_REG (ref); + if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) + continue; + df_link *use; + for (use = DF_REF_CHAIN (ref); use; use = use->next) + if (!DF_REF_REG_MEM_P (use->ref) + && bitmap_bit_p (insns, DF_REF_INSN_UID (use->ref))) + break; + if (!use) + continue; - if (!MEM_P (src)) + start_sequence (); + if (!TARGET_INTER_UNIT_MOVES_TO_VEC) { - replace_with_subreg_in_insn (insn, reg, reg); - bitmap_clear_bit (conv, INSN_UID (insn)); + rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP); + if (smode == DImode && !TARGET_64BIT) + { + emit_move_insn (adjust_address (tmp, SImode, 0), + gen_rtx_SUBREG (SImode, reg, 0)); + emit_move_insn (adjust_address (tmp, SImode, 4), + gen_rtx_SUBREG (SImode, reg, 4)); + } + else + emit_move_insn (copy_rtx (tmp), reg); + emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0), + gen_gpr_to_xmm_move_src (vmode, tmp))); } - - if (scalar_copy) + else if (!TARGET_64BIT && smode == DImode) { - start_sequence (); - if (!TARGET_INTER_UNIT_MOVES_FROM_VEC) + if (TARGET_SSE4_1) { - rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP); - emit_move_insn (tmp, reg); - if (!TARGET_64BIT && smode == DImode) - { - emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0), - adjust_address (tmp, SImode, 0)); - emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4), - adjust_address (tmp, SImode, 4)); - } - else - emit_move_insn (scopy, copy_rtx (tmp)); + emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0), + CONST0_RTX (V4SImode), + gen_rtx_SUBREG (SImode, reg, 0))); + emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0), + gen_rtx_SUBREG (V4SImode, vreg, 0), + gen_rtx_SUBREG (SImode, reg, 4), + GEN_INT (2))); } - else if (!TARGET_64BIT && smode == DImode) + else { - if (TARGET_SSE4_1) - { - rtx tmp = gen_rtx_PARALLEL (VOIDmode, - gen_rtvec (1, const0_rtx)); - emit_insn - (gen_rtx_SET - (gen_rtx_SUBREG (SImode, scopy, 0), - gen_rtx_VEC_SELECT (SImode, - gen_rtx_SUBREG (V4SImode, reg, 0), - tmp))); - - tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx)); - emit_insn - (gen_rtx_SET - (gen_rtx_SUBREG (SImode, scopy, 4), - gen_rtx_VEC_SELECT (SImode, - gen_rtx_SUBREG (V4SImode, reg, 0), - tmp))); - } - else - { - rtx vcopy = gen_reg_rtx (V2DImode); - emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0)); - emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0), - gen_rtx_SUBREG (SImode, vcopy, 0)); - emit_move_insn (vcopy, - gen_rtx_LSHIFTRT (V2DImode, - vcopy, GEN_INT (32))); - emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4), - gen_rtx_SUBREG (SImode, vcopy, 0)); - } + rtx tmp = gen_reg_rtx (DImode); + emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0), + CONST0_RTX (V4SImode), + gen_rtx_SUBREG (SImode, reg, 0))); + emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0), + CONST0_RTX (V4SImode), + gen_rtx_SUBREG (SImode, reg, 4))); + emit_insn (gen_vec_interleave_lowv4si + (gen_rtx_SUBREG (V4SImode, vreg, 0), + gen_rtx_SUBREG (V4SImode, vreg, 0), + gen_rtx_SUBREG (V4SImode, tmp, 0))); } - else - emit_move_insn (scopy, reg); - - rtx_insn *seq = get_insns (); - end_sequence (); - emit_conversion_insns (seq, insn); - - if (dump_file) - fprintf (dump_file, - " Copied r%d to a scalar register r%d for insn %d\n", - regno, REGNO (scopy), INSN_UID (insn)); } - } - - for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) - if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) - { - if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref))) - { - rtx_insn *insn = DF_REF_INSN (ref); + else + emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0), + gen_gpr_to_xmm_move_src (vmode, reg))); + rtx_insn *seq = get_insns (); + end_sequence (); + rtx_insn *insn = DF_REF_INSN (ref); + emit_conversion_insns (seq, insn); - rtx def_set = single_set (insn); - gcc_assert (def_set); + if (dump_file) + fprintf (dump_file, + " Copied r%d to a vector register r%d for insn %d\n", + regno, REGNO (vreg), INSN_UID (insn)); + } +} - rtx src = SET_SRC (def_set); - rtx dst = SET_DEST (def_set); +/* Copy the definition SRC of INSN inside the chain to DST for + scalar uses outside of the chain. */ - if (!MEM_P (dst) || !REG_P (src)) - replace_with_subreg_in_insn (insn, reg, reg); +void +general_scalar_chain::convert_reg (rtx_insn *insn, rtx dst, rtx src) +{ + start_sequence (); + if (!TARGET_INTER_UNIT_MOVES_FROM_VEC) + { + rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP); + emit_move_insn (tmp, src); + if (!TARGET_64BIT && smode == DImode) + { + emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0), + adjust_address (tmp, SImode, 0)); + emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4), + adjust_address (tmp, SImode, 4)); + } + else + emit_move_insn (dst, copy_rtx (tmp)); + } + else if (!TARGET_64BIT && smode == DImode) + { + if (TARGET_SSE4_1) + { + rtx tmp = gen_rtx_PARALLEL (VOIDmode, + gen_rtvec (1, const0_rtx)); + emit_insn + (gen_rtx_SET + (gen_rtx_SUBREG (SImode, dst, 0), + gen_rtx_VEC_SELECT (SImode, + gen_rtx_SUBREG (V4SImode, src, 0), + tmp))); + + tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx)); + emit_insn + (gen_rtx_SET + (gen_rtx_SUBREG (SImode, dst, 4), + gen_rtx_VEC_SELECT (SImode, + gen_rtx_SUBREG (V4SImode, src, 0), + tmp))); + } + else + { + rtx vcopy = gen_reg_rtx (V2DImode); + emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, src, 0)); + emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0), + gen_rtx_SUBREG (SImode, vcopy, 0)); + emit_move_insn (vcopy, + gen_rtx_LSHIFTRT (V2DImode, + vcopy, GEN_INT (32))); + emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4), + gen_rtx_SUBREG (SImode, vcopy, 0)); + } + } + else + emit_move_insn (dst, src); - bitmap_clear_bit (conv, INSN_UID (insn)); - } - } - /* Skip debug insns and uninitialized uses. */ - else if (DF_REF_CHAIN (ref) - && NONDEBUG_INSN_P (DF_REF_INSN (ref))) - { - gcc_assert (scopy); - replace_rtx (DF_REF_INSN (ref), reg, scopy); - df_insn_rescan (DF_REF_INSN (ref)); - } + rtx_insn *seq = get_insns (); + end_sequence (); + emit_conversion_insns (seq, insn); - BITMAP_FREE (conv); + if (dump_file) + fprintf (dump_file, + " Copied r%d to a scalar register r%d for insn %d\n", + REGNO (src), REGNO (dst), INSN_UID (insn)); } /* Convert operand OP in INSN. We should handle @@ -921,16 +824,6 @@ general_scalar_chain::convert_op (rtx *op, rtx_insn *insn) } else if (REG_P (*op)) { - /* We may have not converted register usage in case - this register has no definition. Otherwise it - should be converted in convert_reg. */ - df_ref ref; - FOR_EACH_INSN_USE (ref, insn) - if (DF_REF_REGNO (ref) == REGNO (*op)) - { - gcc_assert (!DF_REF_CHAIN (ref)); - break; - } *op = gen_rtx_SUBREG (vmode, *op, 0); } else if (CONST_INT_P (*op)) @@ -975,6 +868,32 @@ general_scalar_chain::convert_op (rtx *op, rtx_insn *insn) void general_scalar_chain::convert_insn (rtx_insn *insn) { + /* Generate copies for out-of-chain uses of defs. */ + for (df_ref ref = DF_INSN_DEFS (insn); ref; ref = DF_REF_NEXT_LOC (ref)) + if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref))) + { + df_link *use; + for (use = DF_REF_CHAIN (ref); use; use = use->next) + if (DF_REF_REG_MEM_P (use->ref) + || !bitmap_bit_p (insns, DF_REF_INSN_UID (use->ref))) + break; + if (use) + convert_reg (insn, DF_REF_REG (ref), + *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)])); + } + + /* Replace uses in this insn with the defs we use in the chain. */ + for (df_ref ref = DF_INSN_USES (insn); ref; ref = DF_REF_NEXT_LOC (ref)) + if (!DF_REF_REG_MEM_P (ref)) + if (rtx *vreg = defs_map.get (regno_reg_rtx[DF_REF_REGNO (ref)])) + { + /* Also update a corresponding REG_DEAD note. */ + rtx note = find_reg_note (insn, REG_DEAD, DF_REF_REG (ref)); + if (note) + XEXP (note, 0) = *vreg; + *DF_REF_REAL_LOC (ref) = *vreg; + } + rtx def_set = single_set (insn); rtx src = SET_SRC (def_set); rtx dst = SET_DEST (def_set); @@ -988,6 +907,20 @@ general_scalar_chain::convert_insn (rtx_insn *insn) emit_conversion_insns (gen_move_insn (dst, tmp), insn); dst = gen_rtx_SUBREG (vmode, tmp, 0); } + else if (REG_P (dst)) + { + /* Replace the definition with a SUBREG to the definition we + use inside the chain. */ + rtx *vdef = defs_map.get (dst); + if (vdef) + dst = *vdef; + dst = gen_rtx_SUBREG (vmode, dst, 0); + /* IRA doesn't like to have REG_EQUAL/EQUIV notes when the SET_DEST + is a non-REG_P. So kill those off. */ + rtx note = find_reg_equal_equiv_note (insn); + if (note) + remove_note (insn, note); + } switch (GET_CODE (src)) { @@ -1045,20 +978,15 @@ general_scalar_chain::convert_insn (rtx_insn *insn) case COMPARE: src = SUBREG_REG (XEXP (XEXP (src, 0), 0)); - gcc_assert ((REG_P (src) && GET_MODE (src) == DImode) - || (SUBREG_P (src) && GET_MODE (src) == V2DImode)); - - if (REG_P (src)) - subreg = gen_rtx_SUBREG (V2DImode, src, 0); - else - subreg = copy_rtx_if_shared (src); + gcc_assert (REG_P (src) && GET_MODE (src) == DImode); + subreg = gen_rtx_SUBREG (V2DImode, src, 0); emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg), copy_rtx_if_shared (subreg), copy_rtx_if_shared (subreg)), insn); dst = gen_rtx_REG (CCmode, FLAGS_REG); - src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src), - copy_rtx_if_shared (src)), + src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (subreg), + copy_rtx_if_shared (subreg)), UNSPEC_PTEST); break; @@ -1217,16 +1145,15 @@ timode_scalar_chain::convert_insn (rtx_insn *insn) df_insn_rescan (insn); } +/* Generate copies from defs used by the chain but not defined therein. + Also populates defs_map which is used later by convert_insn. */ + void general_scalar_chain::convert_registers () { bitmap_iterator bi; unsigned id; - - EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi) - convert_reg (id); - - EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi) + EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi) make_vector_copies (id); } diff --git a/gcc/config/i386/i386-features.h b/gcc/config/i386/i386-features.h index c85ac45e347..8381efe812c 100644 --- a/gcc/config/i386/i386-features.h +++ b/gcc/config/i386/i386-features.h @@ -171,12 +171,11 @@ class general_scalar_chain : public scalar_chain : scalar_chain (smode_, vmode_) {} int compute_convert_gain (); private: + hash_map defs_map; void mark_dual_mode_def (df_ref def); - rtx replace_with_subreg (rtx x, rtx reg, rtx subreg); - void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg); void convert_insn (rtx_insn *insn); void convert_op (rtx *op, rtx_insn *insn); - void convert_reg (unsigned regno); + void convert_reg (rtx_insn *insn, rtx dst, rtx src); void make_vector_copies (unsigned regno); void convert_registers (); int vector_const_cost (rtx exp); -- 2.30.2