From 0b1fe8cf6f1dde656c505dde6d27279dff388962 Mon Sep 17 00:00:00 2001
From: Richard Sandiford <richard.sandiford@arm.com>
Date: Tue, 13 Aug 2019 21:33:51 +0000
Subject: [PATCH] Optimise constant IFN_WHILE_ULTs

This patch is a combination of two changes that have to be committed
as a single unit:

(1) Try to fold IFN_WHILE_ULTs with constant arguments to a VECTOR_CST
    (which is always possible for fixed-length vectors but is not
    necessarily so for variable-length vectors)

(2) Make the SVE port recognise constants that map to PTRUE VLn,
    which includes those generated by the new fold.

(2) can't be tested without (1) and (1) would be a significant
pessimisation without (2).

The target-specific parts also start moving towards doing predicate
manipulation in a canonical VNx16BImode form, using rtx_vector_builders.

2019-08-13  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* tree.h (build_vector_a_then_b): Declare.
	* tree.c (build_vector_a_then_b): New function.
	* fold-const-call.c (fold_while_ult): Likewise.
	(fold_const_call): Use it to handle IFN_WHILE_ULT.
	* config/aarch64/aarch64-protos.h (AARCH64_FOR_SVPATTERN): New macro.
	(aarch64_svpattern): New enum.
	* config/aarch64/aarch64-sve.md (mov<PRED_ALL:mode>): Pass
	constants through aarch64_expand_mov_immediate.
	(*aarch64_sve_mov<PRED_ALL:mode>): Use aarch64_mov_operand rather
	than general_operand as the predicate for operand 1.
	(while_ult<GPI:mode><PRED_ALL:mode>): Add a '@' marker.
	* config/aarch64/aarch64.c (simd_immediate_info::PTRUE): New
	insn_type.
	(simd_immediate_info::simd_immediate_info): New overload that
	takes a scalar_int_mode and an svpattern.
	(simd_immediate_info::u): Add a "pattern" field.
	(svpattern_token): New function.
	(aarch64_get_sve_pred_bits, aarch64_widest_sve_pred_elt_size)
	(aarch64_partial_ptrue_length, aarch64_svpattern_for_vl)
	(aarch64_sve_move_pred_via_while): New functions.
	(aarch64_expand_mov_immediate): Try using
	aarch64_sve_move_pred_via_while for predicates that contain N ones
	followed by M zeros but that do not correspond to a VLnnn pattern.
	(aarch64_sve_pred_valid_immediate): New function.
	(aarch64_simd_valid_immediate): Use it instead of dealing directly
	with PTRUE and PFALSE.
	(aarch64_output_sve_mov_immediate): Handle new simd_immediate_info
	forms.

gcc/testsuite/
	* gcc.target/aarch64/sve/spill_2.c: Increase iteration counts
	beyond the range of a PTRUE.
	* gcc.target/aarch64/sve/while_6.c: New test.
	* gcc.target/aarch64/sve/while_7.c: Likewise.
	* gcc.target/aarch64/sve/while_8.c: Likewise.
	* gcc.target/aarch64/sve/while_9.c: Likewise.
	* gcc.target/aarch64/sve/while_10.c: Likewise.
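As an illustration of the combined effect (this sketch is adapted from
the new while_6.c test and is not itself part of the committed sources;
the predicate register number is up to the compiler): for a loop with a
small constant trip count, the IFN_WHILE_ULT mask now folds to a
VECTOR_CST with N leading set bits, which the SVE port emits as a
single PTRUE instead of a WHILELO:

    #include <stdint.h>

    void
    vec_while_int8 (int8_t *restrict a)
    {
      /* With SVE vectorisation, the governing predicate for this
	 constant-trip-count loop has 7 leading set bits, so it is now
	 emitted as "ptrue pN.b, vl7" rather than being computed by a
	 "whilelo" instruction.  */
      for (int i = 0; i < 7; ++i)
	a[i] += 1;
    }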
From-SVN: r274402
---
 gcc/ChangeLog                                 |  31 ++
 gcc/config/aarch64/aarch64-protos.h           |  27 ++
 gcc/config/aarch64/aarch64-sve.md             |  10 +-
 gcc/config/aarch64/aarch64.c                  | 275 ++++++++++++++++--
 gcc/fold-const-call.c                         |  38 +++
 gcc/testsuite/ChangeLog                       |  10 +
 .../gcc.target/aarch64/sve/spill_2.c          |  17 +-
 .../gcc.target/aarch64/sve/while_10.c         |  25 ++
 .../gcc.target/aarch64/sve/while_6.c          |  25 ++
 .../gcc.target/aarch64/sve/while_7.c          |  25 ++
 .../gcc.target/aarch64/sve/while_8.c          |  25 ++
 .../gcc.target/aarch64/sve/while_9.c          |  25 ++
 gcc/tree.c                                    |  17 ++
 gcc/tree.h                                    |   1 +
 14 files changed, 518 insertions(+), 33 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/while_10.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/while_6.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/while_7.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/while_8.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/while_9.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 070986ec3f7..307f5360ea0 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,34 @@
+2019-08-13  Richard Sandiford  <richard.sandiford@arm.com>
+
+	* tree.h (build_vector_a_then_b): Declare.
+	* tree.c (build_vector_a_then_b): New function.
+	* fold-const-call.c (fold_while_ult): Likewise.
+	(fold_const_call): Use it to handle IFN_WHILE_ULT.
+	* config/aarch64/aarch64-protos.h (AARCH64_FOR_SVPATTERN): New macro.
+	(aarch64_svpattern): New enum.
+	* config/aarch64/aarch64-sve.md (mov<PRED_ALL:mode>): Pass
+	constants through aarch64_expand_mov_immediate.
+	(*aarch64_sve_mov<PRED_ALL:mode>): Use aarch64_mov_operand rather
+	than general_operand as the predicate for operand 1.
+	(while_ult<GPI:mode><PRED_ALL:mode>): Add a '@' marker.
+	* config/aarch64/aarch64.c (simd_immediate_info::PTRUE): New
+	insn_type.
+	(simd_immediate_info::simd_immediate_info): New overload that
+	takes a scalar_int_mode and an svpattern.
+	(simd_immediate_info::u): Add a "pattern" field.
+	(svpattern_token): New function.
+	(aarch64_get_sve_pred_bits, aarch64_widest_sve_pred_elt_size)
+	(aarch64_partial_ptrue_length, aarch64_svpattern_for_vl)
+	(aarch64_sve_move_pred_via_while): New functions.
+	(aarch64_expand_mov_immediate): Try using
+	aarch64_sve_move_pred_via_while for predicates that contain N ones
+	followed by M zeros but that do not correspond to a VLnnn pattern.
+	(aarch64_sve_pred_valid_immediate): New function.
+	(aarch64_simd_valid_immediate): Use it instead of dealing directly
+	with PTRUE and PFALSE.
+	(aarch64_output_sve_mov_immediate): Handle new simd_immediate_info
+	forms.
+
 2019-08-13  Iain Sandoe  <iain@sandoe.co.uk>
 
 	* config/darwin.c (machopic_indirect_call_target): Rename symbol stub
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index ad818a4ec7f..86d53c5ce1e 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -406,6 +406,33 @@ extern enum aarch64_key_type aarch64_ra_sign_key;
 
 extern struct tune_params aarch64_tune_params;
 
+/* The available SVE predicate patterns, known in the ACLE as "svpattern".  */
+#define AARCH64_FOR_SVPATTERN(T) \
+  T (POW2, pow2, 0) \
+  T (VL1, vl1, 1) \
+  T (VL2, vl2, 2) \
+  T (VL3, vl3, 3) \
+  T (VL4, vl4, 4) \
+  T (VL5, vl5, 5) \
+  T (VL6, vl6, 6) \
+  T (VL7, vl7, 7) \
+  T (VL8, vl8, 8) \
+  T (VL16, vl16, 9) \
+  T (VL32, vl32, 10) \
+  T (VL64, vl64, 11) \
+  T (VL128, vl128, 12) \
+  T (VL256, vl256, 13) \
+  T (MUL4, mul4, 29) \
+  T (MUL3, mul3, 30) \
+  T (ALL, all, 31)
+
+#define AARCH64_SVENUM(UPPER, LOWER, VALUE) AARCH64_SV_##UPPER = VALUE,
+enum aarch64_svpattern {
+  AARCH64_FOR_SVPATTERN (AARCH64_SVENUM)
+  AARCH64_NUM_SVPATTERNS
+};
+#undef AARCH64_SVENUM
+
 void aarch64_post_cfi_startproc (void);
 poly_int64 aarch64_initial_elimination_offset (unsigned, unsigned);
 int aarch64_get_condition_code (rtx);
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 950f39781af..53d93a367db 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -481,12 +481,18 @@
   {
     if (GET_CODE (operands[0]) == MEM)
       operands[1] = force_reg (<MODE>mode, operands[1]);
+
+    if (CONSTANT_P (operands[1]))
+      {
+	aarch64_expand_mov_immediate (operands[0], operands[1]);
+	DONE;
+      }
   }
 )
 
 (define_insn "*aarch64_sve_mov<mode>"
   [(set (match_operand:PRED_ALL 0 "nonimmediate_operand" "=Upa, m, Upa, Upa")
-	(match_operand:PRED_ALL 1 "general_operand" "Upa, Upa, m, Dn"))]
+	(match_operand:PRED_ALL 1 "aarch64_mov_operand" "Upa, Upa, m, Dn"))]
   "TARGET_SVE
    && (register_operand (operands[0], <MODE>mode)
        || register_operand (operands[1], <MODE>mode))"
@@ -2923,7 +2929,7 @@
 
 ;; Set element I of the result if operand1 + J < operand2 for all J in [0, I],
 ;; with the comparison being unsigned.
-(define_insn "while_ult<GPI:mode><PRED_ALL:mode>"
+(define_insn "@while_ult<GPI:mode><PRED_ALL:mode>"
   [(set (match_operand:PRED_ALL 0 "register_operand" "=Upa")
 	(unspec:PRED_ALL [(match_operand:GPI 1 "aarch64_reg_or_zero" "rZ")
 			  (match_operand:GPI 2 "aarch64_reg_or_zero" "rZ")]
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index fe968459241..2b3ea9f164c 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -83,7 +83,7 @@
 /* Information about a legitimate vector immediate operand.  */
 struct simd_immediate_info
 {
-  enum insn_type { MOV, MVN, INDEX };
+  enum insn_type { MOV, MVN, INDEX, PTRUE };
   enum modifier_type { LSL, MSL };
 
   simd_immediate_info () {}
@@ -92,6 +92,7 @@ struct simd_immediate_info
 		       insn_type = MOV, modifier_type = LSL,
 		       unsigned int = 0);
   simd_immediate_info (scalar_mode, rtx, rtx);
+  simd_immediate_info (scalar_int_mode, aarch64_svpattern);
 
   /* The mode of the elements.  */
   scalar_mode elt_mode;
@@ -120,6 +121,9 @@ struct simd_immediate_info
 	 subsequent element.  */
       rtx base, step;
     } index;
+
+    /* For PTRUE.  */
+    aarch64_svpattern pattern;
   } u;
 };
 
@@ -159,6 +163,16 @@ inline simd_immediate_info
   u.index.step = step_in;
 }
 
+/* Construct a predicate that controls elements of mode ELT_MODE_IN
+   and has PTRUE pattern PATTERN_IN.  */
+inline simd_immediate_info
+::simd_immediate_info (scalar_int_mode elt_mode_in,
+		       aarch64_svpattern pattern_in)
+  : elt_mode (elt_mode_in), insn (PTRUE)
+{
+  u.pattern = pattern_in;
+}
+
 /* The current code model.  */
 enum aarch64_code_model aarch64_cmodel;
 
@@ -1334,6 +1348,22 @@
   "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
 };
 
+/* Return the assembly token for svpattern value VALUE.  */
+
+static const char *
+svpattern_token (enum aarch64_svpattern pattern)
+{
+  switch (pattern)
+    {
+#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
+    AARCH64_FOR_SVPATTERN (CASE)
+#undef CASE
+    case AARCH64_NUM_SVPATTERNS:
+      break;
+    }
+  gcc_unreachable ();
+}
+
 /* Generate code to enable conditional branches in functions over 1 MiB.  */
 const char *
 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
@@ -2529,6 +2559,146 @@ aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
     }
 }
 
+/* Return true if predicate value X is a constant in which every element
+   is a CONST_INT.  When returning true, describe X in BUILDER as a VNx16BI
+   value, i.e. as a predicate in which all bits are significant.  */
+
+static bool
+aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
+{
+  if (GET_CODE (x) != CONST_VECTOR)
+    return false;
+
+  unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
+					     GET_MODE_NUNITS (GET_MODE (x)));
+  unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
+  unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
+  builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
+
+  unsigned int nelts = const_vector_encoded_nelts (x);
+  for (unsigned int i = 0; i < nelts; ++i)
+    {
+      rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
+      if (!CONST_INT_P (elt))
+	return false;
+
+      builder.quick_push (elt);
+      for (unsigned int j = 1; j < factor; ++j)
+	builder.quick_push (const0_rtx);
+    }
+  builder.finalize ();
+  return true;
+}
+
+/* BUILDER contains a predicate constant of mode VNx16BI.  Return the
+   widest predicate element size it can have (that is, the largest size
+   for which each element would still be 0 or 1).  */
+
+unsigned int
+aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
+{
+  /* Start with the most optimistic assumption: that we only need
+     one bit per pattern.  This is what we will use if only the first
+     bit in each pattern is ever set.  */
+  unsigned int mask = GET_MODE_SIZE (DImode);
+  mask |= builder.npatterns ();
+
+  /* Look for set bits.  */
+  unsigned int nelts = builder.encoded_nelts ();
+  for (unsigned int i = 1; i < nelts; ++i)
+    if (INTVAL (builder.elt (i)) != 0)
+      {
+	if (i & 1)
+	  return 1;
+	mask |= i;
+      }
+  return mask & -mask;
+}
+
+/* BUILDER is a predicate constant of mode VNx16BI.  Consider the value
+   that the constant would have with predicate element size ELT_SIZE
+   (ignoring the upper bits in each element) and return:
+
+   * -1 if all bits are set
+   * N if the predicate has N leading set bits followed by all clear bits
+   * 0 if the predicate does not have any of these forms.  */
+
+int
+aarch64_partial_ptrue_length (rtx_vector_builder &builder,
+			      unsigned int elt_size)
+{
+  /* If nelts_per_pattern is 3, we have set bits followed by clear bits
+     followed by set bits.  */
+  if (builder.nelts_per_pattern () == 3)
+    return 0;
+
+  /* Skip over leading set bits.  */
+  unsigned int nelts = builder.encoded_nelts ();
+  unsigned int i = 0;
+  for (; i < nelts; i += elt_size)
+    if (INTVAL (builder.elt (i)) == 0)
+      break;
+  unsigned int vl = i / elt_size;
+
+  /* Check for the all-true case.  */
+  if (i == nelts)
+    return -1;
+
+  /* If nelts_per_pattern is 1, then either VL is zero, or we have a
+     repeating pattern of set bits followed by clear bits.  */
+  if (builder.nelts_per_pattern () != 2)
+    return 0;
+
+  /* We have a "foreground" value and a duplicated "background" value.
+     If the background might repeat and the last set bit belongs to it,
+     we might have set bits followed by clear bits followed by set bits.  */
+  if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
+    return 0;
+
+  /* Make sure that the rest are all clear.  */
+  for (; i < nelts; i += elt_size)
+    if (INTVAL (builder.elt (i)) != 0)
+      return 0;
+
+  return vl;
+}
+
+/* See if there is an svpattern that encodes an SVE predicate of mode
+   PRED_MODE in which the first VL bits are set and the rest are clear.
+   Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
+   A VL of -1 indicates an all-true vector.  */
+
+aarch64_svpattern
+aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
+{
+  if (vl < 0)
+    return AARCH64_SV_ALL;
+
+  if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
+    return AARCH64_NUM_SVPATTERNS;
+
+  if (vl >= 1 && vl <= 8)
+    return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
+
+  if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
+    return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
+
+  int max_vl;
+  if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
+    {
+      if (vl == (max_vl / 3) * 3)
+	return AARCH64_SV_MUL3;
+      /* These would only trigger for non-power-of-2 lengths.  */
+      if (vl == (max_vl & -4))
+	return AARCH64_SV_MUL4;
+      if (vl == (1 << floor_log2 (max_vl)))
+	return AARCH64_SV_POW2;
+      if (vl == max_vl)
+	return AARCH64_SV_ALL;
+    }
+  return AARCH64_NUM_SVPATTERNS;
+}
+
 /* Return an all-true predicate register of mode MODE.  */
 
 rtx
@@ -3447,6 +3617,17 @@ aarch64_expand_sve_const_vector (rtx target, rtx src)
   return target;
 }
 
+/* Use WHILE to set predicate register DEST so that the first VL bits
+   are set and the rest are clear.  */
+
+static void
+aarch64_sve_move_pred_via_while (rtx dest, unsigned int vl)
+{
+  rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
+  emit_insn (gen_while_ult (DImode, GET_MODE (dest),
+			    dest, const0_rtx, limit));
+}
+
 /* Set DEST to immediate IMM.  */
 
 void
@@ -3580,6 +3761,19 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
 	  return;
 	}
 
+      rtx_vector_builder builder;
+      if (GET_MODE_CLASS (GET_MODE (imm)) == MODE_VECTOR_BOOL
+	  && aarch64_get_sve_pred_bits (builder, imm))
+	{
+	  unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
+	  int vl = aarch64_partial_ptrue_length (builder, elt_size);
+	  if (vl > 0)
+	    {
+	      aarch64_sve_move_pred_via_while (dest, vl);
+	      return;
+	    }
+	}
+
       if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
 	if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
 	  {
@@ -14776,6 +14970,44 @@ aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
   return false;
 }
 
+/* Return true if X is a valid SVE predicate.  If INFO is nonnull, use
+   it to describe valid immediates.  */
+
+static bool
+aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
+{
+  if (x == CONST0_RTX (GET_MODE (x)))
+    {
+      if (info)
+	*info = simd_immediate_info (DImode, 0);
+      return true;
+    }
+
+  /* Analyze the value as a VNx16BImode.  This should be relatively
+     efficient, since rtx_vector_builder has enough built-in capacity
+     to store all VLA predicate constants without needing the heap.  */
+  rtx_vector_builder builder;
+  if (!aarch64_get_sve_pred_bits (builder, x))
+    return false;
+
+  unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
+  if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
+    {
+      machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
+      aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
+      if (pattern != AARCH64_NUM_SVPATTERNS)
+	{
+	  if (info)
+	    {
+	      scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
+	      *info = simd_immediate_info (int_mode, pattern);
+	    }
+	  return true;
+	}
+    }
+  return false;
+}
+
 /* Return true if OP is a valid SIMD immediate for the operation
    described by WHICH.  If INFO is nonnull, use it to describe valid
    immediates.  */
@@ -14788,6 +15020,9 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
   if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
     return false;
 
+  if (vec_flags & VEC_SVE_PRED)
+    return aarch64_sve_pred_valid_immediate (op, info);
+
   scalar_mode elt_mode = GET_MODE_INNER (mode);
   rtx base, step;
   unsigned int n_elts;
@@ -14812,21 +15047,6 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
   else
     return false;
 
-  /* Handle PFALSE and PTRUE.  */
-  if (vec_flags & VEC_SVE_PRED)
-    {
-      if (op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode))
-	{
-	  if (info)
-	    {
-	      scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
-	      *info = simd_immediate_info (int_mode, op == CONSTM1_RTX (mode));
-	    }
-	  return true;
-	}
-      return false;
-    }
-
   scalar_float_mode elt_float_mode;
   if (n_elts == 1
       && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
@@ -16570,14 +16790,23 @@ aarch64_output_sve_mov_immediate (rtx const_vector)
   if (aarch64_sve_pred_mode_p (vec_mode))
     {
       static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
-      unsigned int total_bytes;
-      if (info.u.mov.value == const0_rtx)
-	snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
-      else if (BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
-	snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
-		  total_bytes / GET_MODE_SIZE (info.elt_mode));
+      if (info.insn == simd_immediate_info::MOV)
+	{
+	  gcc_assert (info.u.mov.value == const0_rtx);
+	  snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
+	}
       else
-	snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", element_char);
+	{
+	  gcc_assert (info.insn == simd_immediate_info::PTRUE);
+	  unsigned int total_bytes;
+	  if (info.u.pattern == AARCH64_SV_ALL
+	      && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
+	    snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
+		      total_bytes / GET_MODE_SIZE (info.elt_mode));
+	  else
+	    snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
+		      svpattern_token (info.u.pattern));
+	}
       return buf;
     }
diff --git a/gcc/fold-const-call.c b/gcc/fold-const-call.c
index 702c8b4057a..e21d8e11072 100644
--- a/gcc/fold-const-call.c
+++ b/gcc/fold-const-call.c
@@ -689,6 +689,36 @@ fold_const_vec_convert (tree ret_type, tree arg)
   return elts.build ();
 }
 
+/* Try to evaluate:
+
+      IFN_WHILE_ULT (ARG0, ARG1, (TYPE) { ... })
+
+   Return the value on success and null on failure.  */
+
+static tree
+fold_while_ult (tree type, poly_uint64 arg0, poly_uint64 arg1)
+{
+  if (known_ge (arg0, arg1))
+    return build_zero_cst (type);
+
+  if (maybe_ge (arg0, arg1))
+    return NULL_TREE;
+
+  poly_uint64 diff = arg1 - arg0;
+  poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
+  if (known_ge (diff, nelts))
+    return build_all_ones_cst (type);
+
+  unsigned HOST_WIDE_INT const_diff;
+  if (known_le (diff, nelts) && diff.is_constant (&const_diff))
+    {
+      tree minus_one = build_minus_one_cst (TREE_TYPE (type));
+      tree zero = build_zero_cst (TREE_TYPE (type));
+      return build_vector_a_then_b (type, const_diff, minus_one, zero);
+    }
+  return NULL_TREE;
+}
+
 /* Try to evaluate:
 
       *RESULT = FN (*ARG)
@@ -1782,6 +1812,14 @@ fold_const_call (combined_fn fn, tree type, tree arg0, tree arg1, tree arg2)
 	}
       return NULL_TREE;
 
+    case CFN_WHILE_ULT:
+      {
+	poly_uint64 parg0, parg1;
+	if (poly_int_tree_p (arg0, &parg0) && poly_int_tree_p (arg1, &parg1))
+	  return fold_while_ult (type, parg0, parg1);
+	return NULL_TREE;
+      }
+
     default:
       return fold_const_call_1 (fn, type, arg0, arg1, arg2);
     }
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index c0a7d2dc236..1147dc86a0e 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,13 @@
+2019-08-13  Richard Sandiford  <richard.sandiford@arm.com>
+
+	* gcc.target/aarch64/sve/spill_2.c: Increase iteration counts
+	beyond the range of a PTRUE.
+	* gcc.target/aarch64/sve/while_6.c: New test.
+	* gcc.target/aarch64/sve/while_7.c: Likewise.
+	* gcc.target/aarch64/sve/while_8.c: Likewise.
+	* gcc.target/aarch64/sve/while_9.c: Likewise.
+	* gcc.target/aarch64/sve/while_10.c: Likewise.
+
 2019-08-13  Steven G. Kargl  <kargl@gcc.gnu.org>
 
 	PR fortran/88072
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/spill_2.c b/gcc/testsuite/gcc.target/aarch64/sve/spill_2.c
index 28fcc442975..fcd481611ec 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/spill_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/spill_2.c
@@ -9,29 +9,30 @@ void consumer (void *);
   void								\
   multi_loop_##TYPE (TYPE *x, TYPE val)			\
   {								\
-    for (int i = 0; i < 7; ++i)				\
+    for (int i = 0; i < 9; ++i)				\
       x[i] += val;						\
     consumer (x);						\
-    for (int i = 0; i < 7; ++i)				\
+    for (int i = 0; i < 9; ++i)				\
      x[i] += val;						\
     consumer (x);						\
-    for (int i = 0; i < 7; ++i)				\
+    for (int i = 0; i < 9; ++i)				\
      x[i] += val;						\
     consumer (x);						\
   }
 
 /* One iteration is enough.  */
 TEST_LOOP (uint8_t);
+/* Two iterations are enough.  We specialize the second two loops based
+   on whether the first executes once or twice.  */
 TEST_LOOP (uint16_t);
-/* Two iterations are enough.  Complete unrolling makes sense
-   even at -O2.  */
+/* Three iterations are needed; ought to stay a loop.  */
 TEST_LOOP (uint32_t);
-/* Four iterations are needed; ought to stay a loop.  */
+/* Five iterations are needed; ought to stay a loop.  */
 TEST_LOOP (uint64_t);
 
 /* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.b} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.h} 3 } } */
-/* { dg-final { scan-assembler {\twhilelo\tp[0-9]\.s} } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.h} 8 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.s} 6 } } */
 /* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.d} 6 } } */
 /* { dg-final { scan-assembler-not {\tldr\tz[0-9]} } } */
 /* { dg-final { scan-assembler-not {\tstr\tz[0-9]} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_10.c b/gcc/testsuite/gcc.target/aarch64/sve/while_10.c
new file mode 100644
index 00000000000..eaed326f999
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/while_10.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=512" } */
+
+#include <stdint.h>
+
+#define ADD_LOOP(TYPE, COUNT)			\
+  TYPE __attribute__ ((noinline, noclone))	\
+  vec_while_##TYPE (TYPE *restrict a)		\
+  {						\
+    for (int i = 0; i < COUNT; ++i)		\
+      a[i] += 1;				\
+  }
+
+#define TEST_ALL(T)				\
+  T (int8_t, 63)				\
+  T (int16_t, 30)				\
+  T (int32_t, 15)				\
+  T (int64_t, 6)
+
+TEST_ALL (ADD_LOOP)
+
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, mul3\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, mul3\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s, mul3\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d, vl6\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_6.c b/gcc/testsuite/gcc.target/aarch64/sve/while_6.c
new file mode 100644
index 00000000000..b4cc596efe7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/while_6.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */
+
+#include <stdint.h>
+
+#define ADD_LOOP(TYPE)				\
+  TYPE __attribute__ ((noinline, noclone))	\
+  vec_while_##TYPE (TYPE *restrict a)		\
+  {						\
+    for (int i = 0; i < 7; ++i)			\
+      a[i] += 1;				\
+  }
+
+#define TEST_ALL(T)				\
+  T (int8_t)					\
+  T (int16_t)					\
+  T (int32_t)					\
+  T (int64_t)
+
+TEST_ALL (ADD_LOOP)
+
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl7\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl7\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_7.c b/gcc/testsuite/gcc.target/aarch64/sve/while_7.c
new file mode 100644
index 00000000000..d5ffb66a142
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/while_7.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */
+
+#include <stdint.h>
+
+#define ADD_LOOP(TYPE)				\
+  TYPE __attribute__ ((noinline, noclone))	\
+  vec_while_##TYPE (TYPE *restrict a)		\
+  {						\
+    for (int i = 0; i < 8; ++i)			\
+      a[i] += 1;				\
+  }
+
+#define TEST_ALL(T)				\
+  T (int8_t)					\
+  T (int16_t)					\
+  T (int32_t)					\
+  T (int64_t)
+
+TEST_ALL (ADD_LOOP)
+
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl8\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl8\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_8.c b/gcc/testsuite/gcc.target/aarch64/sve/while_8.c
new file mode 100644
index 00000000000..1c11aa849a2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/while_8.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */
+
+#include <stdint.h>
+
+#define ADD_LOOP(TYPE)				\
+  TYPE __attribute__ ((noinline, noclone))	\
+  vec_while_##TYPE (TYPE *restrict a)		\
+  {						\
+    for (int i = 0; i < 9; ++i)			\
+      a[i] += 1;				\
+  }
+
+#define TEST_ALL(T)				\
+  T (int8_t)					\
+  T (int16_t)					\
+  T (int32_t)					\
+  T (int64_t)
+
+TEST_ALL (ADD_LOOP)
+
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b,} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/while_9.c b/gcc/testsuite/gcc.target/aarch64/sve/while_9.c
new file mode 100644
index 00000000000..9a8e5fe12fb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/while_9.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */
+
+#include <stdint.h>
+
+#define ADD_LOOP(TYPE)				\
+  TYPE __attribute__ ((noinline, noclone))	\
+  vec_while_##TYPE (TYPE *restrict a)		\
+  {						\
+    for (int i = 0; i < 16; ++i)		\
+      a[i] += 1;				\
+  }
+
+#define TEST_ALL(T)				\
+  T (int8_t)					\
+  T (int16_t)					\
+  T (int32_t)					\
+  T (int64_t)
+
+TEST_ALL (ADD_LOOP)
+
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl16\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */
diff --git a/gcc/tree.c b/gcc/tree.c
index 8f80012c6e8..ae292281b1f 100644
--- a/gcc/tree.c
+++ b/gcc/tree.c
@@ -1981,6 +1981,23 @@ build_index_vector (tree vec_type, poly_uint64 base, poly_uint64 step)
   return v.build ();
 }
 
+/* Return a VECTOR_CST of type VEC_TYPE in which the first NUM_A
+   elements are A and the rest are B.  */
+
+tree
+build_vector_a_then_b (tree vec_type, unsigned int num_a, tree a, tree b)
+{
+  gcc_assert (known_le (num_a, TYPE_VECTOR_SUBPARTS (vec_type)));
+  unsigned int count = constant_lower_bound (TYPE_VECTOR_SUBPARTS (vec_type));
+  /* Optimize the constant case.  */
+  if ((count & 1) == 0 && TYPE_VECTOR_SUBPARTS (vec_type).is_constant ())
+    count /= 2;
+  tree_vector_builder builder (vec_type, count, 2);
+  for (unsigned int i = 0; i < count * 2; ++i)
+    builder.quick_push (i < num_a ? a : b);
+  return builder.build ();
+}
+
 /* Something has messed with the elements of CONSTRUCTOR C after
    it was built; calculate TREE_CONSTANT and TREE_SIDE_EFFECTS.  */
diff --git a/gcc/tree.h b/gcc/tree.h
index 94dbb95a78a..dd54f4d2af5 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -4314,6 +4314,7 @@ extern tree build_vector_from_val (tree, tree);
 extern tree build_uniform_cst (tree, tree);
 extern tree build_vec_series (tree, tree, tree);
 extern tree build_index_vector (tree, poly_uint64, poly_uint64);
+extern tree build_vector_a_then_b (tree, unsigned int, tree, tree);
 extern void recompute_constructor_flags (tree);
 extern void verify_constructor_flags (tree);
 extern tree build_constructor (tree, vec<constructor_elt, va_gc> * CXX_MEM_STAT_INFO);
-- 
2.30.2