+2019-11-16 Richard Sandiford <richard.sandiford@arm.com>
+
+ * config/aarch64/iterators.md (SVE_24, SVE_2, SVE_4): New mode
+ iterators.
+ * config/aarch64/aarch64-sve.md
+ (gather_load<SVE_FULL_SD:mode><v_int_equiv>): Extend to...
+ (gather_load<SVE_24:mode><v_int_container>): ...this.
+ (mask_gather_load<SVE_FULL_S:mode><v_int_equiv>): Extend to...
+ (mask_gather_load<SVE_4:mode><v_int_container>): ...this.
+ (mask_gather_load<SVE_FULL_D:mode><v_int_equiv>): Extend to...
+ (mask_gather_load<SVE_2:mode><v_int_container>): ...this.
+ (*mask_gather_load<SVE_2:mode><v_int_container>_<su>xtw_unpacked):
+ New pattern.
+ (*mask_gather_load<SVE_FULL_D:mode><v_int_equiv>_sxtw): Extend to...
+ (*mask_gather_load<SVE_2:mode><v_int_container>_sxtw): ...this.
+ Allow the nominal extension predicate to be different from the
+ load predicate.
+ (*mask_gather_load<SVE_FULL_D:mode><v_int_equiv>_uxtw): Extend to...
+ (*mask_gather_load<SVE_2:mode><v_int_container>_uxtw): ...this.
+
2019-11-16 Richard Sandiford <richard.sandiford@arm.com>
* config/aarch64/aarch64-sve.md
;; -------------------------------------------------------------------------
;; Unpredicated gather loads.
-(define_expand "gather_load<mode><v_int_equiv>"
- [(set (match_operand:SVE_FULL_SD 0 "register_operand")
- (unspec:SVE_FULL_SD
+(define_expand "gather_load<mode><v_int_container>"
+ [(set (match_operand:SVE_24 0 "register_operand")
+ (unspec:SVE_24
[(match_dup 5)
(match_operand:DI 1 "aarch64_sve_gather_offset_<Vesize>")
- (match_operand:<V_INT_EQUIV> 2 "register_operand")
+ (match_operand:<V_INT_CONTAINER> 2 "register_operand")
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>")
(mem:BLK (scratch))]
;; Predicated gather loads for 32-bit elements. Operand 3 is true for
;; unsigned extension and false for signed extension.
-(define_insn "mask_gather_load<mode><v_int_equiv>"
- [(set (match_operand:SVE_FULL_S 0 "register_operand" "=w, w, w, w, w, w")
- (unspec:SVE_FULL_S
+(define_insn "mask_gather_load<mode><v_int_container>"
+ [(set (match_operand:SVE_4 0 "register_operand" "=w, w, w, w, w, w")
+ (unspec:SVE_4
[(match_operand:VNx4BI 5 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl")
- (match_operand:DI 1 "aarch64_sve_gather_offset_w" "Z, vgw, rk, rk, rk, rk")
+ (match_operand:DI 1 "aarch64_sve_gather_offset_<Vesize>" "Z, vg<Vesize>, rk, rk, rk, rk")
(match_operand:VNx4SI 2 "register_operand" "w, w, w, w, w, w")
(match_operand:DI 3 "const_int_operand" "Ui1, Ui1, Z, Ui1, Z, Ui1")
- (match_operand:DI 4 "aarch64_gather_scale_operand_w" "Ui1, Ui1, Ui1, Ui1, i, i")
+ (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>" "Ui1, Ui1, Ui1, Ui1, i, i")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
"TARGET_SVE"
"@
- ld1w\t%0.s, %5/z, [%2.s]
- ld1w\t%0.s, %5/z, [%2.s, #%1]
- ld1w\t%0.s, %5/z, [%1, %2.s, sxtw]
- ld1w\t%0.s, %5/z, [%1, %2.s, uxtw]
- ld1w\t%0.s, %5/z, [%1, %2.s, sxtw %p4]
- ld1w\t%0.s, %5/z, [%1, %2.s, uxtw %p4]"
+ ld1<Vesize>\t%0.s, %5/z, [%2.s]
+ ld1<Vesize>\t%0.s, %5/z, [%2.s, #%1]
+ ld1<Vesize>\t%0.s, %5/z, [%1, %2.s, sxtw]
+ ld1<Vesize>\t%0.s, %5/z, [%1, %2.s, uxtw]
+ ld1<Vesize>\t%0.s, %5/z, [%1, %2.s, sxtw %p4]
+ ld1<Vesize>\t%0.s, %5/z, [%1, %2.s, uxtw %p4]"
)
;; Predicated gather loads for 64-bit elements. The value of operand 3
;; doesn't matter in this case.
-(define_insn "mask_gather_load<mode><v_int_equiv>"
- [(set (match_operand:SVE_FULL_D 0 "register_operand" "=w, w, w, w")
- (unspec:SVE_FULL_D
+(define_insn "mask_gather_load<mode><v_int_container>"
+ [(set (match_operand:SVE_2 0 "register_operand" "=w, w, w, w")
+ (unspec:SVE_2
[(match_operand:VNx2BI 5 "register_operand" "Upl, Upl, Upl, Upl")
- (match_operand:DI 1 "aarch64_sve_gather_offset_d" "Z, vgd, rk, rk")
+ (match_operand:DI 1 "aarch64_sve_gather_offset_<Vesize>" "Z, vg<Vesize>, rk, rk")
(match_operand:VNx2DI 2 "register_operand" "w, w, w, w")
(match_operand:DI 3 "const_int_operand")
- (match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, Ui1, Ui1, i")
+ (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>" "Ui1, Ui1, Ui1, i")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
"TARGET_SVE"
"@
- ld1d\t%0.d, %5/z, [%2.d]
- ld1d\t%0.d, %5/z, [%2.d, #%1]
- ld1d\t%0.d, %5/z, [%1, %2.d]
- ld1d\t%0.d, %5/z, [%1, %2.d, lsl %p4]"
+ ld1<Vesize>\t%0.d, %5/z, [%2.d]
+ ld1<Vesize>\t%0.d, %5/z, [%2.d, #%1]
+ ld1<Vesize>\t%0.d, %5/z, [%1, %2.d]
+ ld1<Vesize>\t%0.d, %5/z, [%1, %2.d, lsl %p4]"
)
-;; Likewise, but with the offset being sign-extended from 32 bits.
-(define_insn "*mask_gather_load<mode><v_int_equiv>_sxtw"
- [(set (match_operand:SVE_FULL_D 0 "register_operand" "=w, w")
- (unspec:SVE_FULL_D
+;; Likewise, but with the offset being extended from 32 bits.
+(define_insn_and_rewrite "*mask_gather_load<mode><v_int_container>_<su>xtw_unpacked"
+ [(set (match_operand:SVE_2 0 "register_operand" "=w, w")
+ (unspec:SVE_2
+ [(match_operand:VNx2BI 5 "register_operand" "Upl, Upl")
+ (match_operand:DI 1 "register_operand" "rk, rk")
+ (unspec:VNx2DI
+ [(match_operand 6)
+ (ANY_EXTEND:VNx2DI
+ (match_operand:VNx2SI 2 "register_operand" "w, w"))]
+ UNSPEC_PRED_X)
+ (match_operand:DI 3 "const_int_operand")
+ (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>" "Ui1, i")
+ (mem:BLK (scratch))]
+ UNSPEC_LD1_GATHER))]
+ "TARGET_SVE"
+ "@
+ ld1<Vesize>\t%0.d, %5/z, [%1, %2.d, <su>xtw]
+ ld1<Vesize>\t%0.d, %5/z, [%1, %2.d, <su>xtw %p4]"
+ "&& !CONSTANT_P (operands[6])"
+ {
+ operands[6] = CONSTM1_RTX (VNx2BImode);
+ }
+)
+
+;; Likewise, but with the offset being truncated to 32 bits and then
+;; sign-extended.
+(define_insn_and_rewrite "*mask_gather_load<mode><v_int_container>_sxtw"
+ [(set (match_operand:SVE_2 0 "register_operand" "=w, w")
+ (unspec:SVE_2
[(match_operand:VNx2BI 5 "register_operand" "Upl, Upl")
(match_operand:DI 1 "register_operand" "rk, rk")
(unspec:VNx2DI
- [(match_dup 5)
+ [(match_operand 6)
(sign_extend:VNx2DI
(truncate:VNx2SI
(match_operand:VNx2DI 2 "register_operand" "w, w")))]
UNSPEC_PRED_X)
(match_operand:DI 3 "const_int_operand")
- (match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, i")
+ (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>" "Ui1, i")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
"TARGET_SVE"
"@
- ld1d\t%0.d, %5/z, [%1, %2.d, sxtw]
- ld1d\t%0.d, %5/z, [%1, %2.d, sxtw %p4]"
+ ld1<Vesize>\t%0.d, %5/z, [%1, %2.d, sxtw]
+ ld1<Vesize>\t%0.d, %5/z, [%1, %2.d, sxtw %p4]"
+ "&& !CONSTANT_P (operands[6])"
+ {
+ operands[6] = CONSTM1_RTX (VNx2BImode);
+ }
)
-;; Likewise, but with the offset being zero-extended from 32 bits.
-(define_insn "*mask_gather_load<mode><v_int_equiv>_uxtw"
- [(set (match_operand:SVE_FULL_D 0 "register_operand" "=w, w")
- (unspec:SVE_FULL_D
+;; Likewise, but with the offset being truncated to 32 bits and then
+;; zero-extended.
+(define_insn "*mask_gather_load<mode><v_int_container>_uxtw"
+ [(set (match_operand:SVE_2 0 "register_operand" "=w, w")
+ (unspec:SVE_2
[(match_operand:VNx2BI 5 "register_operand" "Upl, Upl")
(match_operand:DI 1 "register_operand" "rk, rk")
(and:VNx2DI
(match_operand:VNx2DI 2 "register_operand" "w, w")
(match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate"))
(match_operand:DI 3 "const_int_operand")
- (match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, i")
+ (match_operand:DI 4 "aarch64_gather_scale_operand_<Vesize>" "Ui1, i")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
"TARGET_SVE"
"@
- ld1d\t%0.d, %5/z, [%1, %2.d, uxtw]
- ld1d\t%0.d, %5/z, [%1, %2.d, uxtw %p4]"
+ ld1<Vesize>\t%0.d, %5/z, [%1, %2.d, uxtw]
+ ld1<Vesize>\t%0.d, %5/z, [%1, %2.d, uxtw %p4]"
)
;; -------------------------------------------------------------------------
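
As an informal illustration (not part of the patch), the loop below is the kind of access the extended SVE_4 patterns now handle: 8-bit data gathered through 32-bit signed indices, in the style of gather_load_1.c.  Assuming -O2 -ftree-vectorize with SVE enabled and the usual cost decisions, it should compile to an ld1b gather along the lines of ld1b z0.s, p0/z, [x1, z1.s, sxtw]; the register numbers here are arbitrary.

#include <stdint.h>

/* Hypothetical example, not taken from the patch: an 8-bit gather whose
   offsets are 32-bit, corresponding to the SVE_4 / VNx4QI case.  */
void
f_uint8 (uint8_t *restrict dest, uint8_t *restrict src,
         int32_t *restrict indices, int n)
{
  for (int i = 0; i < n; ++i)
    dest[i] += src[indices[i]];
}
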
VNx4SI VNx2SI
VNx2DI])
+;; SVE modes with 2 or 4 elements.
+(define_mode_iterator SVE_24 [VNx2QI VNx2HI VNx2HF VNx2SI VNx2SF VNx2DI VNx2DF
+ VNx4QI VNx4HI VNx4HF VNx4SI VNx4SF])
+
+;; SVE modes with 2 elements.
+(define_mode_iterator SVE_2 [VNx2QI VNx2HI VNx2HF VNx2SI VNx2SF VNx2DI VNx2DF])
+
+;; SVE modes with 4 elements.
+(define_mode_iterator SVE_4 [VNx4QI VNx4HI VNx4HF VNx4SI VNx4SF])
+
;; Modes involved in extending or truncating SVE data, for 8 elements per
;; 128-bit block.
(define_mode_iterator VNx8_NARROW [VNx8QI])
(VNx4SI "vnx4sf") (VNx4SF "vnx4sf")
(VNx2DI "vnx2df") (VNx2DF "vnx2df")])
+;; Maps full and partial vector modes of any element type to a full-vector
+;; integer mode with the same number of units.
+(define_mode_attr V_INT_CONTAINER [(VNx16QI "VNx16QI") (VNx8QI "VNx8HI")
+ (VNx4QI "VNx4SI") (VNx2QI "VNx2DI")
+ (VNx8HI "VNx8HI") (VNx4HI "VNx4SI")
+ (VNx2HI "VNx2DI")
+ (VNx4SI "VNx4SI") (VNx2SI "VNx2DI")
+ (VNx2DI "VNx2DI")
+ (VNx8HF "VNx8HI") (VNx4HF "VNx4SI")
+ (VNx2HF "VNx2DI")
+ (VNx4SF "VNx4SI") (VNx2SF "VNx2DI")
+ (VNx2DF "VNx2DI")])
+
+;; Lower-case version of V_INT_CONTAINER.
+(define_mode_attr v_int_container [(VNx16QI "vnx16qi") (VNx8QI "vnx8hi")
+ (VNx4QI "vnx4si") (VNx2QI "vnx2di")
+ (VNx8HI "vnx8hi") (VNx4HI "vnx4si")
+ (VNx2HI "vnx2di")
+ (VNx4SI "vnx4si") (VNx2SI "vnx2di")
+ (VNx2DI "vnx2di")
+ (VNx8HF "vnx8hi") (VNx4HF "vnx4si")
+ (VNx2HF "vnx2di")
+ (VNx4SF "vnx4si") (VNx2SF "vnx2di")
+ (VNx2DF "vnx2di")])
+
;; Mode for vector conditional operations where the comparison has
;; different type from the lhs.
(define_mode_attr V_cmp_mixed [(V2SI "V2SF") (V4SI "V4SF")
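
For a 64-bit-container example (again illustrative rather than part of the patch), the pointer-based loop below mirrors gather_load_5.c and shows why V_INT_CONTAINER maps VNx2HI to VNx2DI: the int16_t elements are addressed through 64-bit pointers, so the offsets occupy VNx2DI and each loaded 16-bit element sits in a 64-bit container, giving a gather of the form ld1h z0.d, p0/z, [z1.d] (register numbers arbitrary).

#include <stdint.h>

/* Hypothetical example mirroring gather_load_5.c: 16-bit data loaded
   through a vector of 64-bit pointers (VNx2HI data, VNx2DI offsets).  */
void
f_int16 (int16_t *restrict dest, int16_t *restrict *src, int n)
{
  for (int i = 0; i < n; ++i)
    dest[i] += *src[i];
}
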
+2019-11-16 Richard Sandiford <richard.sandiford@arm.com>
+
+ * gcc.target/aarch64/sve/gather_load_1.c (TEST_LOOP): Start at 0.
+ (TEST_ALL): Add tests for 8-bit and 16-bit elements.
+ * gcc.target/aarch64/sve/gather_load_2.c: Update accordingly.
+ * gcc.target/aarch64/sve/gather_load_3.c (TEST_LOOP): Start at 0.
+ (TEST_ALL): Add tests for 8-bit and 16-bit elements.
+ * gcc.target/aarch64/sve/gather_load_4.c: Update accordingly.
+ * gcc.target/aarch64/sve/gather_load_5.c (TEST_LOOP): Start at 0.
+ (TEST_ALL): Add tests for 8-bit, 16-bit and 32-bit elements.
+ * gcc.target/aarch64/sve/gather_load_6.c: Add
+ --param aarch64-sve-compare-costs=0.
+ (TEST_LOOP): Start at 0.
+ * gcc.target/aarch64/sve/gather_load_7.c: Add
+ --param aarch64-sve-compare-costs=0.
+ * gcc.target/aarch64/sve/gather_load_8.c: New test.
+ * gcc.target/aarch64/sve/gather_load_9.c: Likewise.
+ * gcc.target/aarch64/sve/mask_gather_load_6.c: Add
+ --param aarch64-sve-compare-costs=0.
+
2019-11-16 Richard Sandiford <richard.sandiford@arm.com>
* gcc.target/aarch64/sve/mask_struct_load_1.c: Add
#define INDEX64 int64_t
#endif
-/* Invoked 18 times for each data size. */
#define TEST_LOOP(DATA_TYPE, BITS) \
void __attribute__ ((noinline, noclone)) \
f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
INDEX##BITS *indices, int n) \
{ \
- for (int i = 9; i < n; ++i) \
+ for (int i = 0; i < n; ++i) \
dest[i] += src[indices[i]]; \
}
#define TEST_ALL(T) \
+ T (int8_t, 32) \
+ T (uint8_t, 32) \
+ T (int16_t, 32) \
+ T (uint16_t, 32) \
T (int32_t, 32) \
T (uint32_t, 32) \
T (float, 32) \
TEST_ALL (TEST_LOOP)
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 1\]\n} 2 } } */
/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 3 } } */
/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 3 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 3 } } */
#include "gather_load_1.c"
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw 1\]\n} 2 } } */
/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 3 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 3 } } */
#define INDEX64 int64_t
#endif
-/* Invoked 18 times for each data size. */
#define TEST_LOOP(DATA_TYPE, BITS) \
void __attribute__ ((noinline, noclone)) \
f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
INDEX##BITS *indices, int n) \
{ \
- for (int i = 9; i < n; ++i) \
+ for (int i = 0; i < n; ++i) \
dest[i] += *(DATA_TYPE *) ((char *) src + indices[i]); \
}
#define TEST_ALL(T) \
+ T (int8_t, 32) \
+ T (uint8_t, 32) \
+ T (int16_t, 32) \
+ T (uint16_t, 32) \
T (int32_t, 32) \
T (uint32_t, 32) \
T (float, 32) \
TEST_ALL (TEST_LOOP)
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 2 } } */
/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 3 } } */
/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d\]\n} 3 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 3 } } */
#include "gather_load_3.c"
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 2 } } */
/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 3 } } */
/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d\]\n} 3 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 3 } } */
#include <stdint.h>
-/* Invoked 18 times for each data size. */
#define TEST_LOOP(DATA_TYPE) \
void __attribute__ ((noinline, noclone)) \
f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict *src, \
int n) \
{ \
- for (int i = 9; i < n; ++i) \
+ for (int i = 0; i < n; ++i) \
dest[i] += *src[i]; \
}
#define TEST_ALL(T) \
+ T (int8_t) \
+ T (uint8_t) \
+ T (int16_t) \
+ T (uint16_t) \
+ T (int32_t) \
+ T (uint32_t) \
T (int64_t) \
T (uint64_t) \
T (double)
TEST_ALL (TEST_LOOP)
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.d, p[0-7]/z, \[z[0-9]+.d\]\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d, p[0-7]/z, \[z[0-9]+.d\]\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d, p[0-7]/z, \[z[0-9]+.d\]\n} 2 } } */
/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[z[0-9]+.d\]\n} 3 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.d,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.d,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 3 } } */
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
-/* { dg-options "-O2 -ftree-vectorize -fwrapv --save-temps" } */
+/* { dg-options "-O2 -ftree-vectorize -fwrapv --save-temps --param aarch64-sve-compare-costs=0" } */
#include <stdint.h>
#define INDEX32 int32_t
#endif
-/* Invoked 18 times for each data size. */
#define TEST_LOOP(DATA_TYPE, BITS) \
void __attribute__ ((noinline, noclone)) \
f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
INDEX##BITS *indices, INDEX##BITS mask, int n) \
{ \
- for (int i = 9; i < n; ++i) \
+ for (int i = 0; i < n; ++i) \
dest[i] = src[(INDEX##BITS) (indices[i] | mask)]; \
}
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
-/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+/* { dg-options "-O2 -ftree-vectorize --save-temps --param aarch64-sve-compare-costs=0" } */
#define INDEX16 uint16_t
#define INDEX32 uint32_t
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize -fwrapv --save-temps" } */
+
+#include <stdint.h>
+
+#ifndef INDEX32
+#define INDEX16 int16_t
+#define INDEX32 int32_t
+#endif
+
+#define TEST_LOOP(DATA_TYPE, BITS) \
+ void __attribute__ ((noinline, noclone)) \
+ f_##DATA_TYPE (DATA_TYPE *restrict dest, DATA_TYPE *restrict src, \
+ INDEX##BITS *indices, INDEX##BITS mask, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ dest[i] = src[(INDEX##BITS) (indices[i] + mask)]; \
+ }
+
+#define TEST_ALL(T) \
+ T (int8_t, 16) \
+ T (uint8_t, 16) \
+ T (int16_t, 16) \
+ T (uint16_t, 16) \
+ T (_Float16, 16) \
+ T (int32_t, 16) \
+ T (uint32_t, 16) \
+ T (float, 16) \
+ T (int64_t, 32) \
+ T (uint64_t, 32) \
+ T (double, 32)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 1\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, sxtw 3\]\n} 3 } } */
+
+/* { dg-final { scan-assembler-times {\tsxt.\tz} 8 } } */
+/* { dg-final { scan-assembler-times {\tsxth\tz[0-9]+\.s,} 8 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 3 } } */
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize -fwrapv --save-temps" } */
+
+#define INDEX16 uint16_t
+#define INDEX32 uint32_t
+
+#include "gather_load_8.c"
+
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw 1\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, uxtw 3\]\n} 3 } } */
+
+/* { dg-final { scan-assembler-times {\tuxt.\tz} 8 } } */
+/* { dg-final { scan-assembler-times {\tuxth\tz[0-9]+\.s,} 8 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 3 } } */
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
-/* { dg-options "-O2 -ftree-vectorize -ffast-math --save-temps" } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math --save-temps --param aarch64-sve-compare-costs=0" } */
#include <stdint.h>