From 46c705e70e078f6a1920d92e49042125d5e18495 Mon Sep 17 00:00:00 2001
From: Richard Sandiford <richard.sandiford@arm.com>
Date: Wed, 11 Nov 2020 11:42:46 +0000
Subject: [PATCH] aarch64: Support SVE comparisons for unpacked integers

This patch adds support for comparing unpacked SVE integer vectors,
such as byte elements stored in the bottom bytes of halfword
containers.  It also adds support for selects between unpacked
SVE vectors (both integer and floating-point), since selects and
compares are closely tied via the vcond optab interface.

gcc/
	* config/aarch64/aarch64-sve.md (@vcond_mask_<mode><vpred>): Extend
	from SVE_FULL to SVE_ALL.
	(*vcond_mask_<mode><vpred>): Likewise.
	(@aarch64_sel_dup<mode>): Likewise.
	(vcond<SVE_FULL:mode><v_int_equiv>): Extend to...
	(vcond<SVE_ALL:mode><SVE_I:mode>): ...this, but requiring the
	sizes of the container modes to match.
	(vcondu<SVE_FULL:mode><v_int_equiv>): Extend to...
	(vcondu<SVE_ALL:mode><SVE_I:mode>): ...this.
	(vec_cmp<SVE_FULL_I:mode><vpred>): Extend to...
	(vec_cmp<SVE_I:mode><vpred>): ...this.
	(vec_cmpu<SVE_FULL_I:mode><vpred>): Extend to...
	(vec_cmpu<SVE_I:mode><vpred>): ...this.
	(@aarch64_pred_cmp<cmp_op><SVE_FULL_I:mode>): Extend to...
	(@aarch64_pred_cmp<cmp_op><SVE_I:mode>): ...this.
	(*cmp<cmp_op><SVE_FULL_I:mode>_cc): Extend to...
	(*cmp<cmp_op><SVE_I:mode>_cc): ...this.
	(*cmp<cmp_op><SVE_FULL_I:mode>_ptest): Extend to...
	(*cmp<cmp_op><SVE_I:mode>_ptest): ...this.
	(*cmp<cmp_op><SVE_FULL_I:mode>_and): Extend to...
	(*cmp<cmp_op><SVE_I:mode>_and): ...this.

gcc/testsuite/
	* gcc.target/aarch64/sve/cmp_1.c: New test.
	* gcc.target/aarch64/sve/cmp_2.c: Likewise.
	* gcc.target/aarch64/sve/cond_arith_1.c: Add --param
	aarch64-sve-compare-costs=0.
	* gcc.target/aarch64/sve/cond_arith_1_run.c: Likewise.
	* gcc.target/aarch64/sve/cond_arith_3.c: Likewise.
	* gcc.target/aarch64/sve/cond_arith_3_run.c: Likewise.
	* gcc.target/aarch64/sve/mask_gather_load_7.c: Likewise.
	* gcc.target/aarch64/sve/mask_load_slp_1.c: Likewise.
	* gcc.target/aarch64/sve/vcond_11.c: Likewise.
	* gcc.target/aarch64/sve/vcond_11_run.c: Likewise.
---
 gcc/config/aarch64/aarch64-sve.md             | 121 ++++++++++++------
 gcc/testsuite/gcc.target/aarch64/sve/cmp_1.c  |  57 +++++++++
 gcc/testsuite/gcc.target/aarch64/sve/cmp_2.c  |  72 +++++++++++
 .../gcc.target/aarch64/sve/cond_arith_1.c     |   2 +-
 .../gcc.target/aarch64/sve/cond_arith_1_run.c |   2 +-
 .../gcc.target/aarch64/sve/cond_arith_3.c     |   2 +-
 .../gcc.target/aarch64/sve/cond_arith_3_run.c |   2 +-
 .../aarch64/sve/mask_gather_load_7.c          |   2 +-
 .../gcc.target/aarch64/sve/mask_load_slp_1.c  |   2 +-
 .../gcc.target/aarch64/sve/vcond_11.c         |   2 +-
 .../gcc.target/aarch64/sve/vcond_11_run.c     |   2 +-
 11 files changed, 216 insertions(+), 50 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/cmp_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/cmp_2.c
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 4b0a1ebe9e1..455b025521f 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -7379,11 +7379,11 @@
 ;; UNSPEC_SEL operand order: mask, true, false (as for VEC_COND_EXPR)
 ;; SEL operand order:        mask, true, false
 (define_expand "@vcond_mask_<mode><vpred>"
-  [(set (match_operand:SVE_FULL 0 "register_operand")
-	(unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "register_operand")
+	(unspec:SVE_ALL
 	  [(match_operand:<VPRED> 3 "register_operand")
-	   (match_operand:SVE_FULL 1 "aarch64_sve_reg_or_dup_imm")
-	   (match_operand:SVE_FULL 2 "aarch64_simd_reg_or_zero")]
+	   (match_operand:SVE_ALL 1 "aarch64_sve_reg_or_dup_imm")
+	   (match_operand:SVE_ALL 2 "aarch64_simd_reg_or_zero")]
 	  UNSPEC_SEL))]
   "TARGET_SVE"
   {
@@ -7396,12 +7396,25 @@
 ;; - two registers
 ;; - a duplicated immediate and a register
 ;; - a duplicated immediate and zero
+;;
+;; For unpacked vectors, it doesn't really matter whether SEL uses the
+;; container size or the element size.  If SEL used the container size,
+;; it would ignore undefined bits of the predicate but would copy the
+;; upper (undefined) bits of each container along with the defined bits.
+;; If SEL used the element size, it would use undefined bits of the predicate
+;; to select between undefined elements in each input vector.  Thus the only
+;; difference is whether the undefined bits in a container always come from
+;; the same input as the defined bits, or whether the choice can vary
+;; independently of the defined bits.
+;;
+;; For the other instructions, using the element size is more natural,
+;; so we do that for SEL as well.
 (define_insn "*vcond_mask_<mode><vpred>"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w, w, w, w, ?w, ?&w, ?&w")
-	(unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w, w, w, w, ?w, ?&w, ?&w")
+	(unspec:SVE_ALL
 	  [(match_operand:<VPRED> 3 "register_operand" "Upa, Upa, Upa, Upa, Upl, Upl, Upl")
-	   (match_operand:SVE_FULL 1 "aarch64_sve_reg_or_dup_imm" "w, vss, vss, Ufc, Ufc, vss, Ufc")
-	   (match_operand:SVE_FULL 2 "aarch64_simd_reg_or_zero" "w, 0, Dz, 0, Dz, w, w")]
+	   (match_operand:SVE_ALL 1 "aarch64_sve_reg_or_dup_imm" "w, vss, vss, Ufc, Ufc, vss, Ufc")
+	   (match_operand:SVE_ALL 2 "aarch64_simd_reg_or_zero" "w, 0, Dz, 0, Dz, w, w")]
 	  UNSPEC_SEL))]
   "TARGET_SVE
    && (!register_operand (operands[1], <MODE>mode)
@@ -7422,12 +7435,12 @@
 ;; of GPRs as being more expensive than duplicates of FPRs, since they
 ;; involve a cross-file move.
 (define_insn "@aarch64_sel_dup<mode>"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=?w, w, ??w, ?&w, ??&w, ?&w")
-	(unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=?w, w, ??w, ?&w, ??&w, ?&w")
+	(unspec:SVE_ALL
 	  [(match_operand:<VPRED> 3 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl")
-	   (vec_duplicate:SVE_FULL
+	   (vec_duplicate:SVE_ALL
 	     (match_operand:<VEL> 1 "register_operand" "r, w, r, w, r, w"))
-	   (match_operand:SVE_FULL 2 "aarch64_simd_reg_or_zero" "0, 0, Dz, Dz, w, w")]
+	   (match_operand:SVE_ALL 2 "aarch64_simd_reg_or_zero" "0, 0, Dz, Dz, w, w")]
 	  UNSPEC_SEL))]
   "TARGET_SVE"
   "@
@@ -7448,34 +7461,34 @@
 
 ;; Integer (signed) vcond.  Don't enforce an immediate range here, since it
 ;; depends on the comparison; leave it to aarch64_expand_sve_vcond instead.
-(define_expand "vcond<mode><v_int_equiv>"
-  [(set (match_operand:SVE_FULL 0 "register_operand")
-	(if_then_else:SVE_FULL
+(define_expand "vcond<SVE_ALL:mode><SVE_I:mode>"
+  [(set (match_operand:SVE_ALL 0 "register_operand")
+	(if_then_else:SVE_ALL
 	  (match_operator 3 "comparison_operator"
-	    [(match_operand:<V_INT_EQUIV> 4 "register_operand")
-	     (match_operand:<V_INT_EQUIV> 5 "nonmemory_operand")])
-	  (match_operand:SVE_FULL 1 "nonmemory_operand")
-	  (match_operand:SVE_FULL 2 "nonmemory_operand")))]
-  "TARGET_SVE"
+	    [(match_operand:SVE_I 4 "register_operand")
+	     (match_operand:SVE_I 5 "nonmemory_operand")])
+	  (match_operand:SVE_ALL 1 "nonmemory_operand")
+	  (match_operand:SVE_ALL 2 "nonmemory_operand")))]
+  "TARGET_SVE && <SVE_ALL:container_bits> == <SVE_I:container_bits>"
   {
-    aarch64_expand_sve_vcond (<MODE>mode, <V_INT_EQUIV>mode, operands);
+    aarch64_expand_sve_vcond (<SVE_ALL:MODE>mode, <SVE_I:MODE>mode, operands);
     DONE;
   }
 )
 
 ;; Integer vcondu.  Don't enforce an immediate range here, since it
 ;; depends on the comparison; leave it to aarch64_expand_sve_vcond instead.
-(define_expand "vcondu<mode><v_int_equiv>"
-  [(set (match_operand:SVE_FULL 0 "register_operand")
-	(if_then_else:SVE_FULL
+(define_expand "vcondu<SVE_ALL:mode><SVE_I:mode>"
+  [(set (match_operand:SVE_ALL 0 "register_operand")
+	(if_then_else:SVE_ALL
 	  (match_operator 3 "comparison_operator"
-	    [(match_operand:<V_INT_EQUIV> 4 "register_operand")
-	     (match_operand:<V_INT_EQUIV> 5 "nonmemory_operand")])
-	  (match_operand:SVE_FULL 1 "nonmemory_operand")
-	  (match_operand:SVE_FULL 2 "nonmemory_operand")))]
-  "TARGET_SVE"
+	    [(match_operand:SVE_I 4 "register_operand")
+	     (match_operand:SVE_I 5 "nonmemory_operand")])
+	  (match_operand:SVE_ALL 1 "nonmemory_operand")
+	  (match_operand:SVE_ALL 2 "nonmemory_operand")))]
+  "TARGET_SVE && <SVE_ALL:container_bits> == <SVE_I:container_bits>"
   {
-    aarch64_expand_sve_vcond (<MODE>mode, <V_INT_EQUIV>mode, operands);
+    aarch64_expand_sve_vcond (<SVE_ALL:MODE>mode, <SVE_I:MODE>mode, operands);
     DONE;
   }
 )
@@ -7520,8 +7533,8 @@
   [(parallel
     [(set (match_operand:<VPRED> 0 "register_operand")
 	  (match_operator:<VPRED> 1 "comparison_operator"
-	    [(match_operand:SVE_FULL_I 2 "register_operand")
-	     (match_operand:SVE_FULL_I 3 "nonmemory_operand")]))
+	    [(match_operand:SVE_I 2 "register_operand")
+	     (match_operand:SVE_I 3 "nonmemory_operand")]))
      (clobber (reg:CC_NZC CC_REGNUM))])]
   "TARGET_SVE"
   {
@@ -7538,8 +7551,8 @@
   [(parallel
     [(set (match_operand:<VPRED> 0 "register_operand")
 	  (match_operator:<VPRED> 1 "comparison_operator"
-	    [(match_operand:SVE_FULL_I 2 "register_operand")
-	     (match_operand:SVE_FULL_I 3 "nonmemory_operand")]))
+	    [(match_operand:SVE_I 2 "register_operand")
+	     (match_operand:SVE_I 3 "nonmemory_operand")]))
      (clobber (reg:CC_NZC CC_REGNUM))])]
   "TARGET_SVE"
   {
@@ -7550,14 +7563,38 @@
 )
 
 ;; Predicated integer comparisons.
+;;
+;; For unpacked vectors, only the lowpart element in each input container
+;; has a defined value, and only the predicate bits associated with
+;; those elements are defined.  For example, when comparing two VNx2SIs:
+;;
+;; - The VNx2SIs can be seen as VNx2DIs in which the low halves of each
+;;   DI container store an SI element.  The upper bits of each DI container
+;;   are undefined.
+;;
+;; - Alternatively, the VNx2SIs can be seen as VNx4SIs in which the
+;;   even elements are defined and the odd elements are undefined.
+;;
+;; - The associated predicate mode is VNx2BI.  This means that only the
+;;   low bit in each predicate byte is defined (on input and on output).
+;;
+;; - We use a .s comparison to compare VNx2SIs, under the control of a
+;;   VNx2BI governing predicate, to produce a VNx2BI result.  If we view
+;;   the .s operation as operating on VNx4SIs then for odd lanes:
+;;
+;;   - the input governing predicate bit is undefined
+;;   - the SI elements being compared are undefined
+;;   - the predicate result bit is therefore undefined, but
+;;   - the predicate result bit is in the undefined part of a VNx2BI,
+;;     so its value doesn't matter anyway.
 (define_insn "@aarch64_pred_cmp<cmp_op><mode>"
   [(set (match_operand:<VPRED> 0 "register_operand" "=Upa, Upa")
 	(unspec:<VPRED>
 	  [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
 	   (match_operand:SI 2 "aarch64_sve_ptrue_flag")
 	   (SVE_INT_CMP:<VPRED>
-	     (match_operand:SVE_FULL_I 3 "register_operand" "w, w")
-	     (match_operand:SVE_FULL_I 4 "aarch64_sve_cmp_<sve_imm_con>_operand" "<sve_imm_con>, w"))]
+	     (match_operand:SVE_I 3 "register_operand" "w, w")
+	     (match_operand:SVE_I 4 "aarch64_sve_cmp_<sve_imm_con>_operand" "<sve_imm_con>, w"))]
 	  UNSPEC_PRED_Z))
    (clobber (reg:CC_NZC CC_REGNUM))]
   "TARGET_SVE"
@@ -7578,8 +7615,8 @@
 	     [(match_operand 6)
 	      (match_operand:SI 7 "aarch64_sve_ptrue_flag")
 	      (SVE_INT_CMP:<VPRED>
-		(match_operand:SVE_FULL_I 2 "register_operand" "w, w")
-		(match_operand:SVE_FULL_I 3 "aarch64_sve_cmp_<sve_imm_con>_operand" "<sve_imm_con>, w"))]
+		(match_operand:SVE_I 2 "register_operand" "w, w")
+		(match_operand:SVE_I 3 "aarch64_sve_cmp_<sve_imm_con>_operand" "<sve_imm_con>, w"))]
 	     UNSPEC_PRED_Z)]
 	  UNSPEC_PTEST))
    (set (match_operand:<VPRED> 0 "register_operand" "=Upa, Upa")
@@ -7614,8 +7651,8 @@
 	     [(match_operand 6)
 	      (match_operand:SI 7 "aarch64_sve_ptrue_flag")
 	      (SVE_INT_CMP:<VPRED>
-		(match_operand:SVE_FULL_I 2 "register_operand" "w, w")
-		(match_operand:SVE_FULL_I 3 "aarch64_sve_cmp_<sve_imm_con>_operand" "<sve_imm_con>, w"))]
+		(match_operand:SVE_I 2 "register_operand" "w, w")
+		(match_operand:SVE_I 3 "aarch64_sve_cmp_<sve_imm_con>_operand" "<sve_imm_con>, w"))]
 	     UNSPEC_PRED_Z)]
 	  UNSPEC_PTEST))
    (clobber (match_scratch:<VPRED> 0 "=Upa, Upa"))]
@@ -7642,8 +7679,8 @@
 	    [(match_operand 4)
 	     (const_int SVE_KNOWN_PTRUE)
 	     (SVE_INT_CMP:<VPRED>
-	       (match_operand:SVE_FULL_I 2 "register_operand" "w, w")
-	       (match_operand:SVE_FULL_I 3 "aarch64_sve_cmp_<sve_imm_con>_operand" "<sve_imm_con>, w"))]
+	       (match_operand:SVE_I 2 "register_operand" "w, w")
+	       (match_operand:SVE_I 3 "aarch64_sve_cmp_<sve_imm_con>_operand" "<sve_imm_con>, w"))]
 	    UNSPEC_PRED_Z)
 	  (match_operand:<VPRED> 1 "register_operand" "Upl, Upl")))
    (clobber (reg:CC_NZC CC_REGNUM))]
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cmp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cmp_1.c
new file mode 100644
index 00000000000..7cf66c571c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cmp_1.c
@@ -0,0 +1,57 @@
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define TEST_PAIR(TYPE1, TYPE2)				\
+  void							\
+  f_##TYPE1##_##TYPE2 (TYPE1 *restrict x,		\
+		       TYPE2 *restrict g, int n)	\
+  {							\
+    for (int i = 0; i < n; ++i)				\
+      if (g[i] < 4)					\
+	x[i] += 1;					\
+  }
+
+#define TEST_SINGLE(TYPE)			\
+  TEST_PAIR (TYPE, int8_t)			\
+  TEST_PAIR (TYPE, uint8_t)			\
+  TEST_PAIR (TYPE, int16_t)			\
+  TEST_PAIR (TYPE, uint16_t)			\
+  TEST_PAIR (TYPE, int32_t)			\
+  TEST_PAIR (TYPE, uint32_t)			\
+  TEST_PAIR (TYPE, int64_t)			\
+  TEST_PAIR (TYPE, uint64_t)
+
+TEST_SINGLE (int8_t)
+TEST_SINGLE (uint8_t)
+TEST_SINGLE (int16_t)
+TEST_SINGLE (uint16_t)
+TEST_SINGLE (int32_t)
+TEST_SINGLE (uint32_t)
+TEST_SINGLE (int64_t)
+TEST_SINGLE (uint64_t)
+
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.b,} 8 } } */
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.h,} 8 } } */
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.s,} 8 } } */
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.d,} 8 } } */
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h,} 16 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s,} 8 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d,} 8 } } */
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s,} 24 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d,} 8 } } */
+
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d,} 32 } } */
+
+/* { dg-final { scan-assembler-times {\tcmpl[et]\tp[0-9]+\.b,} 8 } } */
+/* { dg-final { scan-assembler-times {\tcmpl[so]\tp[0-9]+\.b,} 8 } } */
+/* { dg-final { scan-assembler-times {\tcmpl[et]\tp[0-9]+\.h,} 8 } } */
+/* { dg-final { scan-assembler-times {\tcmpl[so]\tp[0-9]+\.h,} 8 } } */
+/* { dg-final { scan-assembler-times {\tcmpl[et]\tp[0-9]+\.s,} 8 } } */
+/* { dg-final { scan-assembler-times {\tcmpl[so]\tp[0-9]+\.s,} 8 } } */
+/* { dg-final { scan-assembler-times {\tcmpl[et]\tp[0-9]+\.d,} 8 } } */
+/* { dg-final { scan-assembler-times {\tcmpl[so]\tp[0-9]+\.d,} 8 } } */
+
+/* { dg-final { scan-assembler-not {\tpunpk} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cmp_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cmp_2.c
new file mode 100644
index 00000000000..b22120695e1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cmp_2.c
@@ -0,0 +1,72 @@
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define TEST_PAIR(TYPE1, TYPE2)					\
+  void								\
+  f_##TYPE1##_##TYPE2 (TYPE1 *restrict x, TYPE1 y, TYPE1 z,	\
+		       TYPE2 *restrict g, TYPE2 h, int n)	\
+  {								\
+    for (int i = 0; i < n; ++i)					\
+      x[i] = g[i] < h ? y : z;					\
+  }
+
+#define TEST_SINGLE(TYPE)			\
+  TEST_PAIR (TYPE, int8_t)			\
+  TEST_PAIR (TYPE, uint8_t)			\
+  TEST_PAIR (TYPE, int16_t)			\
+  TEST_PAIR (TYPE, uint16_t)			\
+  TEST_PAIR (TYPE, int32_t)			\
+  TEST_PAIR (TYPE, uint32_t)			\
+  TEST_PAIR (TYPE, int64_t)			\
+  TEST_PAIR (TYPE, uint64_t)
+
+TEST_SINGLE (int8_t)
+TEST_SINGLE (uint8_t)
+TEST_SINGLE (int16_t)
+TEST_SINGLE (uint16_t)
+TEST_SINGLE (int32_t)
+TEST_SINGLE (uint32_t)
+TEST_SINGLE (float)
+TEST_SINGLE (int64_t)
+TEST_SINGLE (uint64_t)
+TEST_SINGLE (double)
+
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.b,} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.h,} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.s,} 6 } } */
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.d,} 6 } } */
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h,} 8 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s,} 6 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d,} 6 } } */
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s,} 14 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d,} 6 } } */
+
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d,} 20 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b,} 4 } } */
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.h,} 4 } } */
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.s,} 4 } } */
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.d,} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 8 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 4 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.d,} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 18 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d,} 6 } } */
+
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 24 } } */
+
+/* { dg-final { scan-assembler-times {\tcmp(?:h[is]|l[os])\tp[0-9]+\.b,} 10 } } */
+/* { dg-final { scan-assembler-times {\tcmp[lg][et]\tp[0-9]+\.b,} 10 } } */
+/* { dg-final { scan-assembler-times {\tcmp(?:h[is]|l[os])\tp[0-9]+\.h,} 10 } } */
+/* { dg-final { scan-assembler-times {\tcmp[lg][et]\tp[0-9]+\.h,} 10 } } */
+/* { dg-final { scan-assembler-times {\tcmp(?:h[is]|l[os])\tp[0-9]+\.s,} 10 } } */
+/* { dg-final { scan-assembler-times {\tcmp[lg][et]\tp[0-9]+\.s,} 10 } } */
+/* { dg-final { scan-assembler-times {\tcmp(?:h[is]|l[os])\tp[0-9]+\.d,} 10 } } */
+/* { dg-final { scan-assembler-times {\tcmp[lg][et]\tp[0-9]+\.d,} 10 } } */
+
+/* { dg-final { scan-assembler-not {\tpunpk} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_1.c
index 52138d2b023..d831e9c1142 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_1.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-options "-O2 -ftree-vectorize --param aarch64-sve-compare-costs=0" } */
 
 #include <stdint.h>
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_1_run.c
index 876f98f6ec2..5808e0a9663 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_1_run.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_1_run.c
@@ -1,5 +1,5 @@
 /* { dg-do run { target aarch64_sve_hw } } */
-/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-options "-O2 -ftree-vectorize --param aarch64-sve-compare-costs=0" } */
 
 #include "cond_arith_1.c"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_3.c
index 94eb255c969..068e0b64793 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_3.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_3.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-options "-O2 -ftree-vectorize --param aarch64-sve-compare-costs=0" } */
 
 #include <stdint.h>
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_3_run.c
index 31457da523b..d2580046dab 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_3_run.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_3_run.c
@@ -1,5 +1,5 @@
 /* { dg-do run { target aarch64_sve_hw } } */
-/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-options "-O2 -ftree-vectorize --param aarch64-sve-compare-costs=0" } */
 
 #include "cond_arith_3.c"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_gather_load_7.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_gather_load_7.c
index cd2661ef3a5..687716e7176 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/mask_gather_load_7.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_gather_load_7.c
@@ -1,5 +1,5 @@
 /* { dg-do assemble { target aarch64_asm_sve_ok } } */
-/* { dg-options "-O2 -ftree-vectorize -ffast-math --save-temps" } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math --save-temps --param aarch64-sve-compare-costs=0" } */
 
 #include <stdint.h>
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c
index 78c70b2be32..a38b92dc53e 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-options "-O2 -ftree-vectorize --param aarch64-sve-compare-costs=0" } */
 
 #include <stdint.h>
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_11.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_11.c
index 3c9e340475a..4efcf3ab32a 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/vcond_11.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_11.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve" } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --param aarch64-sve-compare-costs=0" } */
 
 #include <stdint.h>
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_11_run.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_11_run.c
index 9a4edb81448..4cbe4a6f867 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/vcond_11_run.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_11_run.c
@@ -1,5 +1,5 @@
 /* { dg-do run { target aarch64_sve_hw } } */
-/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve" } */
+/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve --param aarch64-sve-compare-costs=0" } */
 
 #include "vcond_11.c"
 
-- 
2.30.2