+2014-09-05 Alan Lawrence <alan.lawrence@arm.com>
+
+ * config/aarch64/aarch64-builtins.c (aarch64_types_cmtst_qualifiers,
+ TYPES_TST): Define.
+ (aarch64_fold_builtin): Update pattern for cmtst.
+
+ * config/aarch64/aarch64-protos.h (aarch64_const_vec_all_same_int_p):
+ Declare.
+
+ * config/aarch64/aarch64-simd-builtins.def (cmtst): Update qualifiers.
+
+ * config/aarch64/aarch64-simd.md (aarch64_vcond_internal<mode><mode>):
+ Switch operands, separate out more cases, refactor.
+
+ (aarch64_cmtst<mode>): Rewrite pattern to match (plus ... -1).
+
+ * config/aarch64/aarch64.c (aarch64_const_vec_all_same_int_p): Take single
+ argument; rename old version to...
+ (aarch64_const_vec_all_same_in_range_p): ...this.
+ (aarch64_print_operand, aarch64_simd_shift_imm_p): Follow renaming.
+
+ * config/aarch64/predicates.md (aarch64_simd_imm_minus_one): Define.
+
2014-09-05 Alan Lawrence <alan.lawrence@arm.com>

* config/aarch64/aarch64-builtins.c (enum aarch64_type_qualifiers):
= { qualifier_none, qualifier_none, qualifier_maybe_immediate };
#define TYPES_BINOP (aarch64_types_binop_qualifiers)
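+/* cmtst is a binary operation as far as the user is concerned, but its
+ rewritten insn pattern carries two extra operands (the #0 and #-1
+ immediates), marked qualifier_internal here.  */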
static enum aarch64_type_qualifiers
+aarch64_types_cmtst_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_none, qualifier_none, qualifier_none,
+ qualifier_internal, qualifier_internal };
+#define TYPES_TST (aarch64_types_cmtst_qualifiers)
+static enum aarch64_type_qualifiers
aarch64_types_binopv_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_void, qualifier_none, qualifier_none };
#define TYPES_BINOPV (aarch64_types_binopv_qualifiers)
BUILTIN_VALLDI (BINOP, cmeq, 0)
return fold_build2 (EQ_EXPR, type, args[0], args[1]);
break;
- BUILTIN_VSDQ_I_DI (BINOP, cmtst, 0)
+ BUILTIN_VSDQ_I_DI (TST, cmtst, 0)
{
tree and_node = fold_build2 (BIT_AND_EXPR, type, args[0], args[1]);
tree vec_zero_node = build_zero_cst (type);
enum reg_class);
enum aarch64_symbol_type
aarch64_classify_symbolic_expression (rtx, enum aarch64_symbol_context);
+bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
bool aarch64_constant_address_p (rtx);
bool aarch64_expand_movmem (rtx *);
bool aarch64_float_const_zero_rtx_p (rtx);
/* Implemented by aarch64_cm<cmp><mode>. */
BUILTIN_VSDQ_I_DI (BINOP, cmgeu, 0)
BUILTIN_VSDQ_I_DI (BINOP, cmgtu, 0)
- BUILTIN_VSDQ_I_DI (BINOP, cmtst, 0)
+ BUILTIN_VSDQ_I_DI (TST, cmtst, 0)
/* Implemented by reduc_<sur>plus_<mode>. */
BUILTIN_VALL (UNOP, reduc_splus_, 10)
(match_operand:VDQ 2 "nonmemory_operand")))]
"TARGET_SIMD"
{
- int inverse = 0, has_zero_imm_form = 0;
rtx op1 = operands[1];
rtx op2 = operands[2];
rtx mask = gen_reg_rtx (<MODE>mode);
+ enum rtx_code code = GET_CODE (operands[3]);
+
+ /* Switching OP1 and OP2 is necessary for NE (to output a cmeq insn),
+ and desirable for other comparisons if it results in FOO ? -1 : 0
+ (this allows direct use of the comparison result without a bsl). */
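+ /* For example, a < b ? 0 : -1 becomes a >= b ? -1 : 0 after the swap,
+ so the comparison mask is exactly the required result.  */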
+ if (code == NE
+ || (code != EQ
+ && op1 == CONST0_RTX (<V_cmp_result>mode)
+ && op2 == CONSTM1_RTX (<V_cmp_result>mode)))
+ {
+ op1 = operands[2];
+ op2 = operands[1];
+ switch (code)
+ {
+ case LE: code = GT; break;
+ case LT: code = GE; break;
+ case GE: code = LT; break;
+ case GT: code = LE; break;
+ /* No case EQ. */
+ case NE: code = EQ; break;
+ case LTU: code = GEU; break;
+ case LEU: code = GTU; break;
+ case GTU: code = LEU; break;
+ case GEU: code = LTU; break;
+ default: gcc_unreachable ();
+ }
+ }
- switch (GET_CODE (operands[3]))
+ /* Make sure we can handle the last operand. */
+ switch (code)
{
+ case NE:
+ /* Normalized to EQ above. */
+ gcc_unreachable ();
+
case LE:
case LT:
- case NE:
- inverse = 1;
- /* Fall through. */
case GE:
case GT:
case EQ:
- has_zero_imm_form = 1;
- break;
- case LEU:
- case LTU:
- inverse = 1;
- break;
+ /* These instructions have a form taking an immediate zero. */
+ if (operands[5] == CONST0_RTX (<MODE>mode))
+ break;
+ /* Fall through, as may need to load into register. */
default:
+ if (!REG_P (operands[5]))
+ operands[5] = force_reg (<MODE>mode, operands[5]);
break;
}
- if (!REG_P (operands[5])
- && (operands[5] != CONST0_RTX (<MODE>mode) || !has_zero_imm_form))
- operands[5] = force_reg (<MODE>mode, operands[5]);
-
- switch (GET_CODE (operands[3]))
+ switch (code)
{
case LT:
+ emit_insn (gen_aarch64_cmlt<mode> (mask, operands[4], operands[5]));
+ break;
+
case GE:
emit_insn (gen_aarch64_cmge<mode> (mask, operands[4], operands[5]));
break;
case LE:
+ emit_insn (gen_aarch64_cmle<mode> (mask, operands[4], operands[5]));
+ break;
+
case GT:
emit_insn (gen_aarch64_cmgt<mode> (mask, operands[4], operands[5]));
break;
case LTU:
+ emit_insn (gen_aarch64_cmgtu<mode> (mask, operands[5], operands[4]));
+ break;
+
case GEU:
emit_insn (gen_aarch64_cmgeu<mode> (mask, operands[4], operands[5]));
break;
case LEU:
+ emit_insn (gen_aarch64_cmgeu<mode> (mask, operands[5], operands[4]));
+ break;
+
case GTU:
emit_insn (gen_aarch64_cmgtu<mode> (mask, operands[4], operands[5]));
break;
- case NE:
+ /* NE has been normalized to EQ above. */
case EQ:
emit_insn (gen_aarch64_cmeq<mode> (mask, operands[4], operands[5]));
break;
gcc_unreachable ();
}
- if (inverse)
- {
- op1 = operands[2];
- op2 = operands[1];
- }
-
/* If we have (a = (b CMP c) ? -1 : 0);
Then we can simply move the generated mask. */
;; cmtst
+;; Although neg (ne (and x y) 0) is the natural way of expressing a cmtst,
+;; we don't have any insns using ne, and aarch64_vcond_internal outputs
+;; not (neg (eq (and x y) 0))
+;; which is rewritten by simplify_rtx as
+;; plus (eq (and x y) 0) -1.
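+;; All three forms agree: taking (eq (and x y) 0) as 0 or 1 per lane,
+;; a lane where (x & y) != 0 gives 0 + -1 = -1 (all ones, the cmtst
+;; result), while a lane where (x & y) == 0 gives 1 + -1 = 0.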
+
(define_insn "aarch64_cmtst<mode>"
[(set (match_operand:<V_cmp_result> 0 "register_operand" "=w")
- (neg:<V_cmp_result>
- (ne:<V_cmp_result>
+ (plus:<V_cmp_result>
+ (eq:<V_cmp_result>
(and:VDQ
(match_operand:VDQ 1 "register_operand" "w")
(match_operand:VDQ 2 "register_operand" "w"))
- (vec_duplicate:<V_cmp_result> (const_int 0)))))]
+ (match_operand:VDQ 3 "aarch64_simd_imm_zero"))
+ (match_operand:<V_cmp_result> 4 "aarch64_simd_imm_minus_one")))
+ ]
"TARGET_SIMD"
"cmtst\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
[(set_attr "type" "neon_tst<q>")]
static void aarch64_override_options_after_change (void);
static bool aarch64_vector_mode_supported_p (enum machine_mode);
static unsigned bit_count (unsigned HOST_WIDE_INT);
-static bool aarch64_const_vec_all_same_int_p (rtx,
- HOST_WIDE_INT, HOST_WIDE_INT);
-
static bool aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
const unsigned char *sel);
static int aarch64_address_cost (rtx, enum machine_mode, addr_space_t, bool);
}
}
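+/* Return true if X is a CONST_VECTOR of integers whose elements are all
+ identical and lie in the range [MINVAL, MAXVAL].  */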
+bool
+aarch64_const_vec_all_same_in_range_p (rtx x,
+ HOST_WIDE_INT minval,
+ HOST_WIDE_INT maxval)
+{
+ HOST_WIDE_INT firstval;
+ int count, i;
+
+ if (GET_CODE (x) != CONST_VECTOR
+ || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
+ return false;
+
+ firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
+ if (firstval < minval || firstval > maxval)
+ return false;
+
+ count = CONST_VECTOR_NUNITS (x);
+ for (i = 1; i < count; i++)
+ if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
+ return false;
+
+ return true;
+}
+
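+/* Return true if X is a CONST_VECTOR with all elements equal to VAL.  */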
+bool
+aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
+{
+ return aarch64_const_vec_all_same_in_range_p (x, val, val);
+}
+
static unsigned
bit_count (unsigned HOST_WIDE_INT value)
{
case CONST_VECTOR:
if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
{
- gcc_assert (aarch64_const_vec_all_same_int_p (x,
- HOST_WIDE_INT_MIN,
- HOST_WIDE_INT_MAX));
+ gcc_assert (
+ aarch64_const_vec_all_same_in_range_p (x,
+ HOST_WIDE_INT_MIN,
+ HOST_WIDE_INT_MAX));
asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
}
else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
#undef CHECK
}
-static bool
-aarch64_const_vec_all_same_int_p (rtx x,
- HOST_WIDE_INT minval,
- HOST_WIDE_INT maxval)
-{
- HOST_WIDE_INT firstval;
- int count, i;
-
- if (GET_CODE (x) != CONST_VECTOR
- || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
- return false;
-
- firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
- if (firstval < minval || firstval > maxval)
- return false;
-
- count = CONST_VECTOR_NUNITS (x);
- for (i = 1; i < count; i++)
- if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
- return false;
-
- return true;
-}
-
/* Check if immediate shift constants are within range. */
bool
aarch64_simd_shift_imm_p (rtx x, enum machine_mode mode, bool left)
{
int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
if (left)
- return aarch64_const_vec_all_same_int_p (x, 0, bit_width - 1);
+ return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
else
- return aarch64_const_vec_all_same_int_p (x, 1, bit_width);
+ return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
}
/* Return true if X is a uniform vector where all elements
{
return aarch64_simd_imm_zero_p (op, mode);
})
+
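+;; True for a CONST_VECTOR with every element equal to -1, as matched by
+;; operand 4 of the new aarch64_cmtst<mode> pattern.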
+(define_special_predicate "aarch64_simd_imm_minus_one"
+ (match_code "const_vector")
+{
+ return aarch64_const_vec_all_same_int_p (op, -1);
+})
+2014-09-05 Alan Lawrence <alan.lawrence@arm.com>
+
+ * gcc.target/aarch64/simd/int_comparisons.x: New file.
+ * gcc.target/aarch64/simd/int_comparisons_1.c: New test.
+ * gcc.target/aarch64/simd/int_comparisons_2.c: Ditto.
+
2014-09-05 Alan Lawrence <alan.lawrence@arm.com>

* gcc.target/aarch64/simd/vrbit_1.c: New test.
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/int_comparisons.x
+/* test_vcXXX wrappers for all the vcXXX (vector compare) and vtst intrinsics
+ in arm_neon.h (including the 64x1 variants, although these generally
+ produce scalar rather than vector ops). */
+#include "arm_neon.h"
+
+#define DONT_FORCE(X)
+
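+/* Bounce V1 through a SIMD register with a dummy asm, so that the scalar
+ d_[su]64 variants below are exercised on SIMD (d) registers.  */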
+#define FORCE_SIMD(V1) asm volatile ("mov %d0, %1.d[0]" \
+ : "=w"(V1) \
+ : "w"(V1) \
+ : /* No clobbers */);
+
+#define OP1(SIZE, OP, BASETYPE, SUFFIX, FORCE) uint##SIZE##_t \
+test_v##OP##SUFFIX (BASETYPE##SIZE##_t a) \
+{ \
+ uint##SIZE##_t res; \
+ FORCE (a); \
+ res = v##OP##SUFFIX (a); \
+ FORCE (res); \
+ return res; \
+}
+
+#define OP2(SIZE, OP, BASETYPE, SUFFIX, FORCE) uint##SIZE##_t \
+test_v##OP##SUFFIX (BASETYPE##SIZE##_t a, BASETYPE##SIZE##_t b) \
+{ \
+ uint##SIZE##_t res; \
+ FORCE (a); \
+ FORCE (b); \
+ res = v##OP##SUFFIX (a, b); \
+ FORCE (res); \
+ return res; \
+}
+
+#define UNSIGNED_OPS(SIZE, BASETYPE, SUFFIX, FORCE) \
+OP2 (SIZE, tst, BASETYPE, SUFFIX, FORCE) \
+OP1 (SIZE, ceqz, BASETYPE, SUFFIX, FORCE) \
+OP2 (SIZE, ceq, BASETYPE, SUFFIX, FORCE) \
+OP2 (SIZE, cge, BASETYPE, SUFFIX, FORCE) \
+OP2 (SIZE, cgt, BASETYPE, SUFFIX, FORCE) \
+OP2 (SIZE, cle, BASETYPE, SUFFIX, FORCE) \
+OP2 (SIZE, clt, BASETYPE, SUFFIX, FORCE)
+
+#define ALL_OPS(SIZE, BASETYPE, SUFFIX, FORCE) \
+OP1 (SIZE, cgez, BASETYPE, SUFFIX, FORCE) \
+OP1 (SIZE, cgtz, BASETYPE, SUFFIX, FORCE) \
+OP1 (SIZE, clez, BASETYPE, SUFFIX, FORCE) \
+OP1 (SIZE, cltz, BASETYPE, SUFFIX, FORCE) \
+UNSIGNED_OPS (SIZE, BASETYPE, SUFFIX, FORCE)
+
+ALL_OPS (8x8, int, _s8, DONT_FORCE)
+ALL_OPS (16x4, int, _s16, DONT_FORCE)
+ALL_OPS (32x2, int, _s32, DONT_FORCE)
+ALL_OPS (64x1, int, _s64, DONT_FORCE)
+ALL_OPS (64, int, d_s64, FORCE_SIMD)
+ALL_OPS (8x16, int, q_s8, DONT_FORCE)
+ALL_OPS (16x8, int, q_s16, DONT_FORCE)
+ALL_OPS (32x4, int, q_s32, DONT_FORCE)
+ALL_OPS (64x2, int, q_s64, DONT_FORCE)
+UNSIGNED_OPS (8x8, uint, _u8, DONT_FORCE)
+UNSIGNED_OPS (16x4, uint, _u16, DONT_FORCE)
+UNSIGNED_OPS (32x2, uint, _u32, DONT_FORCE)
+UNSIGNED_OPS (64x1, uint, _u64, DONT_FORCE)
+UNSIGNED_OPS (64, uint, d_u64, FORCE_SIMD)
+UNSIGNED_OPS (8x16, uint, q_u8, DONT_FORCE)
+UNSIGNED_OPS (16x8, uint, q_u16, DONT_FORCE)
+UNSIGNED_OPS (32x4, uint, q_u32, DONT_FORCE)
+UNSIGNED_OPS (64x2, uint, q_u64, DONT_FORCE)
+
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/int_comparisons_1.c
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-inline" } */
+
+/* Scan-assembler test, so incorporate as little other code as possible. */
+
+#include "arm_neon.h"
+#include "int_comparisons.x"
+
+/* Operations on all 18 integer types: (q?)_[su](8|16|32|64), d_[su]64.
+ (d?)_[us]64 generate regs of form 'd0' rather than e.g. 'v0.2d'. */
+/* { dg-final { scan-assembler-times "\[ \t\]cmeq\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*#?0" 14 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmeq\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]*#?0" 4 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmeq\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\]" 14 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmeq\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]+d\[0-9\]+" 4 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmtst\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\]" 14 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmtst\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]+d\[0-9\]+" 4 } } */
+
+/* vcge + vcle both implemented with cmge (signed) or cmhs (unsigned). */
+/* { dg-final { scan-assembler-times "\[ \t\]cmge\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\]" 14 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmge\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]+d\[0-9\]+" 4 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmhs\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\]" 14 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmhs\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]+d\[0-9\]+" 4 } } */
+
+/* vcgt + vclt both implemented with cmgt (signed) or cmhi (unsigned). */
+/* { dg-final { scan-assembler-times "\[ \t\]cmgt\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\]" 14 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmgt\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]+d\[0-9\]+" 4 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmhi\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\]" 14 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmhi\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]+d\[0-9\]+" 4 } } */
+
+/* Comparisons against immediate zero, on the 8 signed integer types only. */
+
+/* { dg-final { scan-assembler-times "\[ \t\]cmge\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*#?0" 7 } } */
+/* For int64_t and int64x1_t, combine_simplify_rtx failure of
+ https://gcc.gnu.org/ml/gcc/2014-06/msg00253.html
+ prevents generation of cmge....#0, instead producing mvn + sshr. */
+/* { #dg-final { scan-assembler-times "\[ \t\]cmge\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]*#?0" 2 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmgt\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*#?0" 7 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmgt\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]*#?0" 2 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmle\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*#?0" 7 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmle\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]*#?0" 2 } } */
+/* { dg-final { scan-assembler-times "\[ \t\]cmlt\[ \t\]+v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*v\[0-9\]+\.\[0-9\]+\[bshd\],\[ \t\]*#?0" 7 } } */
+/* For int64_t and int64x1_t, cmlt ... #0 and sshr ... #63 are equivalent,
+ so allow either. cmgez issue above results in extra 2 * sshr....63. */
+/* { dg-final { scan-assembler-times "\[ \t\](?:cmlt|sshr)\[ \t\]+d\[0-9\]+,\[ \t\]*d\[0-9\]+,\[ \t\]*#?(?:0|63)" 4 } } */
+
+// All should have been compiled into single insns without inverting result:
+/* { dg-final { scan-assembler-not "not" } } */
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/int_comparisons_2.c
+/* { dg-do run } */
+/* { dg-options "-O2 -fno-inline" } */
+/* Stops the test_xxx methods being inlined into main, thus preventing constant
+ propagation. */
+
+#include "int_comparisons.x"
+
+extern void abort (void);
+
+#define CHECK2(R0, R1) if (res[0] != R0 || res[1] != R1) abort ()
+
+#define TEST2(BASETYPE, SUFFIX, RESTYPE, ST1_SUFFIX) { \
+ BASETYPE##_t _a[2] = {2, 3}; \
+ BASETYPE##x2_t a = vld1##SUFFIX (_a); \
+ BASETYPE##_t _b[2] = {1, 3}; \
+ BASETYPE##x2_t b = vld1##SUFFIX (_b); \
+ RESTYPE res[2]; \
+ vst1##ST1_SUFFIX (res, test_vclt##SUFFIX (a, b)); CHECK2 (0, 0); \
+ vst1##ST1_SUFFIX (res, test_vclt##SUFFIX (b, a)); CHECK2 (-1, 0); \
+ vst1##ST1_SUFFIX (res, test_vcle##SUFFIX (a, b)); CHECK2 (0, -1); \
+ vst1##ST1_SUFFIX (res, test_vcle##SUFFIX (b, a)); CHECK2 (-1, -1); \
+ vst1##ST1_SUFFIX (res, test_vceq##SUFFIX (a, b)); CHECK2 (0, -1); \
+ vst1##ST1_SUFFIX (res, test_vcge##SUFFIX (a, b)); CHECK2 (-1, -1); \
+ vst1##ST1_SUFFIX (res, test_vcge##SUFFIX (b, a)); CHECK2 (0, -1); \
+ vst1##ST1_SUFFIX (res, test_vcgt##SUFFIX (a, b)); CHECK2 (-1, 0); \
+ vst1##ST1_SUFFIX (res, test_vcgt##SUFFIX (b, a)); CHECK2 (0, 0); \
+ vst1##ST1_SUFFIX (res, test_vtst##SUFFIX (a, b)); CHECK2 (0, -1); \
+ vst1##ST1_SUFFIX (res, test_vtst##SUFFIX (a + 1, b)); CHECK2 (-1, 0); \
+}
+
+#define CHECK4(T, R0, R1, R2, R3) \
+ if (res[0] != (T)R0 || res[1] != (T)R1 \
+ || res[2] != (T)R2 || res[3] != (T)R3) abort ()
+
+#define TEST4(BASETYPE, SUFFIX, RESTYPE, ST1_SUFFIX) { \
+ BASETYPE##_t _a[4] = {1, 2, 3, 4}; \
+ BASETYPE##x4_t a = vld1##SUFFIX (_a); \
+ BASETYPE##_t _b[4] = {4, 2, 1, 3}; \
+ BASETYPE##x4_t b = vld1##SUFFIX (_b); \
+ RESTYPE res[4]; \
+ vst1##ST1_SUFFIX (res, test_vclt##SUFFIX (a, b)); \
+ CHECK4 (RESTYPE, -1, 0, 0, 0); \
+ vst1##ST1_SUFFIX (res, test_vcle##SUFFIX (a, b)); \
+ CHECK4 (RESTYPE, -1, -1, 0, 0); \
+ vst1##ST1_SUFFIX (res, test_vceq##SUFFIX (a, b)); \
+ CHECK4 (RESTYPE, 0, -1, 0, 0); \
+ vst1##ST1_SUFFIX (res, test_vcge##SUFFIX (a, b)); \
+ CHECK4 (RESTYPE, 0, -1, -1, -1); \
+ vst1##ST1_SUFFIX (res, test_vcgt##SUFFIX (a, b)); \
+ CHECK4 (RESTYPE, 0, 0, -1, -1); \
+ vst1##ST1_SUFFIX (res, test_vtst##SUFFIX (a, b)); \
+ CHECK4 (RESTYPE, 0, -1, -1, 0); \
+}
+
+#define CHECK8(T, R0, R1, R2, R3, R4, R5, R6, R7) \
+ if (res[0] != (T)R0 || res[1] != (T)R1 || res[2] != (T)R2 || res[3] != (T)R3 \
+ || res[4] != (T)R4 || res[5] != (T)R5 || res[6] != (T)R6 \
+ || res[7] != (T)R7) abort ()
+
+#define TEST8(BASETYPE, SUFFIX, RESTYPE, ST1_SUFFIX) { \
+ BASETYPE##_t _a[8] = {1, 2, 3, 4, 5, 6, 7, 8}; \
+ BASETYPE##x8_t a = vld1##SUFFIX (_a); \
+ BASETYPE##_t _b[8] = {4, 2, 1, 3, 2, 6, 8, 9}; \
+ BASETYPE##x8_t b = vld1##SUFFIX (_b); \
+ RESTYPE res[8]; \
+ vst1##ST1_SUFFIX (res, test_vclt##SUFFIX (a, b)); \
+ CHECK8 (RESTYPE, -1, 0, 0, 0, 0, 0, -1, -1); \
+ vst1##ST1_SUFFIX (res, test_vcle##SUFFIX (a, b)); \
+ CHECK8 (RESTYPE, -1, -1, 0, 0, 0, -1, -1, -1); \
+ vst1##ST1_SUFFIX (res, test_vceq##SUFFIX (a, b)); \
+ CHECK8 (RESTYPE, 0, -1, 0, 0, 0, -1, 0, 0); \
+ vst1##ST1_SUFFIX (res, test_vcge##SUFFIX (a, b)); \
+ CHECK8 (RESTYPE, 0, -1, -1, -1, -1, -1, 0, 0); \
+ vst1##ST1_SUFFIX (res, test_vcgt##SUFFIX (a, b)); \
+ CHECK8 (RESTYPE, 0, 0, -1, -1, -1, 0, 0, 0); \
+ vst1##ST1_SUFFIX (res, test_vtst##SUFFIX (a, b)); \
+ CHECK8 (RESTYPE, 0, -1, -1, 0, 0, -1, 0, -1); \
+}
+
+/* 16-way tests use same 8 values twice. */
+#define CHECK16(T, R0, R1, R2, R3, R4, R5, R6, R7) \
+ if (res[0] != (T)R0 || res[1] != (T)R1 || res[2] != (T)R2 || res[3] != (T)R3 \
+ || res[4] != (T)R4 || res[5] != (T)R5 || res[6] != (T)R6 \
+ || res[7] != (T)R7 || res[8] != (T)R0 || res[9] != (T)R1 \
+ || res[10] != (T)R2 || res[11] != (T)R3 || res[12] != (T)R4 \
+ || res[13] != (T)R5 || res[14] != (T)R6 || res[15] != (T)R7) abort ()
+
+#define TEST16(BASETYPE, SUFFIX, RESTYPE, ST1_SUFFIX) { \
+ BASETYPE##_t _a[16] = {1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8}; \
+ BASETYPE##x16_t a = vld1##SUFFIX (_a); \
+ BASETYPE##_t _b[16] = {4, 2, 1, 3, 2, 6, 8, 9, 4, 2, 1, 3, 2, 6, 8, 9}; \
+ BASETYPE##x16_t b = vld1##SUFFIX (_b); \
+ RESTYPE res[16]; \
+ vst1##ST1_SUFFIX (res, test_vclt##SUFFIX (a, b)); \
+ CHECK16 (RESTYPE, -1, 0, 0, 0, 0, 0, -1, -1); \
+ vst1##ST1_SUFFIX (res, test_vcle##SUFFIX (a, b)); \
+ CHECK16 (RESTYPE, -1, -1, 0, 0, 0, -1, -1, -1); \
+ vst1##ST1_SUFFIX (res, test_vceq##SUFFIX (a, b)); \
+ CHECK16 (RESTYPE, 0, -1, 0, 0, 0, -1, 0, 0); \
+ vst1##ST1_SUFFIX (res, test_vcge##SUFFIX (a, b)); \
+ CHECK16 (RESTYPE, 0, -1, -1, -1, -1, -1, 0, 0); \
+ vst1##ST1_SUFFIX (res, test_vcgt##SUFFIX (a, b)); \
+ CHECK16 (RESTYPE, 0, 0, -1, -1, -1, 0, 0, 0); \
+ vst1##ST1_SUFFIX (res, test_vtst##SUFFIX (a, b)); \
+ CHECK16 (RESTYPE, 0, -1, -1, 0, 0, -1, 0, -1); \
+}
+
+int
+main (int argc, char **argv)
+{
+ TEST2 (int32, _s32, uint32_t, _u32);
+ TEST2 (uint32, _u32, uint32_t, _u32);
+ TEST2 (int64, q_s64, uint64_t, q_u64);
+ TEST2 (uint64, q_u64, uint64_t, q_u64);
+
+ TEST4 (int16, _s16, uint16_t, _u16);
+ TEST4 (uint16, _u16, uint16_t, _u16);
+ TEST4 (int32, q_s32, uint32_t, q_u32);
+ TEST4 (uint32, q_u32, uint32_t, q_u32);
+
+ TEST8 (int8, _s8, uint8_t, _u8);
+ TEST8 (uint8, _u8, uint8_t, _u8);
+ TEST8 (int16, q_s16, uint16_t, q_u16);
+ TEST8 (uint16, q_u16, uint16_t, q_u16);
+
+ TEST16 (int8, q_s8, uint8_t, q_u8);
+ TEST16 (uint8, q_u8, uint8_t, q_u8);
+
+ return 0;
+}
+