+2019-11-16  Richard Sandiford  <richard.sandiford@arm.com>
+
+ * config/aarch64/iterators.md (SVE_HSDI): New mode iterator.
+ (narrower_mask): Handle VNx4HI, VNx2HI and VNx2SI.
+ * config/aarch64/aarch64-sve.md
+ (<ANY_EXTEND:optab><SVE_PARTIAL_I:mode><SVE_HSDI:mode>2): New pattern.
+ (*<ANY_EXTEND:optab><SVE_PARTIAL_I:mode><SVE_HSDI:mode>2): Likewise.
+ (@aarch64_pred_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>): Update
+ comment. Avoid new narrower_mask ambiguity.
+ (@aarch64_cond_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>): Likewise.
+ (*cond_uxt<mode>_2): Update comment.
+ (*cond_uxt<mode>_any): Likewise.
+
2019-11-16  Richard Sandiford  <richard.sandiford@arm.com>

* config/aarch64/aarch64-modes.def: Define partial SVE vector
;; == Unary arithmetic
;; ---- [INT] General unary arithmetic corresponding to rtx codes
;; ---- [INT] General unary arithmetic corresponding to unspecs
-;; ---- [INT] Sign extension
-;; ---- [INT] Zero extension
+;; ---- [INT] Sign and zero extension
;; ---- [INT] Logical inverse
;; ---- [FP<-INT] General unary arithmetic that maps to unspecs
;; ---- [FP] General unary arithmetic corresponding to unspecs
)
;; -------------------------------------------------------------------------
-;; ---- [INT] Sign extension
+;; ---- [INT] Sign and zero extension
;; -------------------------------------------------------------------------
;; Includes:
;; - SXTB
;; - SXTH
;; - SXTW
+;; - UXTB
+;; - UXTH
+;; - UXTW
;; -------------------------------------------------------------------------
-;; Predicated SXT[BHW].
+;; Unpredicated sign and zero extension from a narrower mode.
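+;;
+;; The condition requires SVE_PARTIAL_I to have the same number of
+;; elements per 128-bit block as SVE_HSDI but a narrower element size:
+;; for example, VNx2HI (self_mask 0x22) can be extended to VNx2SI
+;; (narrower_mask 0x23), since ~0x23 & 0x22 == 0, but not to VNx4SI
+;; (narrower_mask 0x43), whose element count differs.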
+(define_expand "<optab><SVE_PARTIAL_I:mode><SVE_HSDI:mode>2"
+ [(set (match_operand:SVE_HSDI 0 "register_operand")
+ (unspec:SVE_HSDI
+ [(match_dup 2)
+ (ANY_EXTEND:SVE_HSDI
+ (match_operand:SVE_PARTIAL_I 1 "register_operand"))]
+ UNSPEC_PRED_X))]
+ "TARGET_SVE && (~<SVE_HSDI:narrower_mask> & <SVE_PARTIAL_I:self_mask>) == 0"
+ {
+ operands[2] = aarch64_ptrue_reg (<SVE_HSDI:VPRED>mode);
+ }
+)
+
+;; Predicated sign and zero extension from a narrower mode.
+(define_insn "*<optab><SVE_PARTIAL_I:mode><SVE_HSDI:mode>2"
+ [(set (match_operand:SVE_HSDI 0 "register_operand" "=w")
+ (unspec:SVE_HSDI
+ [(match_operand:<SVE_HSDI:VPRED> 1 "register_operand" "Upl")
+ (ANY_EXTEND:SVE_HSDI
+ (match_operand:SVE_PARTIAL_I 2 "register_operand" "w"))]
+ UNSPEC_PRED_X))]
+ "TARGET_SVE && (~<SVE_HSDI:narrower_mask> & <SVE_PARTIAL_I:self_mask>) == 0"
+ "<su>xt<SVE_PARTIAL_I:Vesize>\t%0.<SVE_HSDI:Vetype>, %1/m, %2.<SVE_HSDI:Vetype>"
+)
+
+;; Predicated truncate-and-sign-extend operations.
(define_insn "@aarch64_pred_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>"
[(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
(unspec:SVE_FULL_HSDI
(truncate:SVE_PARTIAL_I
(match_operand:SVE_FULL_HSDI 2 "register_operand" "w")))]
UNSPEC_PRED_X))]
- "TARGET_SVE && (~<narrower_mask> & <self_mask>) == 0"
+ "TARGET_SVE
+ && (~<SVE_FULL_HSDI:narrower_mask> & <SVE_PARTIAL_I:self_mask>) == 0"
"sxt<SVE_PARTIAL_I:Vesize>\t%0.<SVE_FULL_HSDI:Vetype>, %1/m, %2.<SVE_FULL_HSDI:Vetype>"
)
-;; Predicated SXT[BHW] with merging.
+;; Predicated truncate-and-sign-extend operations with merging.
(define_insn "@aarch64_cond_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>"
[(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w, ?&w, ?&w")
(unspec:SVE_FULL_HSDI
(match_operand:SVE_FULL_HSDI 2 "register_operand" "w, w, w")))
(match_operand:SVE_FULL_HSDI 3 "aarch64_simd_reg_or_zero" "0, Dz, w")]
UNSPEC_SEL))]
- "TARGET_SVE && (~<narrower_mask> & <self_mask>) == 0"
+ "TARGET_SVE
+ && (~<SVE_FULL_HSDI:narrower_mask> & <SVE_PARTIAL_I:self_mask>) == 0"
"@
sxt<SVE_PARTIAL_I:Vesize>\t%0.<SVE_FULL_HSDI:Vetype>, %1/m, %2.<SVE_FULL_HSDI:Vetype>
movprfx\t%0.<SVE_FULL_HSDI:Vetype>, %1/z, %2.<SVE_FULL_HSDI:Vetype>\;sxt<SVE_PARTIAL_I:Vesize>\t%0.<SVE_FULL_HSDI:Vetype>, %1/m, %2.<SVE_FULL_HSDI:Vetype>
[(set_attr "movprfx" "*,yes,yes")]
)
-;; -------------------------------------------------------------------------
-;; ---- [INT] Zero extension
-;; -------------------------------------------------------------------------
-;; Includes:
-;; - UXTB
-;; - UXTH
-;; - UXTW
-;; -------------------------------------------------------------------------
-
-;; Match UXT[BHW] as a conditional AND of a constant, merging with the
+;; Predicated truncate-and-zero-extend operations, merging with the
;; first input.
+;;
+;; The canonical form of this operation is an AND of a constant rather
+;; than (zero_extend (truncate ...)).
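+;;
+;; For example, UXTB on .h elements is matched as an AND with the
+;; constant 0xff rather than as an explicit extension.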
(define_insn "*cond_uxt<mode>_2"
[(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w")
(unspec:SVE_FULL_I
[(set_attr "movprfx" "*,yes")]
)
-;; Match UXT[BHW] as a conditional AND of a constant, merging with an
+;; Predicated truncate-and-zero-extend operations, merging with an
;; independent value.
;;
;; The earlyclobber isn't needed for the first alternative, but omitting
VNx4SI VNx2SI
VNx2DI])
+;; SVE integer vector modes whose elements are 16 bits or wider.
+(define_mode_iterator SVE_HSDI [VNx8HI VNx4HI VNx2HI
+ VNx4SI VNx2SI
+ VNx2DI])
+
;; Modes involved in extending or truncating SVE data, for 8 elements per
;; 128-bit block.
(define_mode_iterator VNx8_NARROW [VNx8QI])
(VNx2HI "0x22")
(VNx2SI "0x24")])
-;; For full vector modes, the mask of narrower modes, encoded as above.
-(define_mode_attr narrower_mask [(VNx8HI "0x81")
- (VNx4SI "0x43")
+;; For SVE_HSDI vector modes, the mask of narrower modes, encoded as above.
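+;; For example, VNx4HI maps to 0x41: 0x40 for four elements per
+;; 128-bit block, plus 0x01 for the single narrower element size
+;; (one byte).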
+(define_mode_attr narrower_mask [(VNx8HI "0x81") (VNx4HI "0x41")
+ (VNx2HI "0x21")
+ (VNx4SI "0x43") (VNx2SI "0x23")
(VNx2DI "0x27")])
;; The constraint to use for an SVE [SU]DOT, FMUL, FMLA or FMLS lane index.
+2019-11-16  Richard Sandiford  <richard.sandiford@arm.com>
+
+ * gcc.target/aarch64/sve/cost_model_1.c: Expect the loop to be
+ vectorized with bytes stored in 32-bit containers.
+ * gcc.target/aarch64/sve/extend_1.c: New test.
+ * gcc.target/aarch64/sve/extend_2.c: New test.
+ * gcc.target/aarch64/sve/extend_3.c: New test.
+ * gcc.target/aarch64/sve/extend_4.c: New test.
+ * gcc.target/aarch64/sve/load_const_offset_3.c: Add
+ --param aarch64-sve-compare-costs=0.
+ * gcc.target/aarch64/sve/mask_struct_store_1.c: Likewise.
+ * gcc.target/aarch64/sve/mask_struct_store_1_run.c: Likewise.
+ * gcc.target/aarch64/sve/mask_struct_store_2.c: Likewise.
+ * gcc.target/aarch64/sve/mask_struct_store_2_run.c: Likewise.
+ * gcc.target/aarch64/sve/unpack_unsigned_1.c: Likewise.
+ * gcc.target/aarch64/sve/unpack_unsigned_1_run.c: Likewise.
+
2019-11-16  Richard Sandiford  <richard.sandiford@arm.com>

* gcc.target/aarch64/sve/mixed_size_1.c: New test.
-/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details" } */
+/* { dg-options "-O2 -ftree-vectorize" } */
void
f (unsigned int *restrict x, unsigned int *restrict y,
x[i] = x[i] + y[i] + z[i];
}
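+/* The loop should now be vectorized, with the bytes held in 32-bit
+   containers, rather than rejected on cost grounds.  */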
-/* { dg-final { scan-tree-dump "not vectorized: estimated iteration count too small" vect } } */
-/* { dg-final { scan-tree-dump "vectorized 0 loops" vect } } */
+/* { dg-final { scan-assembler {\tld1b\tz[0-9]+\.s, p[0-7]/z, \[x2\]\n} } } */
--- /dev/null
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(TYPE1, TYPE2) \
+ void \
+ f_##TYPE1##_##TYPE2 (TYPE1 *restrict dst, TYPE1 *restrict src1, \
+ TYPE2 *restrict src2, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ dst[i] += src1[i] + (TYPE2) (src2[i] + 1); \
+ }
+
+#define TEST_ALL(T) \
+ T (uint16_t, uint8_t) \
+ T (uint32_t, uint8_t) \
+ T (uint64_t, uint8_t) \
+ T (uint32_t, uint16_t) \
+ T (uint64_t, uint16_t) \
+ T (uint64_t, uint32_t)
+
+TEST_ALL (TEST_LOOP)
+
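+/* Each narrow source should be loaded straight into wider containers
+   (ld1b/ld1h/ld1w), with the "+ 1" done at TYPE2's element size and
+   the extension to TYPE1 matched as a single UXT[BHW].  */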
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.s,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.d,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, z[0-9]+\.b, #1\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, #1\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tuxtb\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tuxtb\tz[0-9]+\.s,} 1 } } */
+/* { dg-final { scan-assembler-times {\tuxtb\tz[0-9]+\.d,} 1 } } */
+/* { dg-final { scan-assembler-times {\tuxth\tz[0-9]+\.s,} 1 } } */
+/* { dg-final { scan-assembler-times {\tuxth\tz[0-9]+\.d,} 1 } } */
+/* { dg-final { scan-assembler-times {\tuxtw\tz[0-9]+\.d,} 1 } } */
--- /dev/null
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(TYPE1, TYPE2) \
+ void \
+ f_##TYPE1##_##TYPE2 (TYPE1 *restrict dst, TYPE1 *restrict src1, \
+ TYPE2 *restrict src2, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ dst[i] += src1[i] + (TYPE2) (src2[i] + 1); \
+ }
+
+#define TEST_ALL(T) \
+ T (int16_t, int8_t) \
+ T (int32_t, int8_t) \
+ T (int64_t, int8_t) \
+ T (int32_t, int16_t) \
+ T (int64_t, int16_t) \
+ T (int64_t, int32_t)
+
+TEST_ALL (TEST_LOOP)
+
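+/* Each narrow source should be loaded straight into wider containers
+   (ld1b/ld1h/ld1w), with the "+ 1" done at TYPE2's element size and
+   the sign extension to TYPE1 matched as a single SXT[BHW].  */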
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.s,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.d,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, z[0-9]+\.b, #1\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, #1\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, #1\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tsxtb\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tsxtb\tz[0-9]+\.s,} 1 } } */
+/* { dg-final { scan-assembler-times {\tsxtb\tz[0-9]+\.d,} 1 } } */
+/* { dg-final { scan-assembler-times {\tsxth\tz[0-9]+\.s,} 1 } } */
+/* { dg-final { scan-assembler-times {\tsxth\tz[0-9]+\.d,} 1 } } */
+/* { dg-final { scan-assembler-times {\tsxtw\tz[0-9]+\.d,} 1 } } */
--- /dev/null
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=512" } */
+
+#include <stdint.h>
+
+void
+f (uint64_t *dst, uint32_t *restrict src1, uint16_t *restrict src2,
+ uint8_t *restrict src3)
+{
+ for (int i = 0; i < 7; ++i)
+ dst[i] += (uint32_t) (src1[i] + (uint16_t) (src2[i]
+ + (uint8_t) (src3[i] + 1)));
+}
+
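+/* With 512-bit vectors, all seven iterations fit in one predicated
+   vector of 64-bit containers, so each extension step in the chain
+   should be matched at its own element size: UXTB for the .h result,
+   UXTH for .s and UXTW for .d.  */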
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.d,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, z[0-9]+\.b, #1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tuxtb\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tuxth\tz[0-9]+\.s,} 1 } } */
+/* { dg-final { scan-assembler-times {\tuxtw\tz[0-9]+\.d,} 1 } } */
--- /dev/null
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=512" } */
+
+#include <stdint.h>
+
+void
+f (int64_t *dst, int32_t *restrict src1, int16_t *restrict src2,
+ int8_t *restrict src3)
+{
+ for (int i = 0; i < 7; ++i)
+ dst[i] += (int32_t) (src1[i] + (int16_t) (src2[i]
+ + (int8_t) (src3[i] + 1)));
+}
+
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.d,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d,} 1 } } */
+
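+/* With 512-bit vectors, all seven iterations fit in one predicated
+   vector of 64-bit containers, so each sign-extension step in the
+   chain should be matched at its own element size: SXTB for the .h
+   result, SXTH for .s and SXTW for .d.  */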
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.b, z[0-9]+\.b, #1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.h, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tsxtb\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tsxth\tz[0-9]+\.s,} 1 } } */
+/* { dg-final { scan-assembler-times {\tsxtw\tz[0-9]+\.d,} 1 } } */
/* { dg-do assemble { target aarch64_asm_sve_ok } } */
-/* { dg-options "-O2 -ftree-vectorize -save-temps -msve-vector-bits=256" } */
+/* { dg-options "-O2 -ftree-vectorize -save-temps -msve-vector-bits=256 --param aarch64-sve-compare-costs=0" } */
#include "load_const_offset_2.c"
/* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math --param aarch64-sve-compare-costs=0" } */
#include <stdint.h>
/* { dg-do run { target aarch64_sve_hw } } */
-/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math --param aarch64-sve-compare-costs=0" } */
#include "mask_struct_store_1.c"
/* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math --param aarch64-sve-compare-costs=0" } */
#include <stdint.h>
/* { dg-do run { target aarch64_sve_hw } } */
-/* { dg-options "-O2 -ftree-vectorize -ffast-math" } */
+/* { dg-options "-O2 -ftree-vectorize -ffast-math --param aarch64-sve-compare-costs=0" } */
#include "mask_struct_store_2.c"
/* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize -fno-inline" } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline --param aarch64-sve-compare-costs=0" } */
#include <stdint.h>
/* { dg-do run { target aarch64_sve_hw } } */
-/* { dg-options "-O2 -ftree-vectorize -fno-inline" } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline --param aarch64-sve-compare-costs=0" } */
#include "unpack_unsigned_1.c"