+2018-02-01 Richard Sandiford <richard.sandiford@linaro.org>
+
+ * config/aarch64/aarch64-sve.md (sve_ld1rq): Replace with...
+ (*sve_ld1rq<Vesize>): ... this new pattern. Handle all element sizes,
+ not just bytes.
+ * config/aarch64/aarch64.c (aarch64_expand_sve_widened_duplicate):
+ Remove BSWAP handling for big-endian targets and use the form of
+ LD1RQ appropriate for the mode.
+
2018-02-01 Richard Sandiford <richard.sandiford@linaro.org>

* config/aarch64/aarch64.c (aarch64_simd_valid_immediate): Handle
;; Load 128 bits from memory and duplicate to fill a vector. Since there
;; are so few operations on 128-bit "elements", we don't define a VNx1TI
;; and simply use vectors of bytes instead.
-(define_insn "sve_ld1rq"
- [(set (match_operand:VNx16QI 0 "register_operand" "=w")
- (unspec:VNx16QI
- [(match_operand:VNx16BI 1 "register_operand" "Upl")
+(define_insn "*sve_ld1rq<Vesize>"
+ [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+ (unspec:SVE_ALL
+ [(match_operand:<VPRED> 1 "register_operand" "Upl")
(match_operand:TI 2 "aarch64_sve_ld1r_operand" "Uty")]
UNSPEC_LD1RQ))]
"TARGET_SVE"
- "ld1rqb\t%0.b, %1/z, %2"
+ "ld1rq<Vesize>\t%0.<Vetype>, %1/z, %2"
)
;; Implement a predicate broadcast by shifting the low bit of the scalar
return true;
}
- /* The bytes are loaded in little-endian order, so do a byteswap on
- big-endian targets. */
- if (BYTES_BIG_ENDIAN)
- {
- src = simplify_unary_operation (BSWAP, src_mode, src, src_mode);
- if (!src)
- return NULL_RTX;
- }
-
- /* Use LD1RQ to load the 128 bits from memory. */
+ /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
src = force_const_mem (src_mode, src);
if (!src)
return false;
src = replace_equiv_address (src, addr);
}
- rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
- emit_insn (gen_sve_ld1rq (gen_lowpart (VNx16QImode, dest), ptrue, src));
+ machine_mode mode = GET_MODE (dest);
+ unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
+ machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
+ rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
+ src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
+ emit_insn (gen_rtx_SET (dest, src));
return true;
}
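
(Not part of the patch: a minimal standalone sketch of the behaviour the pattern and expander above implement, written with the ACLE SVE intrinsics rather than internal RTL. The function name broadcast_quad is made up for illustration; svld1rq_s32 and svptrue_b32 are the published ACLE names, not something defined in this change.)

/* Sketch only: LD1RQW loads 128 bits (four .s elements here) and
   repeats that quadword to fill the whole SVE vector, whatever the
   vector length.  Compile with e.g. -O2 -march=armv8.2-a+sve.  */
#include <stdint.h>
#include <arm_sve.h>

svint32_t
broadcast_quad (const int32_t *base)
{
  /* Load base[0..3] and replicate them across the vector; this is
     the operation that UNSPEC_LD1RQ represents.  */
  return svld1rq_s32 (svptrue_b32 (), base);
}
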
+2018-02-01 Richard Sandiford <richard.sandiford@linaro.org>
+
+ * gcc.target/aarch64/sve/slp_2.c: Expect LD1RQD rather than LD1RQB.
+ * gcc.target/aarch64/sve/slp_3.c: Expect LD1RQW rather than LD1RQB.
+ * gcc.target/aarch64/sve/slp_4.c: Expect LD1RQH rather than LD1RQB.
+
2018-02-01 Jakub Jelinek <jakub@redhat.com>

PR tree-optimization/81661
/* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h, } 2 } } */
/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 3 } } */
/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 3 } } */
-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 3 } } */
+/* { dg-final { scan-assembler-times {\tld1rqd\tz[0-9]+\.d, } 3 } } */
/* { dg-final { scan-assembler-not {\tzip1\t} } } */
/* { dg-final { scan-assembler-not {\tzip2\t} } } */
/* 1 for each 16-bit type and 4 for double. */
/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 7 } } */
/* 1 for each 32-bit type. */
-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 3 } } */
+/* { dg-final { scan-assembler-times {\tld1rqw\tz[0-9]+\.s, } 3 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #41\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #25\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #31\n} 2 } } */
/* 1 for each 8-bit type, 4 for each 32-bit type and 8 for double. */
/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 22 } } */
/* 1 for each 16-bit type. */
-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]\.b, } 3 } } */
+/* { dg-final { scan-assembler-times {\tld1rqh\tz[0-9]\.h, } 3 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #99\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #11\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #17\n} 2 } } */
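
(For context, not part of the patch: the slp_*.c tests contain kernels roughly of the following shape; the function name and constants below are invented for illustration, not copied from the testsuite. An SLP group of eight distinct 16-bit constants needs a 128-bit constant vector, which with this patch is materialised with ld1rqh rather than a byte-wise ld1rqb.)

#include <stdint.h>

/* Hypothetical example: with -O2 -ftree-vectorize -march=armv8.2-a+sve,
   the repeating eight-element group below becomes a constant vector
   whose 128-bit image is duplicated across the SVE register, i.e. an
   ld1rqh of the constant-pool entry.  */
void
store_pattern (int16_t *a, int n)
{
  for (int i = 0; i < n; i += 8)
    {
      a[i + 0] = 1;  a[i + 1] = 2;  a[i + 2] = 3;  a[i + 3] = 4;
      a[i + 4] = 5;  a[i + 5] = 6;  a[i + 6] = 7;  a[i + 7] = 8;
    }
}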