;; in *aarch64_simd_bsl<mode>_alt.
(define_insn "aarch64_simd_bsl<mode>_internal"
- [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
- (xor:VSDQ_I_DI
- (and:VSDQ_I_DI
- (xor:VSDQ_I_DI
+ [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+ (xor:VDQ_I
+ (and:VDQ_I
+ (xor:VDQ_I
(match_operand:<V_INT_EQUIV> 3 "register_operand" "w,0,w")
- (match_operand:VSDQ_I_DI 2 "register_operand" "w,w,0"))
- (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
+ (match_operand:VDQ_I 2 "register_operand" "w,w,0"))
+ (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
(match_dup:<V_INT_EQUIV> 3)
))]
"TARGET_SIMD"
;; permutations of commutative operations, we have to have a separate pattern.
(define_insn "*aarch64_simd_bsl<mode>_alt"
- [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
- (xor:VSDQ_I_DI
- (and:VSDQ_I_DI
- (xor:VSDQ_I_DI
- (match_operand:VSDQ_I_DI 3 "register_operand" "w,w,0")
- (match_operand:VSDQ_I_DI 2 "register_operand" "w,0,w"))
- (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
- (match_dup:VSDQ_I_DI 2)))]
+ [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+ (xor:VDQ_I
+ (and:VDQ_I
+ (xor:VDQ_I
+ (match_operand:VDQ_I 3 "register_operand" "w,w,0")
+ (match_operand:<V_INT_EQUIV> 2 "register_operand" "w,0,w"))
+ (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
+ (match_dup:<V_INT_EQUIV> 2)))]
"TARGET_SIMD"
"@
bsl\\t%0.<Vbtype>, %3.<Vbtype>, %2.<Vbtype>
[(set_attr "type" "neon_bsl<q>")]
)
+;; DImode is special: we want to avoid computing in the vector registers
+;; operations which are more naturally computed in the general purpose
+;; registers, as doing so means moving all three operands from general
+;; purpose registers to vector registers, then back again.  However, we
+;; don't want to make this pattern an UNSPEC as we'd lose scope for
+;; optimizations based on the component operations of a BSL.
+;;
+;; That means we need a splitter back to the individual operations, if they
+;; would be better calculated on the integer side.
+
+(define_insn_and_split "aarch64_simd_bsldi_internal"
+ [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
+ (xor:DI
+ (and:DI
+ (xor:DI
+ (match_operand:DI 3 "register_operand" "w,0,w,r")
+ (match_operand:DI 2 "register_operand" "w,w,0,r"))
+ (match_operand:DI 1 "register_operand" "0,w,w,r"))
+ (match_dup:DI 3)
+ ))]
+ "TARGET_SIMD"
+ "@
+ bsl\\t%0.8b, %2.8b, %3.8b
+ bit\\t%0.8b, %2.8b, %1.8b
+ bif\\t%0.8b, %3.8b, %1.8b
+ #"
+ "&& GP_REGNUM_P (REGNO (operands[0]))"
+ [(match_dup 1) (match_dup 1) (match_dup 2) (match_dup 3)]
+{
+ /* Split back to individual operations. If we're before reload, and
+ able to create a temporary register, do so. If we're after reload,
+ we've got an early-clobber destination register, so use that.
+ Otherwise, we can't create pseudos and we can't yet guarantee that
+ operands[0] is safe to write, so FAIL to split. */
+
+ rtx scratch;
+ if (reload_completed)
+ scratch = operands[0];
+ else if (can_create_pseudo_p ())
+ scratch = gen_reg_rtx (DImode);
+ else
+ FAIL;
+
+ emit_insn (gen_xordi3 (scratch, operands[2], operands[3]));
+ emit_insn (gen_anddi3 (scratch, scratch, operands[1]));
+ emit_insn (gen_xordi3 (operands[0], scratch, operands[3]));
+ DONE;
+}
+ [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
+ (set_attr "length" "4,4,4,12")]
+)
+
+(define_insn_and_split "aarch64_simd_bsldi_alt"
+ [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
+ (xor:DI
+ (and:DI
+ (xor:DI
+ (match_operand:DI 3 "register_operand" "w,w,0,r")
+ (match_operand:DI 2 "register_operand" "w,0,w,r"))
+ (match_operand:DI 1 "register_operand" "0,w,w,r"))
+ (match_dup:DI 2)
+ ))]
+ "TARGET_SIMD"
+ "@
+ bsl\\t%0.8b, %3.8b, %2.8b
+ bit\\t%0.8b, %3.8b, %1.8b
+ bif\\t%0.8b, %2.8b, %1.8b
+ #"
+ "&& GP_REGNUM_P (REGNO (operands[0]))"
+ [(match_dup 0) (match_dup 1) (match_dup 2) (match_dup 3)]
+{
+ /* Split back to individual operations. If we're before reload, and
+ able to create a temporary register, do so. If we're after reload,
+ we've got an early-clobber destination register, so use that.
+ Otherwise, we can't create pseudos and we can't yet guarantee that
+ operands[0] is safe to write, so FAIL to split. */
+
+ rtx scratch;
+ if (reload_completed)
+ scratch = operands[0];
+ else if (can_create_pseudo_p ())
+ scratch = gen_reg_rtx (DImode);
+ else
+ FAIL;
+
+ emit_insn (gen_xordi3 (scratch, operands[2], operands[3]));
+ emit_insn (gen_anddi3 (scratch, scratch, operands[1]));
+ emit_insn (gen_xordi3 (operands[0], scratch, operands[2]));
+ DONE;
+}
+ [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
+ (set_attr "length" "4,4,4,12")]
+)
+
(define_expand "aarch64_simd_bsl<mode>"
[(match_operand:VALLDIF 0 "register_operand")
(match_operand:<V_INT_EQUIV> 1 "register_operand")
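The new DImode patterns above exist to keep this idiom on the integer side
when its operands live in general purpose registers.  As a short standalone
C sketch (not part of the patch; function names are illustrative), the
xor/and/xor form matched by aarch64_simd_bsldi_internal is a bitwise select,
taking bits from a where the mask c is set and from b elsewhere, which is
exactly BSL's semantics:

#include <assert.h>

/* Form matched by aarch64_simd_bsldi_internal: ((a ^ b) & c) ^ b.  */
static unsigned long long
bsl_idiom (unsigned long long a, unsigned long long b, unsigned long long c)
{
  return ((a ^ b) & c) ^ b;
}

/* Straightforward bitwise select: a where c is set, b elsewhere.  */
static unsigned long long
bsl_reference (unsigned long long a, unsigned long long b, unsigned long long c)
{
  return (a & c) | (b & ~c);
}

int
main (void)
{
  unsigned long long a = 0x0123456789abcdefULL;
  unsigned long long b = 0xfedcba9876543210ULL;
  unsigned long long c = 0xaabbccddeeff7777ULL;
  assert (bsl_idiom (a, b, c) == bsl_reference (a, b, c));
  return 0;
}

With a, b and c in general purpose registers this is just eor/and/eor, which
is what the splitter recovers; only when the values are already in vector
registers is a single BSL the better choice.  The test case below checks both
sides of that decision.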
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O2 -fdump-rtl-combine --save-temps" } */
+
+/* Test that we don't generate BSL when in DImode with values in integer
+ registers, and do generate it where we have values in floating-point
+ registers. This is useful, as it allows us to avoid register moves
+ in the general case.
+
+ We want:
+ eor x0, x0, x1
+ and x0, x0, x2
+ eor x0, x0, x1
+ ret
+
+ Rather than:
+ fmov d2, x0
+ fmov d0, x2
+ fmov d1, x1
+ bsl v0.8b, v2.8b, v1.8b
+ fmov x0, d0
+ ret */
+
+extern void abort (void);
+
+unsigned long long __attribute__ ((noinline))
+foo (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+ return ((a ^ b) & c) ^ b;
+}
+
+unsigned long long __attribute__ ((noinline))
+foo2 (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+ return ((a ^ b) & c) ^ a;
+}
+
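+/* Round-trip a value through a SIMD register: the "w" constraints on the
+   volatile asm force register allocation to keep it on the vector side,
+   so the bar/bar2 versions of the idiom should still become a single BSL.  */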
+#define force_simd(V1) asm volatile ("mov %d0, %1.d[0]" \
+ : "=w"(V1) \
+ : "w"(V1) \
+ : /* No clobbers */);
+
+unsigned long long __attribute__ ((noinline))
+bar (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+ force_simd (a);
+ force_simd (b);
+ force_simd (c);
+ c = ((a ^ b) & c) ^ b;
+ force_simd (c);
+ return c;
+}
+
+unsigned long long __attribute__ ((noinline))
+bar2 (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+ force_simd (a);
+ force_simd (b);
+ force_simd (c);
+ c = ((a ^ b) & c) ^ a;
+ force_simd (c);
+ return c;
+}
+
+int
+main (int argc, char** argv)
+{
+ unsigned long long a = 0x0123456789abcdefULL;
+ unsigned long long b = 0xfedcba9876543210ULL;
+ unsigned long long c = 0xaabbccddeeff7777ULL;
+ if (foo (a, b, c) != bar (a, b, c))
+ abort ();
+ if (foo2 (a, b, c) != bar2 (a, b, c))
+ abort ();
+ return 0;
+}
+
+/* 2 BSL, 6 FMOV (to floating-point registers), and 2 FMOV (to general
+   purpose registers) for the "bar" tests, which should still use BSL.  */
+/* { dg-final { scan-assembler-times "bsl\tv\[0-9\]" 2 } } */
+/* { dg-final { scan-assembler-times "fmov\td\[0-9\]" 6 } } */
+/* { dg-final { scan-assembler-times "fmov\tx\[0-9\]" 2 } } */
+
+/* { dg-final { scan-assembler-not "bif\tv\[0-9\]" } } */
+/* { dg-final { scan-assembler-not "bit\tv\[0-9\]" } } */
+
+/* We always match the idiom during combine. */
+/* { dg-final { scan-rtl-dump-times "aarch64_simd_bsldi_internal" 2 "combine" } } */
+/* { dg-final { scan-rtl-dump-times "aarch64_simd_bsldi_alt" 2 "combine" } } */