From ad2603433853129e847cade5e269c6a5f889a020 Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Fri, 15 Jan 2021 18:50:27 +0000 Subject: [PATCH] AArch64: Add NEON, SVE and SVE2 RTL patterns for Multiply, FMS and FMA. This adds implementations of the optabs for complex operations. With this, the following C code: void g (float complex a[restrict N], float complex b[restrict N], float complex c[restrict N]) { for (int i=0; i < N; i++) c[i] = a[i] * b[i]; } generates NEON: g: movi v3.4s, 0 mov x3, 0 .p2align 3,,7 .L2: mov v0.16b, v3.16b ldr q2, [x1, x3] ldr q1, [x0, x3] fcmla v0.4s, v1.4s, v2.4s, #0 fcmla v0.4s, v1.4s, v2.4s, #90 str q0, [x2, x3] add x3, x3, 16 cmp x3, 1600 bne .L2 ret SVE: g: mov x3, 0 mov x4, 400 ptrue p1.b, all whilelo p0.s, xzr, x4 mov z3.s, #0 .p2align 3,,7 .L2: ld1w z1.s, p0/z, [x0, x3, lsl 2] ld1w z2.s, p0/z, [x1, x3, lsl 2] movprfx z0, z3 fcmla z0.s, p1/m, z1.s, z2.s, #0 fcmla z0.s, p1/m, z1.s, z2.s, #90 st1w z0.s, p0, [x2, x3, lsl 2] incw x3 whilelo p0.s, x3, x4 b.any .L2 ret SVE2 (with int instead of float): g: mov x3, 0 mov x4, 400 mov z3.b, #0 whilelo p0.s, xzr, x4 .p2align 3,,7 .L2: ld1w z1.s, p0/z, [x0, x3, lsl 2] ld1w z2.s, p0/z, [x1, x3, lsl 2] movprfx z0, z3 cmla z0.s, z1.s, z2.s, #0 cmla z0.s, z1.s, z2.s, #90 st1w z0.s, p0, [x2, x3, lsl 2] incw x3 whilelo p0.s, x3, x4 b.any .L2 ret gcc/ChangeLog: * config/aarch64/aarch64-simd.md (cml4, cmul3): New. * config/aarch64/iterators.md (UNSPEC_FCMUL, UNSPEC_FCMUL_CONJ, UNSPEC_FCMLA_CONJ, UNSPEC_FCMLA180_CONJ, UNSPEC_CMLA_CONJ, UNSPEC_CMLA180_CONJ, UNSPEC_CMUL, UNSPEC_CMUL_CONJ, FCMLA_OP, FCMUL_OP, conj_op, rotsplit1, rotsplit2, fcmac1, sve_rot1, sve_rot2, SVE2_INT_CMLA_OP, SVE2_INT_CMUL_OP, SVE2_INT_CADD_OP): New. (rot): Add UNSPEC_FCMUL, UNSPEC_FCMUL_CONJ. (rot_op): Renamed to conj_op. * config/aarch64/aarch64-sve.md (cml4, cmul3): New. * config/aarch64/aarch64-sve2.md (cml4, cmul3): New. 
--- gcc/config/aarch64/aarch64-simd.md | 38 ++++++++++ gcc/config/aarch64/aarch64-sve.md | 56 +++++++++++++++ gcc/config/aarch64/aarch64-sve2.md | 42 +++++++++++ gcc/config/aarch64/iterators.md | 109 ++++++++++++++++++++++++++++- 4 files changed, 242 insertions(+), 3 deletions(-) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index eabc915c681..41071b668fd 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -516,6 +516,44 @@ [(set_attr "type" "neon_fcmla")] ) +;; The complex mla/mls operations always expand to two FCMLA instructions. +;; The first does half the computation; the second takes that result as its +;; addend and does the remainder. Because of this, expand early. +(define_expand "cml4" + [(set (match_operand:VHSDF 0 "register_operand") + (plus:VHSDF (match_operand:VHSDF 1 "register_operand") + (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand") + (match_operand:VHSDF 3 "register_operand")] + FCMLA_OP)))] + "TARGET_COMPLEX && !BYTES_BIG_ENDIAN" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_aarch64_fcmla (tmp, operands[1], + operands[3], operands[2])); + emit_insn (gen_aarch64_fcmla (operands[0], tmp, + operands[3], operands[2])); + DONE; +}) + +;; The complex mul operations always expand to two FCMLA instructions. +;; The first accumulates half the computation onto a zero vector and the +;; second does the remainder on top of that. Because of this, expand early. +(define_expand "cmul3" + [(set (match_operand:VHSDF 0 "register_operand") + (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand") + (match_operand:VHSDF 2 "register_operand")] + FCMUL_OP))] + "TARGET_COMPLEX && !BYTES_BIG_ENDIAN" +{ + rtx tmp = force_reg (mode, CONST0_RTX (mode)); + rtx res1 = gen_reg_rtx (mode); + emit_insn (gen_aarch64_fcmla (res1, tmp, + operands[2], operands[1])); + emit_insn (gen_aarch64_fcmla (operands[0], res1, + operands[2], operands[1])); + DONE; +}) + ;; These instructions map to the __builtins for the Dot Product operations. 
(define_insn "aarch64_dot" [(set (match_operand:VS 0 "register_operand" "=w") diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index da15bd87885..60831960031 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -7243,6 +7243,62 @@ [(set_attr "movprfx" "*,yes")] ) +;; Unpredicated optab pattern for the auto-vectorizer. +;; The complex mla/mls operations always expand to two predicated FCMLA +;; instructions under a ptrue predicate. The first does half the computation +;; and the second does the remainder. Because of this, expand early. +(define_expand "cml4" + [(set (match_operand:SVE_FULL_F 0 "register_operand") + (unspec:SVE_FULL_F + [(match_dup 4) + (match_dup 5) + (match_operand:SVE_FULL_F 1 "register_operand") + (match_operand:SVE_FULL_F 2 "register_operand") + (match_operand:SVE_FULL_F 3 "register_operand")] + FCMLA_OP))] + "TARGET_SVE" +{ + operands[4] = aarch64_ptrue_reg (mode); + operands[5] = gen_int_mode (SVE_RELAXED_GP, SImode); + rtx tmp = gen_reg_rtx (mode); + emit_insn + (gen_aarch64_pred_fcmla (tmp, operands[4], + operands[3], operands[2], + operands[1], operands[5])); + emit_insn + (gen_aarch64_pred_fcmla (operands[0], operands[4], + operands[3], operands[2], + tmp, operands[5])); + DONE; +}) + +;; Unpredicated optab pattern for the auto-vectorizer. +;; The complex mul operations always expand to two predicated FCMLA +;; instructions. The first accumulates half the computation onto zero and the +;; second does the remainder. Because of this, expand early. 
+(define_expand "cmul3" + [(set (match_operand:SVE_FULL_F 0 "register_operand") + (unspec:SVE_FULL_F + [(match_operand:SVE_FULL_F 1 "register_operand") + (match_operand:SVE_FULL_F 2 "register_operand")] + FCMUL_OP))] + "TARGET_SVE" +{ + rtx pred_reg = aarch64_ptrue_reg (mode); + rtx gp_mode = gen_int_mode (SVE_RELAXED_GP, SImode); + rtx accum = force_reg (mode, CONST0_RTX (mode)); + rtx tmp = gen_reg_rtx (mode); + emit_insn + (gen_aarch64_pred_fcmla (tmp, pred_reg, + operands[2], operands[1], + accum, gp_mode)); + emit_insn + (gen_aarch64_pred_fcmla (operands[0], pred_reg, + operands[2], operands[1], + tmp, gp_mode)); + DONE; +}) + ;; Predicated FCMLA with merging. (define_expand "@cond_" [(set (match_operand:SVE_FULL_F 0 "register_operand") diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index 5cb9144da98..e7cd2b86d25 100644 --- a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -1848,6 +1848,48 @@ [(set_attr "movprfx" "*,yes")] ) +;; Unpredicated optab pattern for the auto-vectorizer. +;; The complex mla/mls operations always expand to two CMLA instructions. +;; The first does half the computation; the second takes that result as its +;; addend and does the remainder. Because of this, expand early. +(define_expand "cml4" + [(set (match_operand:SVE_FULL_I 0 "register_operand") + (plus:SVE_FULL_I (match_operand:SVE_FULL_I 1 "register_operand") + (unspec:SVE_FULL_I + [(match_operand:SVE_FULL_I 2 "register_operand") + (match_operand:SVE_FULL_I 3 "register_operand")] + SVE2_INT_CMLA_OP)))] + "TARGET_SVE2" +{ + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_aarch64_sve_cmla (tmp, operands[1], + operands[3], operands[2])); + emit_insn (gen_aarch64_sve_cmla (operands[0], tmp, + operands[3], operands[2])); + DONE; +}) + +;; Unpredicated optab pattern for the auto-vectorizer. +;; The complex mul operations always expand to two CMLA instructions. +;; The first accumulates half the computation onto zero and the second does +;; the remainder. 
Because of this, expand early. +(define_expand "cmul3" + [(set (match_operand:SVE_FULL_I 0 "register_operand") + (unspec:SVE_FULL_I + [(match_operand:SVE_FULL_I 1 "register_operand") + (match_operand:SVE_FULL_I 2 "register_operand")] + SVE2_INT_CMUL_OP))] + "TARGET_SVE2" +{ + rtx accum = force_reg (mode, CONST0_RTX (mode)); + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_aarch64_sve_cmla (tmp, accum, + operands[2], operands[1])); + emit_insn (gen_aarch64_sve_cmla (operands[0], tmp, + operands[2], operands[1])); + DONE; +}) + ;; ------------------------------------------------------------------------- ;; ---- [INT] Complex dot product ;; ------------------------------------------------------------------------- diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index d42a70653ed..b64d77037af 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -712,6 +712,10 @@ UNSPEC_FCMLA90 ; Used in aarch64-simd.md. UNSPEC_FCMLA180 ; Used in aarch64-simd.md. UNSPEC_FCMLA270 ; Used in aarch64-simd.md. + UNSPEC_FCMUL ; Used in aarch64-simd.md. + UNSPEC_FCMUL_CONJ ; Used in aarch64-simd.md. + UNSPEC_FCMLA_CONJ ; Used in aarch64-simd.md. + UNSPEC_FCMLA180_CONJ ; Used in aarch64-simd.md. UNSPEC_ASRD ; Used in aarch64-sve.md. UNSPEC_ADCLB ; Used in aarch64-sve2.md. UNSPEC_ADCLT ; Used in aarch64-sve2.md. @@ -730,6 +734,10 @@ UNSPEC_CMLA180 ; Used in aarch64-sve2.md. UNSPEC_CMLA270 ; Used in aarch64-sve2.md. UNSPEC_CMLA90 ; Used in aarch64-sve2.md. + UNSPEC_CMLA_CONJ ; Used in aarch64-sve2.md. + UNSPEC_CMLA180_CONJ ; Used in aarch64-sve2.md. + UNSPEC_CMUL ; Used in aarch64-sve2.md. + UNSPEC_CMUL_CONJ ; Used in aarch64-sve2.md. UNSPEC_COND_FCVTLT ; Used in aarch64-sve2.md. UNSPEC_COND_FCVTNT ; Used in aarch64-sve2.md. UNSPEC_COND_FCVTX ; Used in aarch64-sve2.md. @@ -1291,7 +1299,7 @@ ;; Widened mode register suffixes for VD_BHSI/VQW/VQ_HSF. 
(define_mode_attr Vwtype [(V8QI "8h") (V4HI "4s") - (V2SI "2d") (V16QI "8h") + (V2SI "2d") (V16QI "8h") (V8HI "4s") (V4SI "2d") (V8HF "4s") (V4SF "2d")]) @@ -1313,7 +1321,7 @@ ;; Widened mode register suffixes for VDW/VQW. (define_mode_attr Vmwtype [(V8QI ".8h") (V4HI ".4s") - (V2SI ".2d") (V16QI ".8h") + (V2SI ".2d") (V16QI ".8h") (V8HI ".4s") (V4SI ".2d") (V4HF ".4s") (V2SF ".2d") (SI "") (HI "")]) @@ -2611,6 +2619,20 @@ UNSPEC_SQRDCMLAH180 UNSPEC_SQRDCMLAH270]) +;; Unlike the normal CMLA instructions these represent the actual operation +;; to be performed. They will always need to be expanded into a sequence of +;; multiple CMLA instructions. +(define_int_iterator SVE2_INT_CMLA_OP [UNSPEC_CMLA + UNSPEC_CMLA_CONJ + UNSPEC_CMLA180 + UNSPEC_CMLA180_CONJ]) + +;; Unlike the normal CMLA instructions these represent the actual operation +;; to be performed. They will always need to be expanded into a sequence of +;; multiple CMLA instructions. +(define_int_iterator SVE2_INT_CMUL_OP [UNSPEC_CMUL + UNSPEC_CMUL_CONJ]) + ;; Same as SVE2_INT_CADD but exclude the saturating instructions (define_int_iterator SVE2_INT_CADD_OP [UNSPEC_CADD90 UNSPEC_CADD270]) @@ -2725,6 +2747,14 @@ (define_int_iterator BF_MLA [UNSPEC_BFMLALB UNSPEC_BFMLALT]) +(define_int_iterator FCMLA_OP [UNSPEC_FCMLA + UNSPEC_FCMLA180 + UNSPEC_FCMLA_CONJ + UNSPEC_FCMLA180_CONJ]) + +(define_int_iterator FCMUL_OP [UNSPEC_FCMUL + UNSPEC_FCMUL_CONJ]) + ;; Iterators for atomic operations. (define_int_iterator ATOMIC_LDOP @@ -3435,7 +3465,80 @@ (UNSPEC_COND_FCMLA "0") (UNSPEC_COND_FCMLA90 "90") (UNSPEC_COND_FCMLA180 "180") - (UNSPEC_COND_FCMLA270 "270")]) + (UNSPEC_COND_FCMLA270 "270") + (UNSPEC_FCMUL "0") + (UNSPEC_FCMUL_CONJ "180")]) + +;; A conjugate is the negation of the imaginary component. +;; The number in an unspec name is the rotation component of the instruction, +;; e.g. FCMLA180 means use the instruction with #180. +;; The attribute is used to produce the right name mangling for the function. 
+(define_int_attr conj_op [(UNSPEC_FCMLA180 "") + (UNSPEC_FCMLA180_CONJ "_conj") + (UNSPEC_FCMLA "") + (UNSPEC_FCMLA_CONJ "_conj") + (UNSPEC_FCMUL "") + (UNSPEC_FCMUL_CONJ "_conj") + (UNSPEC_CMLA "") + (UNSPEC_CMLA180 "") + (UNSPEC_CMLA180_CONJ "_conj") + (UNSPEC_CMLA_CONJ "_conj") + (UNSPEC_CMUL "") + (UNSPEC_CMUL_CONJ "_conj")]) + +;; Performing a complex operation on a complex number requires two +;; instructions, e.g. complex multiplication requires two FCMLA instructions, +;; each with a particular rotation value. +;; +;; These values can be looked up in rotsplit1 and rotsplit2. As an example, +;; FCMUL needs the first instruction to use #0 and the second #90. +(define_int_attr rotsplit1 [(UNSPEC_FCMLA "0") + (UNSPEC_FCMLA_CONJ "0") + (UNSPEC_FCMUL "0") + (UNSPEC_FCMUL_CONJ "0") + (UNSPEC_FCMLA180 "180") + (UNSPEC_FCMLA180_CONJ "180")]) + +(define_int_attr rotsplit2 [(UNSPEC_FCMLA "90") + (UNSPEC_FCMLA_CONJ "270") + (UNSPEC_FCMUL "90") + (UNSPEC_FCMUL_CONJ "270") + (UNSPEC_FCMLA180 "270") + (UNSPEC_FCMLA180_CONJ "90")]) + +;; SVE has slightly different naming from NEON (e.g. an empty string rather +;; than "0" for the first rotation), so these attributes are split out. 
+(define_int_attr sve_rot1 [(UNSPEC_FCMLA "") + (UNSPEC_FCMLA_CONJ "") + (UNSPEC_FCMUL "") + (UNSPEC_FCMUL_CONJ "") + (UNSPEC_FCMLA180 "180") + (UNSPEC_FCMLA180_CONJ "180") + (UNSPEC_CMLA "") + (UNSPEC_CMLA_CONJ "") + (UNSPEC_CMUL "") + (UNSPEC_CMUL_CONJ "") + (UNSPEC_CMLA180 "180") + (UNSPEC_CMLA180_CONJ "180")]) + +(define_int_attr sve_rot2 [(UNSPEC_FCMLA "90") + (UNSPEC_FCMLA_CONJ "270") + (UNSPEC_FCMUL "90") + (UNSPEC_FCMUL_CONJ "270") + (UNSPEC_FCMLA180 "270") + (UNSPEC_FCMLA180_CONJ "90") + (UNSPEC_CMLA "90") + (UNSPEC_CMLA_CONJ "270") + (UNSPEC_CMUL "90") + (UNSPEC_CMUL_CONJ "270") + (UNSPEC_CMLA180 "270") + (UNSPEC_CMLA180_CONJ "90")]) + +;; "a" for the plain unspecs, "s" for the 180 ones (presumably cmla vs. cmls). +(define_int_attr fcmac1 [(UNSPEC_FCMLA "a") (UNSPEC_FCMLA_CONJ "a") + (UNSPEC_FCMLA180 "s") (UNSPEC_FCMLA180_CONJ "s") + (UNSPEC_CMLA "a") (UNSPEC_CMLA_CONJ "a") + (UNSPEC_CMLA180 "s") (UNSPEC_CMLA180_CONJ "s")]) (define_int_attr sve_fmla_op [(UNSPEC_COND_FMLA "fmla") (UNSPEC_COND_FMLS "fmls") -- 2.30.2