From: Tamar Christina
Date: Mon, 25 Jan 2021 08:56:37 +0000 (+0000)
Subject: Arm: Add NEON and MVE complex mul, mla and mls patterns.
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=389b67feac78c8f21c6946bf8e36a16060f45728;p=gcc.git

Arm: Add NEON and MVE complex mul, mla and mls patterns.

This adds implementation for the optabs for complex operations.  With this the
following C code:

  void g (float complex a[restrict N], float complex b[restrict N],
          float complex c[restrict N])
  {
    for (int i=0; i < N; i++)
      c[i] = a[i] * b[i];
  }

generates

NEON:

g:
        vmov.f32 q11, #0.0 @ v4sf
        add r3, r2, #1600
.L2:
        vmov q8, q11 @ v4sf
        vld1.32 {q10}, [r1]!
        vld1.32 {q9}, [r0]!
        vcmla.f32 q8, q9, q10, #0
        vcmla.f32 q8, q9, q10, #90
        vst1.32 {q8}, [r2]!
        cmp r3, r2
        bne .L2
        bx lr

MVE:

g:
        push {lr}
        mov lr, #100
        dls lr, lr
.L2:
        vldrw.32 q1, [r1], #16
        vldrw.32 q2, [r0], #16
        vcmul.f32 q3, q2, q1, #0
        vcmla.f32 q3, q2, q1, #90
        vstrw.32 q3, [r2], #16
        le lr, .L2
        ldr pc, [sp], #4

instead of

g:
        add r3, r2, #1600
.L2:
        vld2.32 {d20-d23}, [r0]!
        vld2.32 {d16-d19}, [r1]!
        vmul.f32 q14, q11, q9
        vmul.f32 q15, q11, q8
        vneg.f32 q14, q14
        vfma.f32 q15, q10, q9
        vfma.f32 q14, q10, q8
        vmov q13, q15 @ v4sf
        vmov q12, q14 @ v4sf
        vst2.32 {d24-d27}, [r2]!
        cmp r3, r2
        bne .L2
        bx lr

and

g:
        add r3, r2, #1600
.L2:
        vld2.32 {d20-d23}, [r0]!
        vld2.32 {d16-d19}, [r1]!
        vmul.f32 q15, q10, q8
        vmul.f32 q14, q10, q9
        vmls.f32 q15, q11, q9
        vmla.f32 q14, q11, q8
        vmov q12, q15 @ v4sf
        vmov q13, q14 @ v4sf
        vst2.32 {d24-d27}, [r2]!
        cmp r3, r2
        bne .L2
        bx lr

respectively.

gcc/ChangeLog:

        * config/arm/iterators.md (rotsplit1, rotsplit2, conj_op, fcmac1,
        VCMLA_OP, VCMUL_OP): New.
        * config/arm/mve.md (mve_vcmlaq<mve_rot><mode>): Support vec_dup 0.
        * config/arm/neon.md (cmul<conj_op><mode>3): New.
        * config/arm/unspecs.md (UNSPEC_VCMLA_CONJ, UNSPEC_VCMLA180_CONJ,
        UNSPEC_VCMUL_CONJ): New.
        * config/arm/vec-common.md (cmul<conj_op><mode>3, arm_vcmla<rot><mode>,
        cml<fcmac1><conj_op><mode>4): New.
---

Reviewer note (not part of the commit): names written as <mode>, <MODE>,
<rot>, <rotsplit1>, <rotsplit2>, <conj_op>, <fcmac1>, <mve_rot> and
<V_sz_elem> are machine-description iterator substitutions.  rotsplit1 and
rotsplit2 give the rotation of the first and second instruction of each
two-instruction expansion; conj_op and fcmac1 select the pattern-name suffix.
Indentation inside the hunks is rendered with spaces and may differ from the
upstream files; re-fetch the commit before applying it with git.

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 2e0aacbd3f7..b9027905307 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -1186,6 +1186,33 @@
                       (UNSPEC_VCMLA180 "180")
                       (UNSPEC_VCMLA270 "270")])
 
+;; The complex operations when performed on a real complex number require two
+;; instructions to perform the operation. e.g. complex multiplication requires
+;; two VCMUL with a particular rotation value.
+;;
+;; These values can be looked up in rotsplit1 and rotsplit2. as an example
+;; VCMUL needs the first instruction to use #0 and the second #90.
+(define_int_attr rotsplit1 [(UNSPEC_VCMLA "0")
+                            (UNSPEC_VCMLA_CONJ "0")
+                            (UNSPEC_VCMUL "0")
+                            (UNSPEC_VCMUL_CONJ "0")
+                            (UNSPEC_VCMLA180 "180")
+                            (UNSPEC_VCMLA180_CONJ "180")])
+
+(define_int_attr rotsplit2 [(UNSPEC_VCMLA "90")
+                            (UNSPEC_VCMLA_CONJ "270")
+                            (UNSPEC_VCMUL "90")
+                            (UNSPEC_VCMUL_CONJ "270")
+                            (UNSPEC_VCMLA180 "270")
+                            (UNSPEC_VCMLA180_CONJ "90")])
+
+(define_int_attr conj_op [(UNSPEC_VCMLA180 "")
+                          (UNSPEC_VCMLA180_CONJ "_conj")
+                          (UNSPEC_VCMLA "")
+                          (UNSPEC_VCMLA_CONJ "_conj")
+                          (UNSPEC_VCMUL "")
+                          (UNSPEC_VCMUL_CONJ "_conj")])
+
 (define_int_attr mve_rot [(UNSPEC_VCADD90 "_rot90")
                           (UNSPEC_VCADD270 "_rot270")
                           (UNSPEC_VCMLA "")
@@ -1200,6 +1227,9 @@
 (define_int_iterator VCMUL [UNSPEC_VCMUL UNSPEC_VCMUL90
                             UNSPEC_VCMUL180 UNSPEC_VCMUL270])
 
+(define_int_attr fcmac1 [(UNSPEC_VCMLA "a") (UNSPEC_VCMLA_CONJ "a")
+                         (UNSPEC_VCMLA180 "s") (UNSPEC_VCMLA180_CONJ "s")])
+
 (define_int_attr simd32_op [(UNSPEC_QADD8 "qadd8") (UNSPEC_QSUB8 "qsub8")
                             (UNSPEC_SHADD8 "shadd8") (UNSPEC_SHSUB8 "shsub8")
                             (UNSPEC_UHADD8 "uhadd8") (UNSPEC_UHSUB8 "uhsub8")
@@ -1723,3 +1753,13 @@
 (define_int_iterator UQRSHLLQ [UQRSHLL_64 UQRSHLL_48])
 (define_int_iterator SQRSHRLQ [SQRSHRL_64 SQRSHRL_48])
 (define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U])
+
+;; Define iterators for VCMLA operations
+(define_int_iterator VCMLA_OP [UNSPEC_VCMLA
+                               UNSPEC_VCMLA_CONJ
+                               UNSPEC_VCMLA180
+                               UNSPEC_VCMLA180_CONJ])
+
+;; Define iterators for VCMLA operations as MUL
+(define_int_iterator VCMUL_OP [UNSPEC_VCMUL
+                               UNSPEC_VCMUL_CONJ])
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 62ff12365ab..465f71c4eee 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -4101,15 +4101,16 @@
 (define_insn "mve_vcmlaq<mve_rot><mode>"
   [
    (set (match_operand:MVE_0 0 "s_register_operand" "=w,w")
-        (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0,Dz")
-                       (match_operand:MVE_0 2 "s_register_operand" "w,w")
-                       (match_operand:MVE_0 3 "s_register_operand" "w,w")]
-         VCMLA))
+        (plus:MVE_0 (match_operand:MVE_0 1 "reg_or_zero_operand" "Dz,0")
+                    (unspec:MVE_0
+                        [(match_operand:MVE_0 2 "s_register_operand" "w,w")
+                         (match_operand:MVE_0 3 "s_register_operand" "w,w")]
+                     VCMLA)))
   ]
   "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
   "@
-   vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>
-   vcmul.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>"
+   vcmul.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>
+   vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>"
   [(set_attr "type" "mve_move")
 ])
 
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index e904db97ea7..fec2cc91d24 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -2952,6 +2952,25 @@
   [(set_attr "type" "neon_fcmla")]
 )
 
+;; The complex mul operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder. Because of this, expand early.
+(define_expand "cmul<conj_op><mode>3"
+  [(set (match_operand:VDF 0 "register_operand")
+        (unspec:VDF [(match_operand:VDF 1 "register_operand")
+                     (match_operand:VDF 2 "register_operand")]
+                    VCMUL_OP))]
+  "TARGET_COMPLEX && !BYTES_BIG_ENDIAN"
+{
+  rtx res1 = gen_reg_rtx (<MODE>mode);
+  rtx tmp = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+  emit_insn (gen_neon_vcmla<rotsplit1><mode> (res1, tmp,
+                                              operands[2], operands[1]));
+  emit_insn (gen_neon_vcmla<rotsplit2><mode> (operands[0], res1,
+                                              operands[2], operands[1]));
+  DONE;
+})
+
 
 ;; These instructions map to the __builtins for the Dot Product operations.
 (define_insn "neon_<sup>dot<vsi2qi>"
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 97a803e8da5..c6ebb6fc2b6 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -510,10 +510,13 @@
   UNSPEC_VCMLA90
   UNSPEC_VCMLA180
   UNSPEC_VCMLA270
+  UNSPEC_VCMLA_CONJ
+  UNSPEC_VCMLA180_CONJ
   UNSPEC_VCMUL
   UNSPEC_VCMUL90
   UNSPEC_VCMUL180
   UNSPEC_VCMUL270
+  UNSPEC_VCMUL_CONJ
   UNSPEC_MATMUL_S
   UNSPEC_MATMUL_U
   UNSPEC_MATMUL_US
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index ff448da126b..692b28ea8cc 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -215,6 +215,63 @@
                       && ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN"
 )
 
+;; The complex mul operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder. Because of this, expand early.
+(define_expand "cmul<conj_op><mode>3"
+  [(set (match_operand:VQ_HSF 0 "register_operand")
+        (unspec:VQ_HSF [(match_operand:VQ_HSF 1 "register_operand")
+                        (match_operand:VQ_HSF 2 "register_operand")]
+                       VCMUL_OP))]
+  "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT))
+   && !BYTES_BIG_ENDIAN"
+{
+  rtx res1 = gen_reg_rtx (<MODE>mode);
+  if (TARGET_COMPLEX)
+    {
+      rtx tmp = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+      emit_insn (gen_arm_vcmla<rotsplit1><mode> (res1, tmp,
+                                                 operands[2], operands[1]));
+    }
+  else
+    emit_insn (gen_arm_vcmla<rotsplit1><mode> (res1, CONST0_RTX (<MODE>mode),
+                                               operands[2], operands[1]));
+
+  emit_insn (gen_arm_vcmla<rotsplit2><mode> (operands[0], res1,
+                                             operands[2], operands[1]));
+  DONE;
+})
+
+(define_expand "arm_vcmla<rot><mode>"
+  [(set (match_operand:VF 0 "register_operand")
+        (plus:VF (match_operand:VF 1 "register_operand")
+                 (unspec:VF [(match_operand:VF 2 "register_operand")
+                             (match_operand:VF 3 "register_operand")]
+                            VCMLA)))]
+  "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT
+                      && ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN"
+)
+
+;; The complex mla/mls operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder. Because of this, expand early.
+(define_expand "cml<fcmac1><conj_op><mode>4"
+  [(set (match_operand:VF 0 "register_operand")
+        (plus:VF (match_operand:VF 1 "register_operand")
+                 (unspec:VF [(match_operand:VF 2 "register_operand")
+                             (match_operand:VF 3 "register_operand")]
+                            VCMLA_OP)))]
+  "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT
+                      && ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN"
+{
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_arm_vcmla<rotsplit1><mode> (tmp, operands[1],
+                                             operands[3], operands[2]));
+  emit_insn (gen_arm_vcmla<rotsplit2><mode> (operands[0], tmp,
+                                             operands[3], operands[2]));
+  DONE;
+})
+
 (define_expand "movmisalign<mode>"
  [(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand")
         (unspec:VDQX [(match_operand:VDQX 1 "neon_perm_struct_or_reg_operand")]