From 9d63f43b2d6ac073164a43116b4cb11d7d188ff1 Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Thu, 10 Jan 2019 03:30:59 +0000 Subject: [PATCH] aarch64-builtins.c (enum aarch64_type_qualifiers): Add qualifier_lane_pair_index. gcc/ChangeLog: 2019-01-10 Tamar Christina * config/aarch64/aarch64-builtins.c (enum aarch64_type_qualifiers): Add qualifier_lane_pair_index. (emit-rtl.h): Include. (TYPES_QUADOP_LANE_PAIR): New. (aarch64_simd_expand_args): Use it. (aarch64_simd_expand_builtin): Likewise. (AARCH64_SIMD_FCMLA_LANEQ_BUILTINS, aarch64_fcmla_laneq_builtin_datum): New. (FCMLA_LANEQ_BUILTIN, AARCH64_SIMD_FCMLA_LANEQ_BUILTIN_BASE, AARCH64_SIMD_FCMLA_LANEQ_BUILTINS, aarch64_fcmla_lane_builtin_data, aarch64_init_fcmla_laneq_builtins, aarch64_expand_fcmla_builtin): New. (aarch64_init_builtins): Add aarch64_init_fcmla_laneq_builtins. (aarch64_expand_builtin): Add AARCH64_SIMD_BUILTIN_FCMLA_LANEQ0_V2SF, AARCH64_SIMD_BUILTIN_FCMLA_LANEQ90_V2SF, AARCH64_SIMD_BUILTIN_FCMLA_LANEQ180_V2SF, AARCH64_SIMD_BUILTIN_FCMLA_LANEQ270_V2SF, AARCH64_SIMD_BUILTIN_FCMLA_LANEQ0_V4HF, AARCH64_SIMD_BUILTIN_FCMLA_LANEQ90_V4HF, AARCH64_SIMD_BUILTIN_FCMLA_LANEQ180_V4HF, AARCH64_SIMD_BUILTIN_FCMLA_LANEQ270_V4HF. * config/aarch64/iterators.md (FCMLA_maybe_lane): New. * config/aarch64/aarch64-c.c (aarch64_update_cpp_builtins): Add __ARM_FEATURE_COMPLEX. * config/aarch64/aarch64-simd-builtins.def (fcadd90, fcadd270, fcmla0, fcmla90, fcmla180, fcmla270, fcmla_lane0, fcmla_lane90, fcmla_lane180, fcmla_lane270, fcmla_laneq0, fcmla_laneq90, fcmla_laneq180, fcmla_laneq270, fcmlaq_lane0, fcmlaq_lane90, fcmlaq_lane180, fcmlaq_lane270): New. * config/aarch64/aarch64-simd.md (aarch64_fcmla_lane<rot><mode>, aarch64_fcmla_laneq<rot>v4hf, aarch64_fcmlaq_lane<rot><mode>, aarch64_fcadd<rot><mode>, aarch64_fcmla<rot><mode>): New. * config/aarch64/arm_neon.h: (vcadd_rot90_f16): New. (vcaddq_rot90_f16): New. (vcadd_rot270_f16): New. (vcaddq_rot270_f16): New. (vcmla_f16): New. (vcmlaq_f16): New. (vcmla_lane_f16): New. (vcmla_laneq_f16): New. (vcmlaq_lane_f16): New. (vcmlaq_rot90_lane_f16): New. (vcmla_rot90_laneq_f16): New. (vcmla_rot90_lane_f16): New. (vcmlaq_rot90_f16): New. (vcmla_rot90_f16): New. (vcmlaq_laneq_f16): New. (vcmla_rot180_laneq_f16): New. (vcmla_rot180_lane_f16): New. (vcmlaq_rot180_f16): New. (vcmla_rot180_f16): New. (vcmlaq_rot90_laneq_f16): New. (vcmlaq_rot270_laneq_f16): New. (vcmlaq_rot270_lane_f16): New. (vcmla_rot270_laneq_f16): New. (vcmlaq_rot270_f16): New. (vcmla_rot270_f16): New. (vcmlaq_rot180_laneq_f16): New. (vcmlaq_rot180_lane_f16): New. (vcmla_rot270_lane_f16): New. (vcadd_rot90_f32): New. (vcaddq_rot90_f32): New. (vcaddq_rot90_f64): New. (vcadd_rot270_f32): New. (vcaddq_rot270_f32): New. (vcaddq_rot270_f64): New. (vcmla_f32): New. (vcmlaq_f32): New. (vcmlaq_f64): New. (vcmla_lane_f32): New. (vcmla_laneq_f32): New. (vcmlaq_lane_f32): New. (vcmlaq_laneq_f32): New. (vcmla_rot90_f32): New. (vcmlaq_rot90_f32): New. (vcmlaq_rot90_f64): New. (vcmla_rot90_lane_f32): New. (vcmla_rot90_laneq_f32): New. (vcmlaq_rot90_lane_f32): New. (vcmlaq_rot90_laneq_f32): New. (vcmla_rot180_f32): New. (vcmlaq_rot180_f32): New. (vcmlaq_rot180_f64): New. (vcmla_rot180_lane_f32): New. (vcmla_rot180_laneq_f32): New. (vcmlaq_rot180_lane_f32): New. (vcmlaq_rot180_laneq_f32): New. (vcmla_rot270_f32): New. (vcmlaq_rot270_f32): New. (vcmlaq_rot270_f64): New. (vcmla_rot270_lane_f32): New. (vcmla_rot270_laneq_f32): New. (vcmlaq_rot270_lane_f32): New. (vcmlaq_rot270_laneq_f32): New. * config/aarch64/aarch64.h (TARGET_COMPLEX): New. 
* config/aarch64/iterators.md (UNSPEC_FCADD90, UNSPEC_FCADD270, UNSPEC_FCMLA, UNSPEC_FCMLA90, UNSPEC_FCMLA180, UNSPEC_FCMLA270): New. (FCADD, FCMLA): New. (rot): New. * config/arm/types.md (neon_fcadd, neon_fcmla): New. gcc/testsuite/ChangeLog: 2019-01-10 Tamar Christina * gcc.target/aarch64/advsimd-intrinsics/vector-complex.c: New test. * gcc.target/aarch64/advsimd-intrinsics/vector-complex_f16.c: New test. From-SVN: r267795 --- gcc/ChangeLog | 96 ++++ gcc/config/aarch64/aarch64-builtins.c | 186 ++++++- gcc/config/aarch64/aarch64-c.c | 1 + gcc/config/aarch64/aarch64-simd-builtins.def | 19 + gcc/config/aarch64/aarch64-simd.md | 64 +++ gcc/config/aarch64/aarch64.h | 3 + gcc/config/aarch64/arm_neon.h | 475 ++++++++++++++++++ gcc/config/aarch64/iterators.md | 28 ++ gcc/config/arm/types.md | 3 + gcc/testsuite/ChangeLog | 5 + .../advsimd-intrinsics/vector-complex.c | 251 +++++++++ .../advsimd-intrinsics/vector-complex_f16.c | 306 +++++++++++ 12 files changed, 1435 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vector-complex.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vector-complex_f16.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index e04c52a0133..f80cad99048 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,99 @@ +2019-01-10 Tamar Christina + + * config/aarch64/aarch64-builtins.c (enum aarch64_type_qualifiers): Add qualifier_lane_pair_index. + (emit-rtl.h): Include. + (TYPES_QUADOP_LANE_PAIR): New. + (aarch64_simd_expand_args): Use it. + (aarch64_simd_expand_builtin): Likewise. + (AARCH64_SIMD_FCMLA_LANEQ_BUILTINS, aarch64_fcmla_laneq_builtin_datum): New. + (FCMLA_LANEQ_BUILTIN, AARCH64_SIMD_FCMLA_LANEQ_BUILTIN_BASE, + AARCH64_SIMD_FCMLA_LANEQ_BUILTINS, aarch64_fcmla_lane_builtin_data, + aarch64_init_fcmla_laneq_builtins, aarch64_expand_fcmla_builtin): New. + (aarch64_init_builtins): Add aarch64_init_fcmla_laneq_builtins. + (aarch64_expand_builtin): Add AARCH64_SIMD_BUILTIN_FCMLA_LANEQ0_V2SF, + AARCH64_SIMD_BUILTIN_FCMLA_LANEQ90_V2SF, AARCH64_SIMD_BUILTIN_FCMLA_LANEQ180_V2SF, + AARCH64_SIMD_BUILTIN_FCMLA_LANEQ270_V2SF, AARCH64_SIMD_BUILTIN_FCMLA_LANEQ0_V4HF, + AARCH64_SIMD_BUILTIN_FCMLA_LANEQ90_V4HF, AARCH64_SIMD_BUILTIN_FCMLA_LANEQ180_V4HF, + AARCH64_SIMD_BUILTIN_FCMLA_LANEQ270_V4HF. + * config/aarch64/iterators.md (FCMLA_maybe_lane): New. + * config/aarch64/aarch64-c.c (aarch64_update_cpp_builtins): Add __ARM_FEATURE_COMPLEX. + * config/aarch64/aarch64-simd-builtins.def (fcadd90, fcadd270, fcmla0, fcmla90, + fcmla180, fcmla270, fcmla_lane0, fcmla_lane90, fcmla_lane180, fcmla_lane270, + fcmla_laneq0, fcmla_laneq90, fcmla_laneq180, fcmla_laneq270, + fcmlaq_lane0, fcmlaq_lane90, fcmlaq_lane180, fcmlaq_lane270): New. + * config/aarch64/aarch64-simd.md (aarch64_fcmla_lane<rot><mode>, + aarch64_fcmla_laneq<rot>v4hf, aarch64_fcmlaq_lane<rot><mode>, aarch64_fcadd<rot><mode>, + aarch64_fcmla<rot><mode>): New. + * config/aarch64/arm_neon.h: + (vcadd_rot90_f16): New. + (vcaddq_rot90_f16): New. + (vcadd_rot270_f16): New. + (vcaddq_rot270_f16): New. + (vcmla_f16): New. + (vcmlaq_f16): New. + (vcmla_lane_f16): New. + (vcmla_laneq_f16): New. + (vcmlaq_lane_f16): New. + (vcmlaq_rot90_lane_f16): New. + (vcmla_rot90_laneq_f16): New. + (vcmla_rot90_lane_f16): New. + (vcmlaq_rot90_f16): New. + (vcmla_rot90_f16): New. + (vcmlaq_laneq_f16): New. + (vcmla_rot180_laneq_f16): New. + (vcmla_rot180_lane_f16): New. + (vcmlaq_rot180_f16): New. + (vcmla_rot180_f16): New. + (vcmlaq_rot90_laneq_f16): New. + (vcmlaq_rot270_laneq_f16): New. + (vcmlaq_rot270_lane_f16): New. 
+ (vcmla_rot270_laneq_f16): New. + (vcmlaq_rot270_f16): New. + (vcmla_rot270_f16): New. + (vcmlaq_rot180_laneq_f16): New. + (vcmlaq_rot180_lane_f16): New. + (vcmla_rot270_lane_f16): New. + (vcadd_rot90_f32): New. + (vcaddq_rot90_f32): New. + (vcaddq_rot90_f64): New. + (vcadd_rot270_f32): New. + (vcaddq_rot270_f32): New. + (vcaddq_rot270_f64): New. + (vcmla_f32): New. + (vcmlaq_f32): New. + (vcmlaq_f64): New. + (vcmla_lane_f32): New. + (vcmla_laneq_f32): New. + (vcmlaq_lane_f32): New. + (vcmlaq_laneq_f32): New. + (vcmla_rot90_f32): New. + (vcmlaq_rot90_f32): New. + (vcmlaq_rot90_f64): New. + (vcmla_rot90_lane_f32): New. + (vcmla_rot90_laneq_f32): New. + (vcmlaq_rot90_lane_f32): New. + (vcmlaq_rot90_laneq_f32): New. + (vcmla_rot180_f32): New. + (vcmlaq_rot180_f32): New. + (vcmlaq_rot180_f64): New. + (vcmla_rot180_lane_f32): New. + (vcmla_rot180_laneq_f32): New. + (vcmlaq_rot180_lane_f32): New. + (vcmlaq_rot180_laneq_f32): New. + (vcmla_rot270_f32): New. + (vcmlaq_rot270_f32): New. + (vcmlaq_rot270_f64): New. + (vcmla_rot270_lane_f32): New. + (vcmla_rot270_laneq_f32): New. + (vcmlaq_rot270_lane_f32): New. + (vcmlaq_rot270_laneq_f32): New. + * config/aarch64/aarch64.h (TARGET_COMPLEX): New. + * config/aarch64/iterators.md (UNSPEC_FCADD90, UNSPEC_FCADD270, + UNSPEC_FCMLA, UNSPEC_FCMLA90, UNSPEC_FCMLA180, UNSPEC_FCMLA270): New. + (FCADD, FCMLA): New. + (rot): New. + * config/arm/types.md (neon_fcadd, neon_fcmla): New. + 2019-01-09 Sandra Loosemore PR other/16615 diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c index e2d24168465..df0e035e39a 100644 --- a/gcc/config/aarch64/aarch64-builtins.c +++ b/gcc/config/aarch64/aarch64-builtins.c @@ -42,6 +42,7 @@ #include "langhooks.h" #include "gimple-iterator.h" #include "case-cfn-macros.h" +#include "emit-rtl.h" #define v8qi_UP E_V8QImode #define v4hi_UP E_V4HImode @@ -102,7 +103,10 @@ enum aarch64_type_qualifiers /* Lane indices - must be in range, and flipped for bigendian. */ qualifier_lane_index = 0x200, /* Lane indices for single lane structure loads and stores. */ - qualifier_struct_load_store_lane_index = 0x400 + qualifier_struct_load_store_lane_index = 0x400, + /* Lane indices selected in pairs - must be in range, and flipped for + bigendian. */ + qualifier_lane_pair_index = 0x800, }; typedef struct @@ -171,6 +175,11 @@ aarch64_types_ternopu_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS] #define TYPES_TERNOPUI (aarch64_types_ternopu_imm_qualifiers) +static enum aarch64_type_qualifiers +aarch64_types_quadop_lane_pair_qualifiers[SIMD_MAX_BUILTIN_ARGS] + = { qualifier_none, qualifier_none, qualifier_none, + qualifier_none, qualifier_lane_pair_index }; +#define TYPES_QUADOP_LANE_PAIR (aarch64_types_quadop_lane_pair_qualifiers) static enum aarch64_type_qualifiers aarch64_types_quadop_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS] = { qualifier_none, qualifier_none, qualifier_none, @@ -356,6 +365,18 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = { CRC32_BUILTIN (crc32cw, SI) \ CRC32_BUILTIN (crc32cx, DI) +/* The next 8 FCMLA intrinsics require some special handling compared to the + normal simd intrinsics. 
*/ +#define AARCH64_SIMD_FCMLA_LANEQ_BUILTINS \ + FCMLA_LANEQ_BUILTIN (0, v2sf, fcmla, V2SF, false) \ + FCMLA_LANEQ_BUILTIN (90, v2sf, fcmla, V2SF, false) \ + FCMLA_LANEQ_BUILTIN (180, v2sf, fcmla, V2SF, false) \ + FCMLA_LANEQ_BUILTIN (270, v2sf, fcmla, V2SF, false) \ + FCMLA_LANEQ_BUILTIN (0, v4hf, fcmla_laneq, V4HF, true) \ + FCMLA_LANEQ_BUILTIN (90, v4hf, fcmla_laneq, V4HF, true) \ + FCMLA_LANEQ_BUILTIN (180, v4hf, fcmla_laneq, V4HF, true) \ + FCMLA_LANEQ_BUILTIN (270, v4hf, fcmla_laneq, V4HF, true) \ + typedef struct { const char *name; @@ -364,9 +385,22 @@ typedef struct unsigned int fcode; } aarch64_crc_builtin_datum; +/* Hold information about how to expand the FCMLA_LANEQ builtins. */ +typedef struct +{ + const char *name; + machine_mode mode; + const enum insn_code icode; + unsigned int fcode; + bool lane; +} aarch64_fcmla_laneq_builtin_datum; + #define CRC32_BUILTIN(N, M) \ AARCH64_BUILTIN_##N, +#define FCMLA_LANEQ_BUILTIN(I, N, X, M, T) \ + AARCH64_SIMD_BUILTIN_FCMLA_LANEQ##I##_##M, + #undef VAR1 #define VAR1(T, N, MAP, A) \ AARCH64_SIMD_BUILTIN_##T##_##N##A, @@ -399,6 +433,9 @@ enum aarch64_builtins AARCH64_PAUTH_BUILTIN_AUTIA1716, AARCH64_PAUTH_BUILTIN_PACIA1716, AARCH64_PAUTH_BUILTIN_XPACLRI, + /* Special-cased Armv8.3-A complex FMA by lane quad builtins. */ + AARCH64_SIMD_FCMLA_LANEQ_BUILTIN_BASE, + AARCH64_SIMD_FCMLA_LANEQ_BUILTINS AARCH64_BUILTIN_MAX }; @@ -410,6 +447,18 @@ static aarch64_crc_builtin_datum aarch64_crc_builtin_data[] = { AARCH64_CRC32_BUILTINS }; + +#undef FCMLA_LANEQ_BUILTIN +#define FCMLA_LANEQ_BUILTIN(I, N, X, M, T) \ + {"__builtin_aarch64_fcmla_laneq"#I#N, E_##M##mode, CODE_FOR_aarch64_##X##I##N, \ + AARCH64_SIMD_BUILTIN_FCMLA_LANEQ##I##_##M, T}, + +/* This structure describes the mapping from the builtin to the instruction + to generate in the backend, and how to invoke that instruction. */ +static aarch64_fcmla_laneq_builtin_datum aarch64_fcmla_lane_builtin_data[] { + AARCH64_SIMD_FCMLA_LANEQ_BUILTINS +}; + #undef CRC32_BUILTIN static GTY(()) tree aarch64_builtin_decls[AARCH64_BUILTIN_MAX]; @@ -746,6 +795,34 @@ aarch64_init_simd_builtin_scalar_types (void) static bool aarch64_simd_builtins_initialized_p = false; +/* Due to the architecture not providing a laneq variant of the fcmla lane + instructions, we can't use the standard simd builtin expansion code for + them, but we still want the majority of the validation that would normally + be done. 
*/ + +void +aarch64_init_fcmla_laneq_builtins (void) +{ + unsigned int i = 0; + + for (i = 0; i < ARRAY_SIZE (aarch64_fcmla_lane_builtin_data); ++i) + { + aarch64_fcmla_laneq_builtin_datum* d + = &aarch64_fcmla_lane_builtin_data[i]; + tree argtype = aarch64_lookup_simd_builtin_type (d->mode, qualifier_none); + machine_mode quadmode = GET_MODE_2XWIDER_MODE (d->mode).require (); + tree quadtype + = aarch64_lookup_simd_builtin_type (quadmode, qualifier_none); + tree lanetype + = aarch64_simd_builtin_std_type (SImode, qualifier_lane_pair_index); + tree ftype = build_function_type_list (argtype, argtype, argtype, + quadtype, lanetype, NULL_TREE); + tree fndecl = add_builtin_function (d->name, ftype, d->fcode, + BUILT_IN_MD, NULL, NULL_TREE); + + aarch64_builtin_decls[d->fcode] = fndecl; + } +} + void aarch64_init_simd_builtins (void) { @@ -1001,7 +1078,10 @@ aarch64_init_builtins (void) aarch64_init_fp16_types (); if (TARGET_SIMD) - aarch64_init_simd_builtins (); + { + aarch64_init_simd_builtins (); + aarch64_init_fcmla_laneq_builtins (); + } aarch64_init_crc32_builtins (); aarch64_init_builtin_rsqrt (); @@ -1031,6 +1111,7 @@ typedef enum SIMD_ARG_CONSTANT, SIMD_ARG_LANE_INDEX, SIMD_ARG_STRUCT_LOAD_STORE_LANE_INDEX, + SIMD_ARG_LANE_PAIR_INDEX, SIMD_ARG_STOP } builtin_simd_arg; @@ -1102,6 +1183,22 @@ aarch64_simd_expand_args (rtx target, int icode, int have_retval, /* Keep to GCC-vector-extension lane indices in the RTL. */ op[opc] = aarch64_endian_lane_rtx (vmode, INTVAL (op[opc])); } + /* If the lane index isn't a constant then error out. */ + goto constant_arg; + + case SIMD_ARG_LANE_PAIR_INDEX: + /* Must be a previous operand into which this is an index and + index is restricted to nunits / 2. */ + gcc_assert (opc > 0); + if (CONST_INT_P (op[opc])) + { + machine_mode vmode = insn_data[icode].operand[opc - 1].mode; + unsigned int nunits + = GET_MODE_NUNITS (vmode).to_constant (); + aarch64_simd_lane_bounds (op[opc], 0, nunits / 2, exp); + /* Keep to GCC-vector-extension lane indices in the RTL. */ + op[opc] = aarch64_endian_lane_rtx (vmode, INTVAL (op[opc])); + } /* Fall through - if the lane index isn't a constant then the next case will error. */ /* FALLTHRU */ @@ -1215,6 +1312,8 @@ aarch64_simd_expand_builtin (int fcode, tree exp, rtx target) if (d->qualifiers[qualifiers_k] & qualifier_lane_index) args[k] = SIMD_ARG_LANE_INDEX; + else if (d->qualifiers[qualifiers_k] & qualifier_lane_pair_index) + args[k] = SIMD_ARG_LANE_PAIR_INDEX; else if (d->qualifiers[qualifiers_k] & qualifier_struct_load_store_lane_index) args[k] = SIMD_ARG_STRUCT_LOAD_STORE_LANE_INDEX; else if (d->qualifiers[qualifiers_k] & qualifier_immediate) @@ -1317,6 +1416,79 @@ aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target) return target; } +/* Expand a FCMLA lane expression EXP with code FCODE and + result going to TARGET if that is convenient. */ + +rtx +aarch64_expand_fcmla_builtin (tree exp, rtx target, int fcode) +{ + int bcode = fcode - AARCH64_SIMD_FCMLA_LANEQ_BUILTIN_BASE - 1; + aarch64_fcmla_laneq_builtin_datum* d + = &aarch64_fcmla_lane_builtin_data[bcode]; + machine_mode quadmode = GET_MODE_2XWIDER_MODE (d->mode).require (); + rtx op0 = force_reg (d->mode, expand_normal (CALL_EXPR_ARG (exp, 0))); + rtx op1 = force_reg (d->mode, expand_normal (CALL_EXPR_ARG (exp, 1))); + rtx op2 = force_reg (quadmode, expand_normal (CALL_EXPR_ARG (exp, 2))); + tree tmp = CALL_EXPR_ARG (exp, 3); + rtx lane_idx = expand_expr (tmp, NULL_RTX, VOIDmode, EXPAND_INITIALIZER); + + /* Validate that the lane index is a constant. 
*/ + if (!CONST_INT_P (lane_idx)) + { + error ("%Kargument %d must be a constant immediate", exp, 4); + return const0_rtx; + } + + /* Validate that the index is within the expected range. */ + int nunits = GET_MODE_NUNITS (quadmode).to_constant (); + aarch64_simd_lane_bounds (lane_idx, 0, nunits / 2, exp); + + /* Keep to GCC-vector-extension lane indices in the RTL. */ + lane_idx = aarch64_endian_lane_rtx (quadmode, INTVAL (lane_idx)); + + /* Generate the correct register and mode. */ + int lane = INTVAL (lane_idx); + + if (lane < nunits / 4) + op2 = simplify_gen_subreg (d->mode, op2, quadmode, 0); + else + { + /* Select the upper 64 bits, either a V2SF or V4HF, this however + is quite messy, as the operation required even though simple + doesn't have a simple RTL pattern, and seems it's quite hard to + define using a single RTL pattern. The target generic version + gen_highpart_mode generates code that isn't optimal. */ + rtx temp1 = gen_reg_rtx (d->mode); + rtx temp2 = gen_reg_rtx (DImode); + temp1 = simplify_gen_subreg (d->mode, op2, quadmode, 0); + temp1 = simplify_gen_subreg (V2DImode, temp1, d->mode, 0); + emit_insn (gen_aarch64_get_lanev2di (temp2, temp1 , const1_rtx)); + op2 = simplify_gen_subreg (d->mode, temp2, GET_MODE (temp2), 0); + + /* And recalculate the index. */ + lane -= nunits / 4; + } + + if (!target) + target = gen_reg_rtx (d->mode); + else + target = force_reg (d->mode, target); + + rtx pat = NULL_RTX; + + if (d->lane) + pat = GEN_FCN (d->icode) (target, op0, op1, op2, + gen_int_mode (lane, SImode)); + else + pat = GEN_FCN (d->icode) (target, op0, op1, op2); + + if (!pat) + return NULL_RTX; + + emit_insn (pat); + return target; +} + /* Expand an expression EXP that calls a built-in function, with result going to TARGET if that's convenient. */ rtx @@ -1395,6 +1567,16 @@ aarch64_expand_builtin (tree exp, } return target; + + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ0_V2SF: + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ90_V2SF: + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ180_V2SF: + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ270_V2SF: + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ0_V4HF: + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ90_V4HF: + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ180_V4HF: + case AARCH64_SIMD_BUILTIN_FCMLA_LANEQ270_V4HF: + return aarch64_expand_fcmla_builtin (exp, target, fcode); } if (fcode >= AARCH64_SIMD_BUILTIN_BASE && fcode <= AARCH64_SIMD_BUILTIN_MAX) diff --git a/gcc/config/aarch64/aarch64-c.c b/gcc/config/aarch64/aarch64-c.c index a595b53e5db..fcb1e80177d 100644 --- a/gcc/config/aarch64/aarch64-c.c +++ b/gcc/config/aarch64/aarch64-c.c @@ -109,6 +109,7 @@ aarch64_update_cpp_builtins (cpp_reader *pfile) aarch64_def_or_undef (TARGET_CRC32, "__ARM_FEATURE_CRC32", pfile); aarch64_def_or_undef (TARGET_DOTPROD, "__ARM_FEATURE_DOTPROD", pfile); + aarch64_def_or_undef (TARGET_COMPLEX, "__ARM_FEATURE_COMPLEX", pfile); cpp_undef (pfile, "__AARCH64_CMODEL_TINY__"); cpp_undef (pfile, "__AARCH64_CMODEL_SMALL__"); diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 55fe876bf7a..17bb0c4869b 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -217,6 +217,25 @@ BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0) BUILTIN_VB (QUADOPU_LANE, udot_laneq, 0) + /* Implemented by aarch64_fcadd. */ + BUILTIN_VHSDF (BINOP, fcadd90, 0) + BUILTIN_VHSDF (BINOP, fcadd270, 0) + + /* Implemented by aarch64_fcmla{_lane}{q}. 
*/ + BUILTIN_VHSDF (TERNOP, fcmla0, 0) + BUILTIN_VHSDF (TERNOP, fcmla90, 0) + BUILTIN_VHSDF (TERNOP, fcmla180, 0) + BUILTIN_VHSDF (TERNOP, fcmla270, 0) + BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane0, 0) + BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane90, 0) + BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane180, 0) + BUILTIN_VHSDF (QUADOP_LANE_PAIR, fcmla_lane270, 0) + + BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane0, 0) + BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane90, 0) + BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane180, 0) + BUILTIN_VQ_HSF (QUADOP_LANE_PAIR, fcmlaq_lane270, 0) + BUILTIN_VDQ_I (SHIFTIMM, ashr, 3) VAR1 (SHIFTIMM, ashr_simd, 0, di) BUILTIN_VDQ_I (SHIFTIMM, lshr, 3) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index ef633411e31..be6c27d319a 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -419,6 +419,70 @@ } ) +;; The fcadd and fcmla patterns are kept as UNSPECs explicitly because their +;; use needs to guarantee that the source vectors are contiguous. It would be +;; wrong to describe the operation without being able to describe the permute +;; that is also required, but even if that is done the permute would have been +;; created as a LOAD_LANES, which means the values in the registers are in the +;; wrong order. +(define_insn "aarch64_fcadd<rot><mode>" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w") + (match_operand:VHSDF 2 "register_operand" "w")] + FCADD))] + "TARGET_COMPLEX" + "fcadd\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>, #<rot>" + [(set_attr "type" "neon_fcadd")] +) + +(define_insn "aarch64_fcmla<rot><mode>" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (plus:VHSDF (match_operand:VHSDF 1 "register_operand" "0") + (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand" "w") + (match_operand:VHSDF 3 "register_operand" "w")] + FCMLA)))] + "TARGET_COMPLEX" + "fcmla\t%0.<Vtype>, %2.<Vtype>, %3.<Vtype>, #<rot>" + [(set_attr "type" "neon_fcmla")] +) + + +(define_insn "aarch64_fcmla_lane<rot><mode>" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (plus:VHSDF (match_operand:VHSDF 1 "register_operand" "0") + (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand" "w") + (match_operand:VHSDF 3 "register_operand" "w") + (match_operand:SI 4 "const_int_operand" "n")] + FCMLA)))] + "TARGET_COMPLEX" + "fcmla\t%0.<Vtype>, %2.<Vtype>, %3.<FCMLA_maybe_lane>, #<rot>" + [(set_attr "type" "neon_fcmla")] +) + +(define_insn "aarch64_fcmla_laneq<rot>v4hf" + [(set (match_operand:V4HF 0 "register_operand" "=w") + (plus:V4HF (match_operand:V4HF 1 "register_operand" "0") + (unspec:V4HF [(match_operand:V4HF 2 "register_operand" "w") + (match_operand:V8HF 3 "register_operand" "w") + (match_operand:SI 4 "const_int_operand" "n")] + FCMLA)))] + "TARGET_COMPLEX" + "fcmla\t%0.4h, %2.4h, %3.h[%4], #<rot>" + [(set_attr "type" "neon_fcmla")] +) + +(define_insn "aarch64_fcmlaq_lane<rot><mode>" + [(set (match_operand:VQ_HSF 0 "register_operand" "=w") + (plus:VQ_HSF (match_operand:VQ_HSF 1 "register_operand" "0") + (unspec:VQ_HSF [(match_operand:VQ_HSF 2 "register_operand" "w") + (match_operand:<VHALF> 3 "register_operand" "w") + (match_operand:SI 4 "const_int_operand" "n")] + FCMLA)))] + "TARGET_COMPLEX" + "fcmla\t%0.<Vtype>, %2.<Vtype>, %3.<FCMLA_maybe_lane>, #<rot>" + [(set_attr "type" "neon_fcmla")] +) + ;; These instructions map to the __builtins for the Dot Product operations. 
(define_insn "aarch64_dot" [(set (match_operand:VS 0 "register_operand" "=w") diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index 63f3be7fff4..7bd3bf525dd 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -273,6 +273,9 @@ extern unsigned aarch64_architecture_version; /* ARMv8.3-A features. */ #define TARGET_ARMV8_3 (AARCH64_ISA_V8_3) +/* Armv8.3-a Complex number extension to AdvSIMD extensions. */ +#define TARGET_COMPLEX (TARGET_SIMD && TARGET_ARMV8_3) + /* Make sure this is always defined so we don't have to check for ifdefs but rather use normal ifs. */ #ifndef TARGET_FIX_ERR_A53_835769_DEFAULT diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 2fd44dd25ce..90fce333d09 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -33294,6 +33294,481 @@ vbcaxq_s64 (int64x2_t __a, int64x2_t __b, int64x2_t __c) return __builtin_aarch64_bcaxqv2di (__a, __b, __c); } +#pragma GCC pop_options + +/* AdvSIMD Complex numbers intrinsics. */ + +#pragma GCC push_options +#pragma GCC target(("arch=armv8.3-a")) + +#pragma GCC push_options +#pragma GCC target(("+fp16")) +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcadd_rot90_f16 (float16x4_t __a, float16x4_t __b) +{ + return __builtin_aarch64_fcadd90v4hf (__a, __b); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcaddq_rot90_f16 (float16x8_t __a, float16x8_t __b) +{ + return __builtin_aarch64_fcadd90v8hf (__a, __b); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcadd_rot270_f16 (float16x4_t __a, float16x4_t __b) +{ + return __builtin_aarch64_fcadd270v4hf (__a, __b); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcaddq_rot270_f16 (float16x8_t __a, float16x8_t __b) +{ + return __builtin_aarch64_fcadd270v8hf (__a, __b); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return __builtin_aarch64_fcmla0v4hf (__r, __a, __b); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return __builtin_aarch64_fcmla0v8hf (__r, __a, __b); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_lane_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane0v4hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_laneq_f16 (float16x4_t __r, float16x4_t __a, float16x8_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_laneq0v4hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_lane_f16 (float16x8_t __r, float16x8_t __a, float16x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmlaq_lane0v8hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot90_lane_f16 (float16x8_t __r, 
float16x8_t __a, float16x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmlaq_lane90v8hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot90_laneq_f16 (float16x4_t __r, float16x4_t __a, float16x8_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_laneq90v4hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot90_lane_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane90v4hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot90_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return __builtin_aarch64_fcmla90v8hf (__r, __a, __b); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot90_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return __builtin_aarch64_fcmla90v4hf (__r, __a, __b); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_laneq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane0v8hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot180_laneq_f16 (float16x4_t __r, float16x4_t __a, float16x8_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_laneq180v4hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot180_lane_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane180v4hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot180_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return __builtin_aarch64_fcmla180v8hf (__r, __a, __b); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot180_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return __builtin_aarch64_fcmla180v4hf (__r, __a, __b); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot90_laneq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane90v8hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot270_laneq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane270v8hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot270_lane_f16 (float16x8_t __r, float16x8_t __a, float16x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmlaq_lane270v8hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot270_laneq_f16 
(float16x4_t __r, float16x4_t __a, float16x8_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_laneq270v4hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot270_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return __builtin_aarch64_fcmla270v8hf (__r, __a, __b); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot270_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return __builtin_aarch64_fcmla270v4hf (__r, __a, __b); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot180_laneq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane180v8hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot180_lane_f16 (float16x8_t __r, float16x8_t __a, float16x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmlaq_lane180v8hf (__r, __a, __b, __index); +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot270_lane_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane270v4hf (__r, __a, __b, __index); +} +#pragma GCC pop_options + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcadd_rot90_f32 (float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_fcadd90v2sf (__a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcaddq_rot90_f32 (float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_fcadd90v4sf (__a, __b); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcaddq_rot90_f64 (float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_fcadd90v2df (__a, __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcadd_rot270_f32 (float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_fcadd270v2sf (__a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcaddq_rot270_f32 (float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_fcadd270v4sf (__a, __b); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcaddq_rot270_f64 (float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_fcadd270v2df (__a, __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_fcmla0v2sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_fcmla0v4sf (__r, __a, __b); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_f64 (float64x2_t __r, float64x2_t __a, float64x2_t __b) +{ + return 
__builtin_aarch64_fcmla0v2df (__r, __a, __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_lane_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane0v2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_laneq_f32 (float32x2_t __r, float32x2_t __a, float32x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_laneq0v2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_lane_f32 (float32x4_t __r, float32x4_t __a, float32x2_t __b, + const int __index) +{ + return __builtin_aarch64_fcmlaq_lane0v4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane0v4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot90_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_fcmla90v2sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot90_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_fcmla90v4sf (__r, __a, __b); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot90_f64 (float64x2_t __r, float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_fcmla90v2df (__r, __a, __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot90_lane_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane90v2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot90_laneq_f32 (float32x2_t __r, float32x2_t __a, float32x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_laneq90v2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot90_lane_f32 (float32x4_t __r, float32x4_t __a, float32x2_t __b, + const int __index) +{ + return __builtin_aarch64_fcmlaq_lane90v4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot90_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane90v4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot180_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_fcmla180v2sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot180_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_fcmla180v4sf (__r, __a, __b); +} + 
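/* A rough scalar reference model of the rotation semantics used by the
   vcadd/vcmla intrinsics above, for illustration only; it is written from the
   Armv8.3-A FCADD/FCMLA descriptions and is not part of arm_neon.h.  Each
   vector intrinsic applies the operation to every (even, odd) = (real,
   imaginary) element pair; a #0 step followed by a #90 step forms a full
   complex multiply-accumulate.  FCADD #90/#270 similarly add B multiplied by
   +/-i to A, element pair by element pair.  */
static inline void
fcmla_pair_model (float *d_re, float *d_im, float a_re, float a_im,
                  float b_re, float b_im, int rot)
{
  switch (rot)
    {
    case 0:   *d_re += a_re * b_re; *d_im += a_re * b_im; break;
    case 90:  *d_re -= a_im * b_im; *d_im += a_im * b_re; break;
    case 180: *d_re -= a_re * b_re; *d_im -= a_re * b_im; break;
    case 270: *d_re += a_im * b_im; *d_im -= a_im * b_re; break;
    }
}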
+__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot180_f64 (float64x2_t __r, float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_fcmla180v2df (__r, __a, __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot180_lane_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane180v2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot180_laneq_f32 (float32x2_t __r, float32x2_t __a, float32x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_laneq180v2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot180_lane_f32 (float32x4_t __r, float32x4_t __a, float32x2_t __b, + const int __index) +{ + return __builtin_aarch64_fcmlaq_lane180v4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot180_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane180v4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot270_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_fcmla270v2sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot270_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_fcmla270v4sf (__r, __a, __b); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot270_f64 (float64x2_t __r, float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_fcmla270v2df (__r, __a, __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot270_lane_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane270v2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_rot270_laneq_f32 (float32x2_t __r, float32x2_t __a, float32x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_laneq270v2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot270_lane_f32 (float32x4_t __r, float32x4_t __a, float32x2_t __b, + const int __index) +{ + return __builtin_aarch64_fcmlaq_lane270v4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmlaq_rot270_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b, + const int __index) +{ + return __builtin_aarch64_fcmla_lane270v4sf (__r, __a, __b, __index); +} #pragma GCC pop_options diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 1065ea3bf79..85fa1619ceb 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -485,6 +485,12 @@ UNSPEC_COND_GE ; 
Used in aarch64-sve.md. UNSPEC_COND_GT ; Used in aarch64-sve.md. UNSPEC_LASTB ; Used in aarch64-sve.md. + UNSPEC_FCADD90 ; Used in aarch64-simd.md. + UNSPEC_FCADD270 ; Used in aarch64-simd.md. + UNSPEC_FCMLA ; Used in aarch64-simd.md. + UNSPEC_FCMLA90 ; Used in aarch64-simd.md. + UNSPEC_FCMLA180 ; Used in aarch64-simd.md. + UNSPEC_FCMLA270 ; Used in aarch64-simd.md. ]) ;; ------------------------------------------------------------------ @@ -1134,6 +1140,13 @@ (VNx16SI "vnx4bi") (VNx16SF "vnx4bi") (VNx8DI "vnx2bi") (VNx8DF "vnx2bi")]) +;; On AArch64 the by-element instruction doesn't have a 2S variant. +;; However, because the instruction always selects a pair of values, +;; the normal 3SAME instruction can be used here instead. +(define_mode_attr FCMLA_maybe_lane [(V2SF "<Vtype>") (V4SF "<Vetype>[%4]") + (V4HF "<Vetype>[%4]") (V8HF "<Vetype>[%4]") + ]) + ;; ------------------------------------------------------------------- ;; Code Iterators ;; ------------------------------------------------------------------- @@ -1587,6 +1600,14 @@ UNSPEC_COND_EQ UNSPEC_COND_NE UNSPEC_COND_GE UNSPEC_COND_GT]) +(define_int_iterator FCADD [UNSPEC_FCADD90 + UNSPEC_FCADD270]) + +(define_int_iterator FCMLA [UNSPEC_FCMLA + UNSPEC_FCMLA90 + UNSPEC_FCMLA180 + UNSPEC_FCMLA270]) + ;; Iterators for atomic operations. (define_int_iterator ATOMIC_LDOP @@ -1848,6 +1869,13 @@ (UNSPEC_COND_MAX "fmaxnm") (UNSPEC_COND_MIN "fminnm")]) +(define_int_attr rot [(UNSPEC_FCADD90 "90") + (UNSPEC_FCADD270 "270") + (UNSPEC_FCMLA "0") + (UNSPEC_FCMLA90 "90") + (UNSPEC_FCMLA180 "180") + (UNSPEC_FCMLA270 "270")]) + (define_int_attr sve_fmla_op [(UNSPEC_COND_FMLA "fmla") (UNSPEC_COND_FMLS "fmls") (UNSPEC_COND_FNMLA "fnmla") diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md index 8e7d097ce5d..f8f8dd09077 100644 --- a/gcc/config/arm/types.md +++ b/gcc/config/arm/types.md @@ -763,6 +763,9 @@ neon_sub_halve,\ neon_sub_halve_q,\ neon_sub_halve_narrow_q,\ +\ + neon_fcadd,\ + neon_fcmla,\ \ neon_abs,\ neon_abs_q,\ diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index ef7d0f87b87..2df44c759cc 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2019-01-10 Tamar Christina + + * gcc.target/aarch64/advsimd-intrinsics/vector-complex.c: New test. + * gcc.target/aarch64/advsimd-intrinsics/vector-complex_f16.c: New test. 
+ 2019-01-10 Tamar Christina * lib/target-supports.exp diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vector-complex.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vector-complex.c new file mode 100644 index 00000000000..b7c999333ed --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vector-complex.c @@ -0,0 +1,251 @@ +/* { dg-skip-if "" { arm-*-* } } */ +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_v8_3a_complex_neon_ok } */ +/* { dg-add-options arm_v8_3a_complex_neon } */ +/* { dg-additional-options "-O2 -save-temps" } */ + +#include + +float32x2_t +test_vcadd_rot90_f32 (float32x2_t __a, float32x2_t __b) +{ + return vcadd_rot90_f32 (__a, __b); +} + +float32x4_t +test_vcaddq_rot90_f32 (float32x4_t __a, float32x4_t __b) +{ + return vcaddq_rot90_f32 (__a, __b); +} + +#ifdef __ARM_ARCH_ISA_A64 +float64x2_t +test_vcaddq_rot90_f64 (float64x2_t __a, float64x2_t __b) +{ + return vcaddq_rot90_f64 (__a, __b); +} +#endif + +float32x2_t +test_vcadd_rot270_f32 (float32x2_t __a, float32x2_t __b) +{ + return vcadd_rot270_f32 (__a, __b); +} + +float32x4_t +test_vcaddq_rot270_f32 (float32x4_t __a, float32x4_t __b) +{ + return vcaddq_rot270_f32 (__a, __b); +} + +#ifdef __ARM_ARCH_ISA_A64 +float64x2_t +test_vcaddq_rot270_f64 (float64x2_t __a, float64x2_t __b) +{ + return vcaddq_rot270_f64 (__a, __b); +} +#endif + +float32x2_t +test_vcmla_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return vcmla_f32 (__r, __a, __b); +} + +float32x4_t +test_vcmlaq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return vcmlaq_f32 (__r, __a, __b); +} + +#ifdef __ARM_ARCH_ISA_A64 +float64x2_t +test_vcmlaq_f64 (float64x2_t __r, float64x2_t __a, float64x2_t __b) +{ + return vcmlaq_f64 (__r, __a, __b); +} +#endif + +float32x2_t +test_vcmla_lane_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return vcmla_lane_f32 (__r, __a, __b, 0); +} + +float32x2_t +test_vcmla_laneq_f32 (float32x2_t __r, float32x2_t __a, float32x4_t __b) +{ + return vcmla_laneq_f32 (__r, __a, __b, 1); +} + +float32x4_t +test_vcmlaq_lane_f32 (float32x4_t __r, float32x4_t __a, float32x2_t __b) +{ + return vcmlaq_lane_f32 (__r, __a, __b, 0); +} + +float32x4_t +test_vcmlaq_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return vcmlaq_laneq_f32 (__r, __a, __b, 1); +} + +float32x2_t +test_vcmla_rot90_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return vcmla_rot90_f32 (__r, __a, __b); +} + +float32x4_t +test_vcmlaq_rot90_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return vcmlaq_rot90_f32 (__r, __a, __b); +} + +#ifdef __ARM_ARCH_ISA_A64 +float64x2_t +test_vcmlaq_rot90_f64 (float64x2_t __r, float64x2_t __a, float64x2_t __b) +{ + return vcmlaq_rot90_f64 (__r, __a, __b); +} +#endif + +float32x2_t +test_vcmla_rot90_lane_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return vcmla_rot90_lane_f32 (__r, __a, __b, 0); +} + +float32x2_t +test_vcmla_rot90_laneq_f32 (float32x2_t __r, float32x2_t __a, float32x4_t __b) +{ + return vcmla_rot90_laneq_f32 (__r, __a, __b, 1); +} + +float32x4_t +test_vcmlaq_rot90_lane_f32 (float32x4_t __r, float32x4_t __a, float32x2_t __b) +{ + return vcmlaq_rot90_lane_f32 (__r, __a, __b, 0); +} + +float32x4_t +test_vcmlaq_rot90_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return vcmlaq_rot90_laneq_f32 (__r, __a, __b, 1); +} + +float32x2_t +test_vcmla_rot180_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return vcmla_rot180_f32 (__r, __a, __b); +} + 
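/* Illustrative only (not one of the scan-assembler tests in this file): the
   idiom these intrinsics are designed for is a full complex multiply-
   accumulate, composed from the #0 and #90 rotations of vcmla.  */
float32x4_t
example_full_cmla_f32 (float32x4_t __acc, float32x4_t __a, float32x4_t __b)
{
  __acc = vcmlaq_f32 (__acc, __a, __b);       /* acc += a.re * (b.re, b.im)  */
  __acc = vcmlaq_rot90_f32 (__acc, __a, __b); /* acc += a.im * (-b.im, b.re) */
  return __acc;
}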
+float32x4_t +test_vcmlaq_rot180_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return vcmlaq_rot180_f32 (__r, __a, __b); +} + +#ifdef __ARM_ARCH_ISA_A64 +float64x2_t +test_vcmlaq_rot180_f64 (float64x2_t __r, float64x2_t __a, float64x2_t __b) +{ + return vcmlaq_rot180_f64 (__r, __a, __b); +} +#endif + +float32x2_t +test_vcmla_rot180_lane_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return vcmla_rot180_lane_f32 (__r, __a, __b, 0); +} + +float32x2_t +test_vcmla_rot180_laneq_f32 (float32x2_t __r, float32x2_t __a, float32x4_t __b) +{ + return vcmla_rot180_laneq_f32 (__r, __a, __b, 1); +} + +float32x4_t +test_vcmlaq_rot180_lane_f32 (float32x4_t __r, float32x4_t __a, float32x2_t __b) +{ + return vcmlaq_rot180_lane_f32 (__r, __a, __b, 0); +} + +float32x4_t +test_vcmlaq_rot180_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return vcmlaq_rot180_laneq_f32 (__r, __a, __b, 1); +} + +float32x2_t +test_vcmla_rot270_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return vcmla_rot270_f32 (__r, __a, __b); +} + +float32x4_t +test_vcmlaq_rot270_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return vcmlaq_rot270_f32 (__r, __a, __b); +} + +#ifdef __ARM_ARCH_ISA_A64 +float64x2_t +test_vcmlaq_rot270_f64 (float64x2_t __r, float64x2_t __a, float64x2_t __b) +{ + return vcmlaq_rot270_f64 (__r, __a, __b); +} +#endif + +float32x2_t +test_vcmla_rot270_lane_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return vcmla_rot270_lane_f32 (__r, __a, __b, 0); +} + +float32x2_t +test_vcmla_rot270_laneq_f32 (float32x2_t __r, float32x2_t __a, float32x4_t __b) +{ + return vcmla_rot270_laneq_f32 (__r, __a, __b, 1); +} + +float32x4_t +test_vcmlaq_rot270_lane_f32 (float32x4_t __r, float32x4_t __a, float32x2_t __b) +{ + return vcmlaq_rot270_lane_f32 (__r, __a, __b, 0); +} + +float32x4_t +test_vcmlaq_rot270_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) +{ + return vcmlaq_rot270_laneq_f32 (__r, __a, __b, 1); +} + +/* { dg-final { scan-assembler-times {fcadd\tv[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcadd\tv[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcadd\tv[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcadd\tv[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcadd\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcadd\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, #0} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, #180} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, #0} 3 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, #180} 3 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.2s, 
v[0-9]+.2s, v[0-9]+.2s, #270} 3 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, #90} 3 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, #0} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, #180} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.s\[0\], #0} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.s\[0\], #180} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.s\[0\], #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.s\[0\], #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.s\[1\], #0} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.s\[1\], #180} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.s\[1\], #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4s, v[0-9]+.4s, v[0-9]+.s\[1\], #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {dup\td[0-9]+, v[0-9]+.d\[1\]} 4 { target { aarch64*-*-* } } } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vector-complex_f16.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vector-complex_f16.c new file mode 100644 index 00000000000..dbcebcbfba6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vector-complex_f16.c @@ -0,0 +1,306 @@ +/* { dg-skip-if "" { arm-*-* } } */ +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_v8_3a_complex_neon_ok } */ +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */ +/* { dg-add-options arm_v8_3a_complex_neon } */ +/* { dg-additional-options "-O2 -march=armv8.3-a+fp16 -save-temps" } */ + +#include + +float16x4_t +test_vcadd_rot90_f16 (float16x4_t __a, float16x4_t __b) +{ + return vcadd_rot90_f16 (__a, __b); +} + +float16x8_t +test_vcaddq_rot90_f16 (float16x8_t __a, float16x8_t __b) +{ + return vcaddq_rot90_f16 (__a, __b); +} + +float16x4_t +test_vcadd_rot270_f16 (float16x4_t __a, float16x4_t __b) +{ + return vcadd_rot270_f16 (__a, __b); +} + +float16x8_t +test_vcaddq_rot270_f16 (float16x8_t __a, float16x8_t __b) +{ + return vcaddq_rot270_f16 (__a, __b); +} + +float16x4_t +test_vcmla_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_f16 (__r, __a, __b); +} + +float16x8_t +test_vcmlaq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_f16 (__r, __a, __b); +} + +float16x4_t +test_vcmla_lane_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_lane_f16 (__r, __a, __b, 0); +} + +float16x4_t +test_vcmla_laneq_f16 (float16x4_t __r, float16x4_t __a, float16x8_t __b) +{ + return vcmla_laneq_f16 (__r, __a, __b, 0); +} + +float16x8_t +test_vcmlaq_lane_f16 (float16x8_t __r, float16x8_t __a, float16x4_t 
__b) +{ + return vcmlaq_lane_f16 (__r, __a, __b, 0); +} + +float16x8_t +test_vcmlaq_laneq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_laneq_f16 (__r, __a, __b, 0); +} + +float16x4_t +test_vcmla_lane_f16_2 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_lane_f16 (__r, __a, __b, 1); +} + +float16x4_t +test_vcmla_laneq_f16_2 (float16x4_t __r, float16x4_t __a, float16x8_t __b) +{ + return vcmla_laneq_f16 (__r, __a, __b, 3); +} + +float16x8_t +test_vcmlaq_lane_f16_2 (float16x8_t __r, float16x8_t __a, float16x4_t __b) +{ + return vcmlaq_lane_f16 (__r, __a, __b, 1); +} + +float16x8_t +test_vcmlaq_laneq_f16_2 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_laneq_f16 (__r, __a, __b, 3); +} + +float16x4_t +test_vcmla_rot90_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_rot90_f16 (__r, __a, __b); +} + +float16x8_t +test_vcmlaq_rot90_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_rot90_f16 (__r, __a, __b); +} + +float16x4_t +test_vcmla_rot90_lane_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_rot90_lane_f16 (__r, __a, __b, 0); +} + +float16x4_t +test_vcmla_rot90_laneq_f16 (float16x4_t __r, float16x4_t __a, float16x8_t __b) +{ + return vcmla_rot90_laneq_f16 (__r, __a, __b, 0); +} + +float16x8_t +test_vcmlaq_rot90_lane_f16 (float16x8_t __r, float16x8_t __a, float16x4_t __b) +{ + return vcmlaq_rot90_lane_f16 (__r, __a, __b, 0); +} + +float16x8_t +test_vcmlaq_rot90_laneq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_rot90_laneq_f16 (__r, __a, __b, 0); +} + +float16x4_t +test_vcmla_rot90_lane_f16_2 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_rot90_lane_f16 (__r, __a, __b, 1); +} + +float16x4_t +test_vcmla_rot90_laneq_f16_2 (float16x4_t __r, float16x4_t __a, float16x8_t __b) +{ + return vcmla_rot90_laneq_f16 (__r, __a, __b, 3); +} + +float16x8_t +test_vcmlaq_rot90_lane_f16_2 (float16x8_t __r, float16x8_t __a, float16x4_t __b) +{ + return vcmlaq_rot90_lane_f16 (__r, __a, __b, 1); +} + +float16x8_t +test_vcmlaq_rot90_laneq_f16_2 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_rot90_laneq_f16 (__r, __a, __b, 3); +} + +float16x4_t +test_vcmla_rot180_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_rot180_f16 (__r, __a, __b); +} + +float16x8_t +test_vcmlaq_rot180_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_rot180_f16 (__r, __a, __b); +} + +float16x4_t +test_vcmla_rot180_lane_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_rot180_lane_f16 (__r, __a, __b, 0); +} + +float16x4_t +test_vcmla_rot180_laneq_f16 (float16x4_t __r, float16x4_t __a, float16x8_t __b) +{ + return vcmla_rot180_laneq_f16 (__r, __a, __b, 0); +} + +float16x8_t +test_vcmlaq_rot180_lane_f16 (float16x8_t __r, float16x8_t __a, float16x4_t __b) +{ + return vcmlaq_rot180_lane_f16 (__r, __a, __b, 0); +} + +float16x8_t +test_vcmlaq_rot180_laneq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_rot180_laneq_f16 (__r, __a, __b, 0); +} + +float16x4_t +test_vcmla_rot180_lane_f16_2 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_rot180_lane_f16 (__r, __a, __b, 1); +} + +float16x4_t +test_vcmla_rot180_laneq_f16_2 (float16x4_t __r, float16x4_t __a, float16x8_t __b) +{ + return vcmla_rot180_laneq_f16 (__r, __a, __b, 3); +} + +float16x8_t +test_vcmlaq_rot180_lane_f16_2 (float16x8_t __r, float16x8_t __a, 
float16x4_t __b) +{ + return vcmlaq_rot180_lane_f16 (__r, __a, __b, 1); +} + +float16x8_t +test_vcmlaq_rot180_laneq_f16_2 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_rot180_laneq_f16 (__r, __a, __b, 3); +} + +float16x4_t +test_vcmla_rot270_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_rot270_f16 (__r, __a, __b); +} + +float16x8_t +test_vcmlaq_rot270_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_rot270_f16 (__r, __a, __b); +} + +float16x4_t +test_vcmla_rot270_lane_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_rot270_lane_f16 (__r, __a, __b, 0); +} + +float16x4_t +test_vcmla_rot270_laneq_f16 (float16x4_t __r, float16x4_t __a, float16x8_t __b) +{ + return vcmla_rot270_laneq_f16 (__r, __a, __b, 0); +} + +float16x8_t +test_vcmlaq_rot270_lane_f16 (float16x8_t __r, float16x8_t __a, float16x4_t __b) +{ + return vcmlaq_rot270_lane_f16 (__r, __a, __b, 0); +} + +float16x8_t +test_vcmlaq_rot270_laneq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_rot270_laneq_f16 (__r, __a, __b, 0); +} + +float16x4_t +test_vcmla_rot270_lane_f16_2 (float16x4_t __r, float16x4_t __a, float16x4_t __b) +{ + return vcmla_rot270_lane_f16 (__r, __a, __b, 1); +} + +float16x4_t +test_vcmla_rot270_laneq_f16_2 (float16x4_t __r, float16x4_t __a, float16x8_t __b) +{ + return vcmla_rot270_laneq_f16 (__r, __a, __b, 3); +} + +float16x8_t +test_vcmlaq_rot270_lane_f16_2 (float16x8_t __r, float16x8_t __a, float16x4_t __b) +{ + return vcmlaq_rot270_lane_f16 (__r, __a, __b, 1); +} + +float16x8_t +test_vcmlaq_rot270_laneq_f16_2 (float16x8_t __r, float16x8_t __a, float16x8_t __b) +{ + return vcmlaq_rot270_laneq_f16 (__r, __a, __b, 3); +} + +/* { dg-final { scan-assembler-times {dup\td[0-9]+, v[0-9]+.d\[1\]} 4 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcadd\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcadd\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcadd\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcadd\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, #0} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, #180} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.h\[0\], #0} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.h\[0\], #180} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.h\[0\], #270} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.h\[0\], #90} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.h\[1\], #0} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, 
v[0-9]+.4h, v[0-9]+.h\[1\], #180} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.h\[1\], #270} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.4h, v[0-9]+.4h, v[0-9]+.h\[1\], #90} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, #0} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, #180} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[0\], #0} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[0\], #180} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[0\], #270} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[0\], #90} 2 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[1\], #0} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[1\], #180} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[1\], #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[1\], #90} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[3\], #0} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[3\], #180} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[3\], #270} 1 { target { aarch64*-*-* } } } } */ +/* { dg-final { scan-assembler-times {fcmla\tv[0-9]+.8h, v[0-9]+.8h, v[0-9]+.h\[3\], #90} 1 { target { aarch64*-*-* } } } } */ -- 2.30.2
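As a usage sketch for the new intrinsics (illustrative only, not part of the patch; it assumes a compiler with this patch applied, an Armv8.3-A target such as -march=armv8.3-a, and the helper name cmla_array is hypothetical), guarded by the __ARM_FEATURE_COMPLEX macro defined in aarch64-c.c above:

#include <arm_neon.h>

#ifdef __ARM_FEATURE_COMPLEX
/* Multiply-accumulate arrays of interleaved (re, im) single-precision
   complex numbers: acc[i] += a[i] * b[i].  For brevity, n is assumed to be
   a multiple of 4 floats (2 complex numbers).  */
void
cmla_array (float *acc, const float *a, const float *b, int n)
{
  for (int i = 0; i < n; i += 4)
    {
      float32x4_t va = vld1q_f32 (a + i);
      float32x4_t vb = vld1q_f32 (b + i);
      float32x4_t vd = vld1q_f32 (acc + i);
      vd = vcmlaq_f32 (vd, va, vb);        /* contribution of the real parts of a */
      vd = vcmlaq_rot90_f32 (vd, va, vb);  /* contribution of the imaginary parts of a */
      vst1q_f32 (acc + i, vd);
    }
}
#endif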