From eb637e76047a3b1d9787b1c75de5da3db2fe8aea Mon Sep 17 00:00:00 2001 From: Delia Burduv Date: Fri, 6 Mar 2020 10:38:20 +0000 Subject: [PATCH] ACLE intrinsics: BFloat16 load intrinsics for AArch32 2020-03-06 Delia Burduv * config/arm/arm_neon.h (vld2_bf16): New. (vld2q_bf16): New. (vld3_bf16): New. (vld3q_bf16): New. (vld4_bf16): New. (vld4q_bf16): New. (vld2_dup_bf16): New. (vld2q_dup_bf16): New. (vld3_dup_bf16): New. (vld3q_dup_bf16): New. (vld4_dup_bf16): New. (vld4q_dup_bf16): New. * config/arm/arm_neon_builtins.def (vld2): Changed to VAR13 and added v4bf, v8bf (vld2_dup): Changed to VAR8 and added v4bf, v8bf (vld3): Changed to VAR13 and added v4bf, v8bf (vld3_dup): Changed to VAR8 and added v4bf, v8bf (vld4): Changed to VAR13 and added v4bf, v8bf (vld4_dup): Changed to VAR8 and added v4bf, v8bf * config/arm/iterators.md (VDXBF2): New iterator. *config/arm/neon.md (neon_vld2): Use new iterators. (neon_vld2_dup): Likewise. (neon_vld3qa): Likewise. (neon_vld3qb): Likewise. (neon_vld3_dup): Likewise. (neon_vld4): Likewise. (neon_vld4qa): Likewise. (neon_vld4qb): Likewise. (neon_vld4_dup): Likewise. (neon_vld2_dupv8bf): New. (neon_vld3_dupv8bf): Likewise. (neon_vld4_dupv8bf): Likewise. * gcc.target/arm/simd/bf16_vldn_1.c: New test. --- gcc/ChangeLog | 36 +++++ gcc/config/arm/arm_neon.h | 108 +++++++++++++ gcc/config/arm/arm_neon_builtins.def | 18 +-- gcc/config/arm/iterators.md | 3 + gcc/config/arm/neon.md | 88 ++++++++-- gcc/testsuite/ChangeLog | 4 + .../gcc.target/arm/simd/bf16_vldn_1.c | 152 ++++++++++++++++++ 7 files changed, 387 insertions(+), 22 deletions(-) create mode 100644 gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 6d2a35c8d22..82534420f41 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,39 @@ +2020-03-06 Delia Burduv + + * config/arm/arm_neon.h (vld2_bf16): New. + (vld2q_bf16): New. + (vld3_bf16): New. + (vld3q_bf16): New. + (vld4_bf16): New. + (vld4q_bf16): New. + (vld2_dup_bf16): New. + (vld2q_dup_bf16): New. + (vld3_dup_bf16): New. + (vld3q_dup_bf16): New. + (vld4_dup_bf16): New. + (vld4q_dup_bf16): New. + * config/arm/arm_neon_builtins.def + (vld2): Changed to VAR13 and added v4bf, v8bf + (vld2_dup): Changed to VAR8 and added v4bf, v8bf + (vld3): Changed to VAR13 and added v4bf, v8bf + (vld3_dup): Changed to VAR8 and added v4bf, v8bf + (vld4): Changed to VAR13 and added v4bf, v8bf + (vld4_dup): Changed to VAR8 and added v4bf, v8bf + * config/arm/iterators.md (VDXBF2): New iterator. + *config/arm/neon.md (neon_vld2): Use new iterators. + (neon_vld2_dup): Likewise. + (neon_vld3qa): Likewise. + (neon_vld3qb): Likewise. + (neon_vld3_dup): Likewise. + (neon_vld4): Likewise. + (neon_vld4qa): Likewise. + (neon_vld4qb): Likewise. + (neon_vld4_dup): Likewise. + (neon_vld2_dupv8bf): New. + (neon_vld3_dupv8bf): Likewise. + (neon_vld4_dupv8bf): Likewise. + 2020-03-06 Delia Burduv * config/arm/arm_neon.h (bfloat16x4x2_t): New typedef. diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 4ab79d55e1f..f5ccf185038 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -19557,6 +19557,114 @@ vst4q_bf16 (bfloat16_t * __ptr, bfloat16x8x4_t __val) return __builtin_neon_vst4v8bf (__ptr, __bu.__o); } +__extension__ extern __inline bfloat16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_bf16 (bfloat16_t const * __ptr) +{ + union { bfloat16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2v4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2v8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3v4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3v8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4v4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4v8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_dupv4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2_dupv8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupv4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupv8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_dupv4bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_dup_bf16 (const bfloat16_t * __ptr) +{ + union { bfloat16x8x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4_dupv8bf ((const __builtin_neon_hi *) __ptr); + return __rv.__i; +} + #pragma GCC pop_options #ifdef __cplusplus diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index b73b3e5bba1..34c1945c0a1 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -320,29 +320,29 @@ VAR12 (STORE1, vst1, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di) VAR12 (STORE1LANE, vst1_lane, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di) -VAR11 (LOAD1, vld2, - v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR13 (LOAD1, vld2, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR9 (LOAD1LANE, vld2_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR6 (LOAD1, vld2_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) +VAR8 (LOAD1, vld2_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR13 (STORE1, vst2, v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf) VAR9 (STORE1LANE, vst2_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR11 (LOAD1, vld3, - v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR13 (LOAD1, vld3, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR9 (LOAD1LANE, vld3_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR6 (LOAD1, vld3_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) +VAR8 (LOAD1, vld3_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR13 (STORE1, vst3, v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf) VAR9 (STORE1LANE, vst3_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR11 (LOAD1, vld4, - v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR13 (LOAD1, vld4, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR9 (LOAD1LANE, vld4_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) -VAR6 (LOAD1, vld4_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) +VAR8 (LOAD1, vld4_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR13 (STORE1, vst4, v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf) VAR9 (STORE1LANE, vst4_lane, diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 8ff3c156601..244085286cb 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -87,6 +87,9 @@ ;; Double-width vector modes plus 64-bit elements, including V4BF. (define_mode_iterator VDXBF [V8QI V4HI V4HF (V4BF "TARGET_BF16_SIMD") V2SI V2SF DI]) +;; Double-width vector modes plus 64-bit elements, V4BF and V8BF. +(define_mode_iterator VDXBF2 [V8QI V4HI V4HF V2SI V2SF DI (V4BF "TARGET_BF16_SIMD") (V8BF ("TARGET_BF16_SIMD"))]) + ;; Double-width vector modes plus 64-bit elements, ;; with V4BFmode added, suitable for moves. (define_mode_iterator VDXMOV [V8QI V4HI V4HF V4BF V2SI V2SF DI]) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 250d5784810..ead3e28da77 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -5428,7 +5428,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld2" [(set (match_operand:TI 0 "s_register_operand" "=w") (unspec:TI [(match_operand:TI 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD2))] "TARGET_NEON" { @@ -5453,7 +5453,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld2" [(set (match_operand:OI 0 "s_register_operand" "=w") (unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD2))] "TARGET_NEON" "vld2.\t%h0, %A1" @@ -5516,7 +5516,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld2_dup" [(set (match_operand:TI 0 "s_register_operand" "=w") (unspec:TI [(match_operand: 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD2_DUP))] "TARGET_NEON" { @@ -5531,6 +5531,27 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_load1_1reg")))] ) +(define_insn "neon_vld2_dupv8bf" + [(set (match_operand:OI 0 "s_register_operand" "=w") + (unspec:OI [(match_operand:V2BF 1 "neon_struct_operand" "Um") + (unspec:V8BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2_DUP))] + "TARGET_BF16_SIMD" + { + rtx ops[5]; + int tabbase = REGNO (operands[0]); + + ops[4] = operands[1]; + ops[0] = gen_rtx_REG (V4BFmode, tabbase); + ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2); + ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4); + ops[3] = gen_rtx_REG (V4BFmode, tabbase + 6); + output_asm_insn ("vld2.16\t{%P0, %P1, %P2, %P3}, %A4", ops); + return ""; + } + [(set_attr "type" "neon_load2_all_lanes_q")] +) + (define_expand "vec_store_lanesti" [(set (match_operand:TI 0 "neon_struct_operand") (unspec:TI [(match_operand:TI 1 "s_register_operand") @@ -5637,7 +5658,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld3" [(set (match_operand:EI 0 "s_register_operand" "=w") (unspec:EI [(match_operand:EI 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3))] "TARGET_NEON" { @@ -5665,7 +5686,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vld3" [(match_operand:CI 0 "s_register_operand") (match_operand:CI 1 "neon_struct_operand") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { rtx mem; @@ -5680,7 +5701,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld3qa" [(set (match_operand:CI 0 "s_register_operand" "=w") (unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3A))] "TARGET_NEON" { @@ -5700,7 +5721,7 @@ if (BYTES_BIG_ENDIAN) [(set (match_operand:CI 0 "s_register_operand" "=w") (unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um") (match_operand:CI 2 "s_register_operand" "0") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3B))] "TARGET_NEON" { @@ -5777,7 +5798,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld3_dup" [(set (match_operand:EI 0 "s_register_operand" "=w") (unspec:EI [(match_operand: 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3_DUP))] "TARGET_NEON" { @@ -5800,6 +5821,26 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_load3_all_lanes") (const_string "neon_load1_1reg")))]) +(define_insn "neon_vld3_dupv8bf" + [(set (match_operand:CI 0 "s_register_operand" "=w") + (unspec:CI [(match_operand:V2BF 1 "neon_struct_operand" "Um") + (unspec:V8BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2_DUP))] + "TARGET_BF16_SIMD" + { + rtx ops[4]; + int tabbase = REGNO (operands[0]); + + ops[3] = operands[1]; + ops[0] = gen_rtx_REG (V4BFmode, tabbase); + ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2); + ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4); + output_asm_insn ("vld3.16\t{%P0[], %P1[], %P2[]}, %A3", ops); + return ""; + } + [(set_attr "type" "neon_load3_all_lanes_q")] +) + (define_expand "vec_store_lanesei" [(set (match_operand:EI 0 "neon_struct_operand") (unspec:EI [(match_operand:EI 1 "s_register_operand") @@ -5955,7 +5996,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld4" [(set (match_operand:OI 0 "s_register_operand" "=w") (unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4))] "TARGET_NEON" { @@ -5983,7 +6024,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vld4" [(match_operand:XI 0 "s_register_operand") (match_operand:XI 1 "neon_struct_operand") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { rtx mem; @@ -5998,7 +6039,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld4qa" [(set (match_operand:XI 0 "s_register_operand" "=w") (unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4A))] "TARGET_NEON" { @@ -6019,7 +6060,7 @@ if (BYTES_BIG_ENDIAN) [(set (match_operand:XI 0 "s_register_operand" "=w") (unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um") (match_operand:XI 2 "s_register_operand" "0") - (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4B))] "TARGET_NEON" { @@ -6099,7 +6140,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld4_dup" [(set (match_operand:OI 0 "s_register_operand" "=w") (unspec:OI [(match_operand: 1 "neon_struct_operand" "Um") - (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4_DUP))] "TARGET_NEON" { @@ -6125,6 +6166,27 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_load1_1reg")))] ) +(define_insn "neon_vld4_dupv8bf" + [(set (match_operand:XI 0 "s_register_operand" "=w") + (unspec:XI [(match_operand:V2BF 1 "neon_struct_operand" "Um") + (unspec:V8BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + UNSPEC_VLD2_DUP))] + "TARGET_BF16_SIMD" + { + rtx ops[5]; + int tabbase = REGNO (operands[0]); + + ops[4] = operands[1]; + ops[0] = gen_rtx_REG (V4BFmode, tabbase); + ops[1] = gen_rtx_REG (V4BFmode, tabbase + 2); + ops[2] = gen_rtx_REG (V4BFmode, tabbase + 4); + ops[3] = gen_rtx_REG (V4BFmode, tabbase + 6); + output_asm_insn ("vld4.16\t{%P0[], %P1[], %P2[], %P3[]}, %A4", ops); + return ""; + } + [(set_attr "type" "neon_load4_all_lanes_q")] +) + (define_expand "vec_store_lanesoi" [(set (match_operand:OI 0 "neon_struct_operand") (unspec:OI [(match_operand:OI 1 "s_register_operand") diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index b7bbb47e7d1..13da5a8581d 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,7 @@ +2020-03-06 Delia Burduv + + * gcc.target/arm/simd/bf16_vldn_1.c: New test. + 2020-03-06 Delia Burduv * gcc.target/arm/simd/bf16_vstn_1.c: New test. diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c new file mode 100644 index 00000000000..222e7af9453 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/bf16_vldn_1.c @@ -0,0 +1,152 @@ +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-save-temps -O2 -mfloat-abi=hard" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + + +/* +**test_vld2_bf16: +** ... +** vld2.16 {d0-d1}, \[r0\] +** bx lr +*/ +bfloat16x4x2_t +test_vld2_bf16 (bfloat16_t * ptr) +{ + return vld2_bf16 (ptr); +} + +/* +**test_vld2q_bf16: +** ... +** vld2.16 {d0-d3}, \[r0\] +** bx lr +*/ +bfloat16x8x2_t +test_vld2q_bf16 (bfloat16_t * ptr) +{ + return vld2q_bf16 (ptr); +} + +/* +**test_vld2_dup_bf16: +** ... +** vld2.16 {d0\[\], d1\[\]}, \[r0\] +** bx lr +*/ +bfloat16x4x2_t +test_vld2_dup_bf16 (bfloat16_t * ptr) +{ + return vld2_dup_bf16 (ptr); +} + +/* +**test_vld2q_dup_bf16: +** ... +** vld2.16 {d0, d1, d2, d3}, \[r0\] +** bx lr +*/ +bfloat16x8x2_t +test_vld2q_dup_bf16 (bfloat16_t * ptr) +{ + return vld2q_dup_bf16 (ptr); +} + +/* +**test_vld3_bf16: +** ... +** vld3.16 {d0-d2}, \[r0\] +** bx lr +*/ +bfloat16x4x3_t +test_vld3_bf16 (bfloat16_t * ptr) +{ + return vld3_bf16 (ptr); +} + +/* +**test_vld3q_bf16: +** ... +** vld3.16 {d1, d3, d5}, \[r0\] +** bx lr +*/ +bfloat16x8x3_t +test_vld3q_bf16 (bfloat16_t * ptr) +{ + return vld3q_bf16 (ptr); +} + +/* +**test_vld3_dup_bf16: +** ... +** vld3.16 {d0\[\], d1\[\], d2\[\]}, \[r0\] +** bx lr +*/ +bfloat16x4x3_t +test_vld3_dup_bf16 (bfloat16_t * ptr) +{ + return vld3_dup_bf16 (ptr); +} + +/* +**test_vld3q_dup_bf16: +** ... +** vld3.16 {d0\[\], d1\[\], d2\[\]}, \[r0\] +** bx lr +*/ +bfloat16x8x3_t +test_vld3q_dup_bf16 (bfloat16_t * ptr) +{ + return vld3q_dup_bf16 (ptr); +} + +/* +**test_vld4_bf16: +** ... +** vld4.16 {d0-d3}, \[r0\] +** bx lr +*/ +bfloat16x4x4_t +test_vld4_bf16 (bfloat16_t * ptr) +{ + return vld4_bf16 (ptr); +} + +/* +**test_vld4q_bf16: +** ... +** vld4.16 {d1, d3, d5, d7}, \[r0\] +** bx lr +*/ +bfloat16x8x4_t +test_vld4q_bf16 (bfloat16_t * ptr) +{ + return vld4q_bf16 (ptr); +} + +/* +**test_vld4_dup_bf16: +** ... +** vld4.16 {d0\[\], d1\[\], d2\[\], d3\[\]}, \[r0\] +** bx lr +*/ +bfloat16x4x4_t +test_vld4_dup_bf16 (bfloat16_t * ptr) +{ + return vld4_dup_bf16 (ptr); +} + +/* +**test_vld4q_dup_bf16: +** ... +** vld4.16 {d0\[\], d1\[\], d2\[\], d3\[\]}, \[r0\] +** bx lr +*/ +bfloat16x8x4_t +test_vld4q_dup_bf16 (bfloat16_t * ptr) +{ + return vld4q_dup_bf16 (ptr); +} -- 2.30.2