The operands in the RTL patterns of the MVE vector scatter store intrinsics are
wrongly grouped, because of which a few vector load and store instructions are
wrongly optimized out at -O2.
This patch defines a new predicate, "mve_scatter_memory", which returns TRUE on
matching (mem (reg)) for the MVE scatter store intrinsics.
This patch fixes the issue by adding a define_expand pattern that uses the
"mve_scatter_memory" predicate and calls the corresponding define_insn, passing a
register_operand as the first argument.
This register_operand is extracted, in the define_expand pattern, from the operand
that has the "mve_scatter_memory" predicate.
gcc/ChangeLog:
2020-06-01 Srinath Parvathaneni <srinath.parvathaneni@arm.com>
PR target/94735
* config/arm/predicates.md (mve_scatter_memory): Define to
match (mem (reg)) for scatter store memory.
* config/arm/mve.md (mve_vstrbq_scatter_offset_<supf><mode>): Modify
define_insn to define_expand.
(mve_vstrbq_scatter_offset_p_<supf><mode>): Likewise.
(mve_vstrhq_scatter_offset_p_<supf><mode>): Likewise.
(mve_vstrhq_scatter_offset_<supf><mode>): Likewise.
(mve_vstrhq_scatter_shifted_offset_p_<supf><mode>): Likewise.
(mve_vstrhq_scatter_shifted_offset_<supf><mode>): Likewise.
(mve_vstrdq_scatter_offset_p_<supf>v2di): Likewise.
(mve_vstrdq_scatter_offset_<supf>v2di): Likewise.
(mve_vstrdq_scatter_shifted_offset_p_<supf>v2di): Likewise.
(mve_vstrdq_scatter_shifted_offset_<supf>v2di): Likewise.
(mve_vstrhq_scatter_offset_fv8hf): Likewise.
(mve_vstrhq_scatter_offset_p_fv8hf): Likewise.
(mve_vstrhq_scatter_shifted_offset_fv8hf): Likewise.
(mve_vstrhq_scatter_shifted_offset_p_fv8hf): Likewise.
(mve_vstrwq_scatter_offset_fv4sf): Likewise.
(mve_vstrwq_scatter_offset_p_fv4sf): Likewise.
(mve_vstrwq_scatter_offset_p_<supf>v4si): Likewise.
(mve_vstrwq_scatter_offset_<supf>v4si): Likewise.
(mve_vstrwq_scatter_shifted_offset_fv4sf): Likewise.
(mve_vstrwq_scatter_shifted_offset_p_fv4sf): Likewise.
(mve_vstrwq_scatter_shifted_offset_p_<supf>v4si): Likewise.
(mve_vstrwq_scatter_shifted_offset_<supf>v4si): Likewise.
(mve_vstrbq_scatter_offset_<supf><mode>_insn): Define insn for scatter
stores.
(mve_vstrbq_scatter_offset_p_<supf><mode>_insn): Likewise.
(mve_vstrhq_scatter_offset_p_<supf><mode>_insn): Likewise.
(mve_vstrhq_scatter_offset_<supf><mode>_insn): Likewise.
(mve_vstrhq_scatter_shifted_offset_p_<supf><mode>_insn): Likewise.
(mve_vstrhq_scatter_shifted_offset_<supf><mode>_insn): Likewise.
(mve_vstrdq_scatter_offset_p_<supf>v2di_insn): Likewise.
(mve_vstrdq_scatter_offset_<supf>v2di_insn): Likewise.
(mve_vstrdq_scatter_shifted_offset_p_<supf>v2di_insn): Likewise.
(mve_vstrdq_scatter_shifted_offset_<supf>v2di_insn): Likewise.
(mve_vstrhq_scatter_offset_fv8hf_insn): Likewise.
(mve_vstrhq_scatter_offset_p_fv8hf_insn): Likewise.
(mve_vstrhq_scatter_shifted_offset_fv8hf_insn): Likewise.
(mve_vstrhq_scatter_shifted_offset_p_fv8hf_insn): Likewise.
(mve_vstrwq_scatter_offset_fv4sf_insn): Likewise.
(mve_vstrwq_scatter_offset_p_fv4sf_insn): Likewise.
(mve_vstrwq_scatter_offset_p_<supf>v4si_insn): Likewise.
(mve_vstrwq_scatter_offset_<supf>v4si_insn): Likewise.
(mve_vstrwq_scatter_shifted_offset_fv4sf_insn): Likewise.
(mve_vstrwq_scatter_shifted_offset_p_fv4sf_insn): Likewise.
(mve_vstrwq_scatter_shifted_offset_p_<supf>v4si_insn): Likewise.
(mve_vstrwq_scatter_shifted_offset_<supf>v4si_insn): Likewise.
gcc/testsuite/ChangeLog:
2020-06-01 Srinath Parvathaneni <srinath.parvathaneni@arm.com>
PR target/94735
* gcc.target/arm/mve/intrinsics/mve_vstore_scatter_base.c: New test.
* gcc.target/arm/mve/intrinsics/mve_vstore_scatter_base_p.c: Likewise.
* gcc.target/arm/mve/intrinsics/mve_vstore_scatter_offset.c: Likewise.
* gcc.target/arm/mve/intrinsics/mve_vstore_scatter_offset_p.c: Likewise.
* gcc.target/arm/mve/intrinsics/mve_vstore_scatter_shifted_offset.c:
Likewise.
* gcc.target/arm/mve/intrinsics/mve_vstore_scatter_shifted_offset_p.c:
Likewise.
;;
;; [vstrbq_scatter_offset_s vstrbq_scatter_offset_u]
;;
-(define_insn "mve_vstrbq_scatter_offset_<supf><mode>"
- [(set (match_operand:<MVE_B_ELEM> 0 "memory_operand" "=Us")
- (unspec:<MVE_B_ELEM>
- [(match_operand:MVE_2 1 "s_register_operand" "w")
- (match_operand:MVE_2 2 "s_register_operand" "w")]
- VSTRBSOQ))
- ]
+(define_expand "mve_vstrbq_scatter_offset_<supf><mode>"
+ [(match_operand:<MVE_B_ELEM> 0 "mve_scatter_memory")
+ (match_operand:MVE_2 1 "s_register_operand")
+ (match_operand:MVE_2 2 "s_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRBSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn("vstrb.<V_sz_elem>\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ /* Operand 0 is (mem (reg)): pass its address register to the insn.  */
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrbq_scatter_offset_<supf><mode>_insn (ind, operands[1],
+ operands[2]));
+ DONE;
+})
+
+;; Operand 0 is the base address register supplied by the expander above.
+;; The destination is (mem:BLK (scratch)) rather than a specific address.
+(define_insn "mve_vstrbq_scatter_offset_<supf><mode>_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:MVE_2 1 "s_register_operand" "w")
+ (match_operand:MVE_2 2 "s_register_operand" "w")]
+ VSTRBSOQ))]
+ "TARGET_HAVE_MVE"
+ "vstrb.<V_sz_elem>\t%q2, [%0, %q1]"
[(set_attr "length" "4")])
;;
;;
;; [vstrbq_scatter_offset_p_s vstrbq_scatter_offset_p_u]
;;
-(define_insn "mve_vstrbq_scatter_offset_p_<supf><mode>"
- [(set (match_operand:<MVE_B_ELEM> 0 "memory_operand" "=Us")
- (unspec:<MVE_B_ELEM>
- [(match_operand:MVE_2 1 "s_register_operand" "w")
- (match_operand:MVE_2 2 "s_register_operand" "w")
- (match_operand:HI 3 "vpr_register_operand" "Up")]
- VSTRBSOQ))
- ]
+(define_expand "mve_vstrbq_scatter_offset_p_<supf><mode>"
+ [(match_operand:<MVE_B_ELEM> 0 "mve_scatter_memory")
+ (match_operand:MVE_2 1 "s_register_operand")
+ (match_operand:MVE_2 2 "s_register_operand")
+ (match_operand:HI 3 "vpr_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRBSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vpst\n\tvstrbt.<V_sz_elem>\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ /* Operand 0 is (mem (reg)): pass its address register to the insn.  */
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (
+ gen_mve_vstrbq_scatter_offset_p_<supf><mode>_insn (ind, operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+;; Operand 0 is the base address register supplied by the expander above.
+;; The destination is (mem:BLK (scratch)) rather than a specific address.
+;; Operand 3 is the VPR predicate; the template emits vpst before the store.
+(define_insn "mve_vstrbq_scatter_offset_p_<supf><mode>_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:MVE_2 1 "s_register_operand" "w")
+ (match_operand:MVE_2 2 "s_register_operand" "w")
+ (match_operand:HI 3 "vpr_register_operand" "Up")]
+ VSTRBSOQ))]
+ "TARGET_HAVE_MVE"
+ "vpst\;vstrbt.<V_sz_elem>\t%q2, [%0, %q1]"
[(set_attr "length" "8")])
;;
;;
;; [vstrhq_scatter_offset_p_s vstrhq_scatter_offset_p_u]
;;
-(define_insn "mve_vstrhq_scatter_offset_p_<supf><mode>"
- [(set (match_operand:<MVE_H_ELEM> 0 "memory_operand" "=Us")
- (unspec:<MVE_H_ELEM>
- [(match_operand:MVE_6 1 "s_register_operand" "w")
- (match_operand:MVE_6 2 "s_register_operand" "w")
- (match_operand:HI 3 "vpr_register_operand" "Up")]
- VSTRHSOQ))
- ]
+(define_expand "mve_vstrhq_scatter_offset_p_<supf><mode>"
+ [(match_operand:<MVE_H_ELEM> 0 "mve_scatter_memory")
+ (match_operand:MVE_6 1 "s_register_operand")
+ (match_operand:MVE_6 2 "s_register_operand")
+ (match_operand:HI 3 "vpr_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRHSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vpst\n\tvstrht.<V_sz_elem>\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ /* Operand 0 is (mem (reg)): pass its address register to the insn.  */
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (
+ gen_mve_vstrhq_scatter_offset_p_<supf><mode>_insn (ind, operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+;; Operand 0 is the base address register supplied by the expander above.
+;; The destination is (mem:BLK (scratch)) rather than a specific address.
+;; Operand 3 is the VPR predicate; the template emits vpst before the store.
+(define_insn "mve_vstrhq_scatter_offset_p_<supf><mode>_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:MVE_6 1 "s_register_operand" "w")
+ (match_operand:MVE_6 2 "s_register_operand" "w")
+ (match_operand:HI 3 "vpr_register_operand" "Up")]
+ VSTRHSOQ))]
+ "TARGET_HAVE_MVE"
+ "vpst\;vstrht.<V_sz_elem>\t%q2, [%0, %q1]"
[(set_attr "length" "8")])
;;
;; [vstrhq_scatter_offset_s vstrhq_scatter_offset_u]
;;
-(define_insn "mve_vstrhq_scatter_offset_<supf><mode>"
- [(set (match_operand:<MVE_H_ELEM> 0 "memory_operand" "=Us")
- (unspec:<MVE_H_ELEM>
- [(match_operand:MVE_6 1 "s_register_operand" "w")
- (match_operand:MVE_6 2 "s_register_operand" "w")]
- VSTRHSOQ))
- ]
+(define_expand "mve_vstrhq_scatter_offset_<supf><mode>"
+ [(match_operand:<MVE_H_ELEM> 0 "mve_scatter_memory")
+ (match_operand:MVE_6 1 "s_register_operand")
+ (match_operand:MVE_6 2 "s_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRHSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vstrh.<V_sz_elem>\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ /* Operand 0 is (mem (reg)): pass its address register to the insn.  */
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrhq_scatter_offset_<supf><mode>_insn (ind, operands[1],
+ operands[2]));
+ DONE;
+})
+
+;; Operand 0 is the base address register supplied by the expander above.
+;; The destination is (mem:BLK (scratch)) rather than a specific address.
+(define_insn "mve_vstrhq_scatter_offset_<supf><mode>_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:MVE_6 1 "s_register_operand" "w")
+ (match_operand:MVE_6 2 "s_register_operand" "w")]
+ VSTRHSOQ))]
+ "TARGET_HAVE_MVE"
+ "vstrh.<V_sz_elem>\t%q2, [%0, %q1]"
[(set_attr "length" "4")])
;;
;; [vstrhq_scatter_shifted_offset_p_s vstrhq_scatter_shifted_offset_p_u]
;;
-(define_insn "mve_vstrhq_scatter_shifted_offset_p_<supf><mode>"
- [(set (match_operand:<MVE_H_ELEM> 0 "memory_operand" "=Ux")
- (unspec:<MVE_H_ELEM>
- [(match_operand:MVE_6 1 "s_register_operand" "w")
- (match_operand:MVE_6 2 "s_register_operand" "w")
- (match_operand:HI 3 "vpr_register_operand" "Up")]
- VSTRHSSOQ))
- ]
+(define_expand "mve_vstrhq_scatter_shifted_offset_p_<supf><mode>"
+ [(match_operand:<MVE_H_ELEM> 0 "mve_scatter_memory")
+ (match_operand:MVE_6 1 "s_register_operand")
+ (match_operand:MVE_6 2 "s_register_operand")
+ (match_operand:HI 3 "vpr_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRHSSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vpst\n\tvstrht.<V_sz_elem>\t%q2, [%m0, %q1, uxtw #1]",ops);
- return "";
-}
+ /* Operand 0 is (mem (reg)): pass its address register to the insn.  */
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (
+ gen_mve_vstrhq_scatter_shifted_offset_p_<supf><mode>_insn (ind, operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+;; Operand 0 is the base address register supplied by the expander above.
+;; The destination is (mem:BLK (scratch)) rather than a specific address.
+;; Operand 3 is the VPR predicate; the template emits vpst before the store.
+(define_insn "mve_vstrhq_scatter_shifted_offset_p_<supf><mode>_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:MVE_6 1 "s_register_operand" "w")
+ (match_operand:MVE_6 2 "s_register_operand" "w")
+ (match_operand:HI 3 "vpr_register_operand" "Up")]
+ VSTRHSSOQ))]
+ "TARGET_HAVE_MVE"
+ "vpst\;vstrht.<V_sz_elem>\t%q2, [%0, %q1, uxtw #1]"
[(set_attr "length" "8")])
;;
;; [vstrhq_scatter_shifted_offset_s vstrhq_scatter_shifted_offset_u]
;;
-(define_insn "mve_vstrhq_scatter_shifted_offset_<supf><mode>"
- [(set (match_operand:<MVE_H_ELEM> 0 "memory_operand" "=Us")
- (unspec:<MVE_H_ELEM>
- [(match_operand:MVE_6 1 "s_register_operand" "w")
- (match_operand:MVE_6 2 "s_register_operand" "w")]
- VSTRHSSOQ))
- ]
+(define_expand "mve_vstrhq_scatter_shifted_offset_<supf><mode>"
+ [(match_operand:<MVE_H_ELEM> 0 "mve_scatter_memory")
+ (match_operand:MVE_6 1 "s_register_operand")
+ (match_operand:MVE_6 2 "s_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRHSSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vstrh.<V_sz_elem>\t%q2, [%m0, %q1, uxtw #1]",ops);
- return "";
-}
+ /* Operand 0 is (mem (reg)): pass its address register to the insn.  */
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (
+ gen_mve_vstrhq_scatter_shifted_offset_<supf><mode>_insn (ind, operands[1],
+ operands[2]));
+ DONE;
+})
+
+;; Operand 0 is the base address register supplied by the expander above.
+;; The destination is (mem:BLK (scratch)) rather than a specific address.
+(define_insn "mve_vstrhq_scatter_shifted_offset_<supf><mode>_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:MVE_6 1 "s_register_operand" "w")
+ (match_operand:MVE_6 2 "s_register_operand" "w")]
+ VSTRHSSOQ))]
+ "TARGET_HAVE_MVE"
+ "vstrh.<V_sz_elem>\t%q2, [%0, %q1, uxtw #1]"
[(set_attr "length" "4")])
;;
;;
;; [vstrdq_scatter_offset_p_s vstrdq_scatter_offset_p_u]
;;
-(define_insn "mve_vstrdq_scatter_offset_p_<supf>v2di"
- [(set (match_operand:V2DI 0 "memory_operand" "=Us")
- (unspec:V2DI
- [(match_operand:V2DI 1 "s_register_operand" "w")
- (match_operand:V2DI 2 "s_register_operand" "w")
- (match_operand:HI 3 "vpr_register_operand" "Up")]
- VSTRDSOQ))
- ]
+(define_expand "mve_vstrdq_scatter_offset_p_<supf>v2di"
+ [(match_operand:V2DI 0 "mve_scatter_memory")
+ (match_operand:V2DI 1 "s_register_operand")
+ (match_operand:V2DI 2 "s_register_operand")
+ (match_operand:HI 3 "vpr_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRDSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vpst\;\tvstrdt.64\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ /* Operand 0 is (mem (reg)): pass its address register to the insn.  */
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrdq_scatter_offset_p_<supf>v2di_insn (ind, operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+;; Operand 0 is the base address register supplied by the expander above.
+;; The destination is (mem:BLK (scratch)) rather than a specific address.
+;; Operand 3 is the VPR predicate; the template emits vpst before the store.
+(define_insn "mve_vstrdq_scatter_offset_p_<supf>v2di_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V2DI 1 "s_register_operand" "w")
+ (match_operand:V2DI 2 "s_register_operand" "w")
+ (match_operand:HI 3 "vpr_register_operand" "Up")]
+ VSTRDSOQ))]
+ "TARGET_HAVE_MVE"
+ "vpst\;vstrdt.64\t%q2, [%0, %q1]"
[(set_attr "length" "8")])
;;
;; [vstrdq_scatter_offset_s vstrdq_scatter_offset_u]
;;
-(define_insn "mve_vstrdq_scatter_offset_<supf>v2di"
- [(set (match_operand:V2DI 0 "memory_operand" "=Us")
- (unspec:V2DI
- [(match_operand:V2DI 1 "s_register_operand" "w")
- (match_operand:V2DI 2 "s_register_operand" "w")]
- VSTRDSOQ))
- ]
+(define_expand "mve_vstrdq_scatter_offset_<supf>v2di"
+ [(match_operand:V2DI 0 "mve_scatter_memory")
+ (match_operand:V2DI 1 "s_register_operand")
+ (match_operand:V2DI 2 "s_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRDSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vstrd.64\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ /* Operand 0 is (mem (reg)): pass its address register to the insn.  */
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrdq_scatter_offset_<supf>v2di_insn (ind, operands[1],
+ operands[2]));
+ DONE;
+})
+
+;; Operand 0 is the base address register supplied by the expander above.
+;; The destination is (mem:BLK (scratch)) rather than a specific address.
+(define_insn "mve_vstrdq_scatter_offset_<supf>v2di_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V2DI 1 "s_register_operand" "w")
+ (match_operand:V2DI 2 "s_register_operand" "w")]
+ VSTRDSOQ))]
+ "TARGET_HAVE_MVE"
+ "vstrd.64\t%q2, [%0, %q1]"
[(set_attr "length" "4")])
;;
;; [vstrdq_scatter_shifted_offset_p_s vstrdq_scatter_shifted_offset_p_u]
;;
-(define_insn "mve_vstrdq_scatter_shifted_offset_p_<supf>v2di"
- [(set (match_operand:V2DI 0 "memory_operand" "=Us")
- (unspec:V2DI
- [(match_operand:V2DI 1 "s_register_operand" "w")
- (match_operand:V2DI 2 "s_register_operand" "w")
- (match_operand:HI 3 "vpr_register_operand" "Up")]
- VSTRDSSOQ))
- ]
+(define_expand "mve_vstrdq_scatter_shifted_offset_p_<supf>v2di"
+ [(match_operand:V2DI 0 "mve_scatter_memory")
+ (match_operand:V2DI 1 "s_register_operand")
+ (match_operand:V2DI 2 "s_register_operand")
+ (match_operand:HI 3 "vpr_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRDSSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vpst\;\tvstrdt.64\t%q2, [%m0, %q1, UXTW #3]",ops);
- return "";
-}
+ /* Operand 0 is (mem (reg)): pass its address register to the insn.  */
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (
+ gen_mve_vstrdq_scatter_shifted_offset_p_<supf>v2di_insn (ind, operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+;; Operand 0 is the base address register supplied by the expander above.
+;; The destination is (mem:BLK (scratch)) rather than a specific address.
+;; Operand 3 is the VPR predicate; the template emits vpst before the store.
+(define_insn "mve_vstrdq_scatter_shifted_offset_p_<supf>v2di_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V2DI 1 "s_register_operand" "w")
+ (match_operand:V2DI 2 "s_register_operand" "w")
+ (match_operand:HI 3 "vpr_register_operand" "Up")]
+ VSTRDSSOQ))]
+ "TARGET_HAVE_MVE"
+ "vpst\;vstrdt.64\t%q2, [%0, %q1, UXTW #3]"
[(set_attr "length" "8")])
;;
;; [vstrdq_scatter_shifted_offset_s vstrdq_scatter_shifted_offset_u]
;;
-(define_insn "mve_vstrdq_scatter_shifted_offset_<supf>v2di"
- [(set (match_operand:V2DI 0 "memory_operand" "=Us")
- (unspec:V2DI
- [(match_operand:V2DI 1 "s_register_operand" "w")
- (match_operand:V2DI 2 "s_register_operand" "w")]
- VSTRDSSOQ))
- ]
+(define_expand "mve_vstrdq_scatter_shifted_offset_<supf>v2di"
+ [(match_operand:V2DI 0 "mve_scatter_memory")
+ (match_operand:V2DI 1 "s_register_operand")
+ (match_operand:V2DI 2 "s_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRDSSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vstrd.64\t%q2, [%m0, %q1, UXTW #3]",ops);
- return "";
-}
+ /* Operand 0 is (mem (reg)): pass its address register to the insn.  */
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (
+ gen_mve_vstrdq_scatter_shifted_offset_<supf>v2di_insn (ind, operands[1],
+ operands[2]));
+ DONE;
+})
+
+;; Operand 0 is the base address register supplied by the expander above.
+;; The destination is (mem:BLK (scratch)) rather than a specific address.
+(define_insn "mve_vstrdq_scatter_shifted_offset_<supf>v2di_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V2DI 1 "s_register_operand" "w")
+ (match_operand:V2DI 2 "s_register_operand" "w")]
+ VSTRDSSOQ))]
+ "TARGET_HAVE_MVE"
+ "vstrd.64\t%q2, [%0, %q1, UXTW #3]"
[(set_attr "length" "4")])
;;
;; [vstrhq_scatter_offset_f]
;;
-(define_insn "mve_vstrhq_scatter_offset_fv8hf"
- [(set (match_operand:V8HI 0 "memory_operand" "=Us")
- (unspec:V8HI
- [(match_operand:V8HI 1 "s_register_operand" "w")
- (match_operand:V8HF 2 "s_register_operand" "w")]
- VSTRHQSO_F))
- ]
+(define_expand "mve_vstrhq_scatter_offset_fv8hf"
+ [(match_operand:V8HI 0 "mve_scatter_memory")
+ (match_operand:V8HI 1 "s_register_operand")
+ (match_operand:V8HF 2 "s_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRHQSO_F)]
"TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vstrh.16\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ /* Operand 0 is (mem (reg)): pass its address register to the insn.  */
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrhq_scatter_offset_fv8hf_insn (ind, operands[1],
+ operands[2]));
+ DONE;
+})
+
+;; Operand 0 is the base address register supplied by the expander above.
+;; The destination is (mem:BLK (scratch)) rather than a specific address.
+(define_insn "mve_vstrhq_scatter_offset_fv8hf_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V8HI 1 "s_register_operand" "w")
+ (match_operand:V8HF 2 "s_register_operand" "w")]
+ VSTRHQSO_F))]
+ "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
+ "vstrh.16\t%q2, [%0, %q1]"
[(set_attr "length" "4")])
;;
;; [vstrhq_scatter_offset_p_f]
;;
-(define_insn "mve_vstrhq_scatter_offset_p_fv8hf"
- [(set (match_operand:V8HI 0 "memory_operand" "=Us")
- (unspec:V8HI
- [(match_operand:V8HI 1 "s_register_operand" "w")
- (match_operand:V8HF 2 "s_register_operand" "w")
- (match_operand:HI 3 "vpr_register_operand" "Up")]
- VSTRHQSO_F))
- ]
+(define_expand "mve_vstrhq_scatter_offset_p_fv8hf"
+ [(match_operand:V8HI 0 "mve_scatter_memory")
+ (match_operand:V8HI 1 "s_register_operand")
+ (match_operand:V8HF 2 "s_register_operand")
+ (match_operand:HI 3 "vpr_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRHQSO_F)]
"TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vpst\n\tvstrht.16\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ /* Operand 0 is (mem (reg)): pass its address register to the insn.  */
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrhq_scatter_offset_p_fv8hf_insn (ind, operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+;; Operand 0 is the base address register supplied by the expander above.
+;; The destination is (mem:BLK (scratch)) rather than a specific address.
+;; Operand 3 is the VPR predicate; the template emits vpst before the store.
+(define_insn "mve_vstrhq_scatter_offset_p_fv8hf_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V8HI 1 "s_register_operand" "w")
+ (match_operand:V8HF 2 "s_register_operand" "w")
+ (match_operand:HI 3 "vpr_register_operand" "Up")]
+ VSTRHQSO_F))]
+ "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
+ "vpst\;vstrht.16\t%q2, [%0, %q1]"
[(set_attr "length" "8")])
;;
;; [vstrhq_scatter_shifted_offset_f]
;;
-(define_insn "mve_vstrhq_scatter_shifted_offset_fv8hf"
- [(set (match_operand:V8HI 0 "memory_operand" "=Us")
- (unspec:V8HI
- [(match_operand:V8HI 1 "s_register_operand" "w")
- (match_operand:V8HF 2 "s_register_operand" "w")]
- VSTRHQSSO_F))
- ]
+(define_expand "mve_vstrhq_scatter_shifted_offset_fv8hf"
+ [(match_operand:V8HI 0 "mve_scatter_memory")
+ (match_operand:V8HI 1 "s_register_operand")
+ (match_operand:V8HF 2 "s_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRHQSSO_F)]
"TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vstrh.16\t%q2, [%m0, %q1, uxtw #1]",ops);
- return "";
-}
+ /* Operand 0 is (mem (reg)): pass its address register to the insn.  */
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrhq_scatter_shifted_offset_fv8hf_insn (ind, operands[1],
+ operands[2]));
+ DONE;
+})
+
+;; Operand 0 is the base address register supplied by the expander above.
+;; The destination is (mem:BLK (scratch)) rather than a specific address.
+(define_insn "mve_vstrhq_scatter_shifted_offset_fv8hf_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V8HI 1 "s_register_operand" "w")
+ (match_operand:V8HF 2 "s_register_operand" "w")]
+ VSTRHQSSO_F))]
+ "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
+ "vstrh.16\t%q2, [%0, %q1, uxtw #1]"
[(set_attr "length" "4")])
;;
;; [vstrhq_scatter_shifted_offset_p_f]
;;
-(define_insn "mve_vstrhq_scatter_shifted_offset_p_fv8hf"
- [(set (match_operand:V8HI 0 "memory_operand" "=Us")
- (unspec:V8HI
- [(match_operand:V8HI 1 "s_register_operand" "w")
- (match_operand:V8HF 2 "s_register_operand" "w")
- (match_operand:HI 3 "vpr_register_operand" "Up")]
- VSTRHQSSO_F))
- ]
+(define_expand "mve_vstrhq_scatter_shifted_offset_p_fv8hf"
+ [(match_operand:V8HI 0 "mve_scatter_memory")
+ (match_operand:V8HI 1 "s_register_operand")
+ (match_operand:V8HF 2 "s_register_operand")
+ (match_operand:HI 3 "vpr_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRHQSSO_F)]
"TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vpst\n\tvstrht.16\t%q2, [%m0, %q1, uxtw #1]",ops);
- return "";
-}
+ /* Operand 0 is (mem (reg)): pass its address register to the insn.  */
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (
+ gen_mve_vstrhq_scatter_shifted_offset_p_fv8hf_insn (ind, operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+;; Operand 0 is the base address register supplied by the expander above.
+;; The destination is (mem:BLK (scratch)) rather than a specific address.
+;; Operand 3 is the VPR predicate; the template emits vpst before the store.
+(define_insn "mve_vstrhq_scatter_shifted_offset_p_fv8hf_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V8HI 1 "s_register_operand" "w")
+ (match_operand:V8HF 2 "s_register_operand" "w")
+ (match_operand:HI 3 "vpr_register_operand" "Up")]
+ VSTRHQSSO_F))]
+ "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
+ "vpst\;vstrht.16\t%q2, [%0, %q1, uxtw #1]"
[(set_attr "length" "8")])
;;
;;
;; [vstrwq_scatter_offset_f]
;;
-(define_insn "mve_vstrwq_scatter_offset_fv4sf"
- [(set (match_operand:V4SI 0 "memory_operand" "=Us")
- (unspec:V4SI
- [(match_operand:V4SI 1 "s_register_operand" "w")
- (match_operand:V4SF 2 "s_register_operand" "w")]
- VSTRWQSO_F))
- ]
+(define_expand "mve_vstrwq_scatter_offset_fv4sf"
+ [(match_operand:V4SI 0 "mve_scatter_memory")
+ (match_operand:V4SI 1 "s_register_operand")
+ (match_operand:V4SF 2 "s_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRWQSO_F)]
"TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vstrw.32\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ /* Operand 0 is (mem (reg)): pass its address register to the insn.  */
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrwq_scatter_offset_fv4sf_insn (ind, operands[1],
+ operands[2]));
+ DONE;
+})
+
+;; Operand 0 is the base address register supplied by the expander above.
+;; The destination is (mem:BLK (scratch)) rather than a specific address.
+(define_insn "mve_vstrwq_scatter_offset_fv4sf_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V4SI 1 "s_register_operand" "w")
+ (match_operand:V4SF 2 "s_register_operand" "w")]
+ VSTRWQSO_F))]
+ "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
+ "vstrw.32\t%q2, [%0, %q1]"
[(set_attr "length" "4")])
;;
;; [vstrwq_scatter_offset_p_f]
;;
-(define_insn "mve_vstrwq_scatter_offset_p_fv4sf"
- [(set (match_operand:V4SI 0 "memory_operand" "=Us")
- (unspec:V4SI
- [(match_operand:V4SI 1 "s_register_operand" "w")
- (match_operand:V4SF 2 "s_register_operand" "w")
- (match_operand:HI 3 "vpr_register_operand" "Up")]
- VSTRWQSO_F))
- ]
+(define_expand "mve_vstrwq_scatter_offset_p_fv4sf"
+ [(match_operand:V4SI 0 "mve_scatter_memory")
+ (match_operand:V4SI 1 "s_register_operand")
+ (match_operand:V4SF 2 "s_register_operand")
+ (match_operand:HI 3 "vpr_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRWQSO_F)]
"TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vpst\n\tvstrwt.32\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ /* Operand 0 is (mem (reg)): pass its address register to the insn.  */
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrwq_scatter_offset_p_fv4sf_insn (ind, operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+;; Operand 0 is the base address register supplied by the expander above.
+;; The destination is (mem:BLK (scratch)) rather than a specific address.
+;; Operand 3 is the VPR predicate; the template emits vpst before the store.
+(define_insn "mve_vstrwq_scatter_offset_p_fv4sf_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V4SI 1 "s_register_operand" "w")
+ (match_operand:V4SF 2 "s_register_operand" "w")
+ (match_operand:HI 3 "vpr_register_operand" "Up")]
+ VSTRWQSO_F))]
+ "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
+ "vpst\;vstrwt.32\t%q2, [%0, %q1]"
[(set_attr "length" "8")])
;;
;; [vstrwq_scatter_offset_p_s vstrwq_scatter_offset_p_u]
;;
-(define_insn "mve_vstrwq_scatter_offset_p_<supf>v4si"
- [(set (match_operand:V4SI 0 "memory_operand" "=Us")
- (unspec:V4SI
- [(match_operand:V4SI 1 "s_register_operand" "w")
- (match_operand:V4SI 2 "s_register_operand" "w")
- (match_operand:HI 3 "vpr_register_operand" "Up")]
- VSTRWSOQ))
- ]
+(define_expand "mve_vstrwq_scatter_offset_p_<supf>v4si"
+ [(match_operand:V4SI 0 "mve_scatter_memory")
+ (match_operand:V4SI 1 "s_register_operand")
+ (match_operand:V4SI 2 "s_register_operand")
+ (match_operand:HI 3 "vpr_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRWSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vpst\n\tvstrwt.32\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ /* Operand 0 is (mem (reg)): pass its address register to the insn.  */
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrwq_scatter_offset_p_<supf>v4si_insn (ind, operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+;; Operand 0 is the base address register supplied by the expander above.
+;; The destination is (mem:BLK (scratch)) rather than a specific address.
+;; Operand 3 is the VPR predicate; the template emits vpst before the store.
+(define_insn "mve_vstrwq_scatter_offset_p_<supf>v4si_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V4SI 1 "s_register_operand" "w")
+ (match_operand:V4SI 2 "s_register_operand" "w")
+ (match_operand:HI 3 "vpr_register_operand" "Up")]
+ VSTRWSOQ))]
+ "TARGET_HAVE_MVE"
+ "vpst\;vstrwt.32\t%q2, [%0, %q1]"
[(set_attr "length" "8")])
;;
;; [vstrwq_scatter_offset_s vstrwq_scatter_offset_u]
;;
-(define_insn "mve_vstrwq_scatter_offset_<supf>v4si"
- [(set (match_operand:V4SI 0 "memory_operand" "=Us")
- (unspec:V4SI
- [(match_operand:V4SI 1 "s_register_operand" "w")
- (match_operand:V4SI 2 "s_register_operand" "w")]
- VSTRWSOQ))
- ]
+(define_expand "mve_vstrwq_scatter_offset_<supf>v4si"
+ [(match_operand:V4SI 0 "mve_scatter_memory")
+ (match_operand:V4SI 1 "s_register_operand")
+ (match_operand:V4SI 2 "s_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRWSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vstrw.32\t%q2, [%m0, %q1]",ops);
- return "";
-}
+ /* Operand 0 is (mem (reg)): pass its address register to the insn.  */
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrwq_scatter_offset_<supf>v4si_insn (ind, operands[1],
+ operands[2]));
+ DONE;
+})
+
+;; Operand 0 is the base address register supplied by the expander above.
+;; The destination is (mem:BLK (scratch)) rather than a specific address.
+(define_insn "mve_vstrwq_scatter_offset_<supf>v4si_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V4SI 1 "s_register_operand" "w")
+ (match_operand:V4SI 2 "s_register_operand" "w")]
+ VSTRWSOQ))]
+ "TARGET_HAVE_MVE"
+ "vstrw.32\t%q2, [%0, %q1]"
[(set_attr "length" "4")])
;;
;; [vstrwq_scatter_shifted_offset_f]
;;
-(define_insn "mve_vstrwq_scatter_shifted_offset_fv4sf"
- [(set (match_operand:V4SI 0 "memory_operand" "=Us")
- (unspec:V4SI
- [(match_operand:V4SI 1 "s_register_operand" "w")
- (match_operand:V4SF 2 "s_register_operand" "w")]
- VSTRWQSSO_F))
- ]
+(define_expand "mve_vstrwq_scatter_shifted_offset_fv4sf"
+ [(match_operand:V4SI 0 "mve_scatter_memory")
+ (match_operand:V4SI 1 "s_register_operand")
+ (match_operand:V4SF 2 "s_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRWQSSO_F)]
"TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vstrw.32\t%q2, [%m0, %q1, uxtw #2]",ops);
- return "";
-}
+ /* Operand 0 is (mem (reg)): pass its address register to the insn.  */
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (gen_mve_vstrwq_scatter_shifted_offset_fv4sf_insn (ind, operands[1],
+ operands[2]));
+ DONE;
+})
+
+;; Operand 0 is the base address register supplied by the expander above.
+;; The destination is (mem:BLK (scratch)) rather than a specific address.
+;; The template is a single unpredicated vstrw, so the length stays 4.
+(define_insn "mve_vstrwq_scatter_shifted_offset_fv4sf_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V4SI 1 "s_register_operand" "w")
+ (match_operand:V4SF 2 "s_register_operand" "w")]
+ VSTRWQSSO_F))]
+ "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
+ "vstrw.32\t%q2, [%0, %q1, uxtw #2]"
[(set_attr "length" "4")])
;;
;; [vstrwq_scatter_shifted_offset_p_f]
;;
-(define_insn "mve_vstrwq_scatter_shifted_offset_p_fv4sf"
- [(set (match_operand:V4SI 0 "memory_operand" "=Us")
- (unspec:V4SI
- [(match_operand:V4SI 1 "s_register_operand" "w")
- (match_operand:V4SF 2 "s_register_operand" "w")
- (match_operand:HI 3 "vpr_register_operand" "Up")]
- VSTRWQSSO_F))
- ]
+(define_expand "mve_vstrwq_scatter_shifted_offset_p_fv4sf"
+ [(match_operand:V4SI 0 "mve_scatter_memory")
+ (match_operand:V4SI 1 "s_register_operand")
+ (match_operand:V4SF 2 "s_register_operand")
+ (match_operand:HI 3 "vpr_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRWQSSO_F)]
"TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vpst\;\tvstrwt.32\t%q2, [%m0, %q1, uxtw #2]",ops);
- return "";
-}
+ /* Operand 0 is (mem (reg)): pass its address register to the insn.  */
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (
+ gen_mve_vstrwq_scatter_shifted_offset_p_fv4sf_insn (ind, operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+;; Operand 0 is the base address register supplied by the expander above.
+;; The destination is (mem:BLK (scratch)) rather than a specific address.
+;; Operand 3 is the VPR predicate; the template emits vpst before the store.
+(define_insn "mve_vstrwq_scatter_shifted_offset_p_fv4sf_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V4SI 1 "s_register_operand" "w")
+ (match_operand:V4SF 2 "s_register_operand" "w")
+ (match_operand:HI 3 "vpr_register_operand" "Up")]
+ VSTRWQSSO_F))]
+ "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
+ "vpst\;vstrwt.32\t%q2, [%0, %q1, uxtw #2]"
[(set_attr "length" "8")])
;;
;; [vstrwq_scatter_shifted_offset_p_s vstrwq_scatter_shifted_offset_p_u]
;;
-(define_insn "mve_vstrwq_scatter_shifted_offset_p_<supf>v4si"
- [(set (match_operand:V4SI 0 "memory_operand" "=Us")
- (unspec:V4SI
- [(match_operand:V4SI 1 "s_register_operand" "w")
- (match_operand:V4SI 2 "s_register_operand" "w")
- (match_operand:HI 3 "vpr_register_operand" "Up")]
- VSTRWSSOQ))
- ]
+;; Expander for the predicated vstrwq scatter store with shifted offsets
+;; (V4SI data).  Operand 0 has to satisfy mve_scatter_memory; its address
+;; is taken out and passed, together with operands 1-3, to the pattern
+;; mve_vstrwq_scatter_shifted_offset_p_<supf>v4si_insn.
+(define_expand "mve_vstrwq_scatter_shifted_offset_p_<supf>v4si"
+ [(match_operand:V4SI 0 "mve_scatter_memory")
+ (match_operand:V4SI 1 "s_register_operand")
+ (match_operand:V4SI 2 "s_register_operand")
+ (match_operand:HI 3 "vpr_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRWSSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vpst\;\tvstrwt.32\t%q2, [%m0, %q1, uxtw #2]",ops);
- return "";
-}
+ /* Address expression of the destination MEM; it must be a register.  */
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (
+ gen_mve_vstrwq_scatter_shifted_offset_p_<supf>v4si_insn (ind, operands[1],
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+;; Predicated word scatter store with shifted offsets (V4SI data).
+;; Operand 0 is the base address register, operand 1 the offset vector,
+;; operand 2 the vector being stored and operand 3 the VPR predicate.
+;; The destination is written as (mem:BLK (scratch)) rather than as a
+;; specific MEM operand -- presumably so the store is not tied to one
+;; known location; confirm against PR target/94735.
+;; The template outputs vpst followed by vstrwt.32.
+(define_insn "mve_vstrwq_scatter_shifted_offset_p_<supf>v4si_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V4SI 1 "s_register_operand" "w")
+ (match_operand:V4SI 2 "s_register_operand" "w")
+ (match_operand:HI 3 "vpr_register_operand" "Up")]
+ VSTRWSSOQ))]
+ "TARGET_HAVE_MVE"
+ "vpst\;vstrwt.32\t%q2, [%0, %q1, uxtw #2]"
[(set_attr "length" "8")])
;;
;; [vstrwq_scatter_shifted_offset_s vstrwq_scatter_shifted_offset_u]
;;
-(define_insn "mve_vstrwq_scatter_shifted_offset_<supf>v4si"
- [(set (match_operand:V4SI 0 "memory_operand" "=Us")
- (unspec:V4SI
- [(match_operand:V4SI 1 "s_register_operand" "w")
- (match_operand:V4SI 2 "s_register_operand" "w")]
- VSTRWSSOQ))
- ]
+;; Expander for the unpredicated vstrwq scatter store with shifted offsets
+;; (V4SI data).  Operand 0 has to satisfy mve_scatter_memory; its address
+;; is taken out and passed, together with operands 1 and 2, to the pattern
+;; mve_vstrwq_scatter_shifted_offset_<supf>v4si_insn.
+(define_expand "mve_vstrwq_scatter_shifted_offset_<supf>v4si"
+ [(match_operand:V4SI 0 "mve_scatter_memory")
+ (match_operand:V4SI 1 "s_register_operand")
+ (match_operand:V4SI 2 "s_register_operand")
+ (unspec:V4SI [(const_int 0)] VSTRWSSOQ)]
"TARGET_HAVE_MVE"
{
- rtx ops[3];
- ops[0] = operands[0];
- ops[1] = operands[1];
- ops[2] = operands[2];
- output_asm_insn ("vstrw.32\t%q2, [%m0, %q1, uxtw #2]",ops);
- return "";
-}
+ /* Address expression of the destination MEM; it must be a register.  */
+ rtx ind = XEXP (operands[0], 0);
+ gcc_assert (REG_P (ind));
+ emit_insn (
+ gen_mve_vstrwq_scatter_shifted_offset_<supf>v4si_insn (ind, operands[1],
+ operands[2]));
+ DONE;
+})
+
+;; Unpredicated word scatter store with shifted offsets (V4SI data).
+;; Operand 0 is the base address register, operand 1 the offset vector
+;; and operand 2 the vector being stored.  The destination is written as
+;; (mem:BLK (scratch)) rather than as a specific MEM operand --
+;; presumably so the store is not tied to one known location; confirm
+;; against PR target/94735.
+;; The template outputs a single vstrw.32.
+(define_insn "mve_vstrwq_scatter_shifted_offset_<supf>v4si_insn"
+ [(set (mem:BLK (scratch))
+ (unspec:BLK
+ [(match_operand:SI 0 "register_operand" "r")
+ (match_operand:V4SI 1 "s_register_operand" "w")
+ (match_operand:V4SI 2 "s_register_operand" "w")]
+ VSTRWSSOQ))]
+ "TARGET_HAVE_MVE"
+ "vstrw.32\t%q2, [%0, %q1, uxtw #2]"
[(set_attr "length" "4")])
;;
&& mve_vector_mem_operand (GET_MODE (op), XEXP (op, 0),
false)")))
+;; True for a MEM whose address is a plain register and which
+;; mve_vector_mem_operand accepts for the MEM's mode (called with FALSE
+;; as its last argument), when TARGET_HAVE_MVE holds.
+(define_predicate "mve_scatter_memory"
+ (and (match_code "mem")
+ (match_test "TARGET_HAVE_MVE && REG_P (XEXP (op, 0))
+ && mve_vector_mem_operand (GET_MODE (op), XEXP (op, 0),
+ false)")))
+
;; True for immediates in the range of 1 to 16 for MVE.
(define_predicate "mve_imm_16"
(match_test "satisfies_constraint_Rd (op)"))
--- /dev/null
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+int
+foows32(uint32x4_t pDataDest, int32x4_t value, int32_t * ret)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ vstrwq_scatter_base_s32 (pDataDest, 4, value);
+ vstrwq_scatter_base_s32 (pDataDest, 132, value);
+ vstrwq_scatter_offset_s32 (ret, vecOffs1, (int32x4_t) pDataDest);
+ vstrwq_scatter_offset_s32 (ret, vecOffs2, (int32x4_t) pDataDest);
+ return 0;
+}
+
+int
+foowu32(uint32x4_t pDataDest, uint32x4_t value, int32_t * ret)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ vstrwq_scatter_base_u32 (pDataDest, 4, value);
+ vstrwq_scatter_base_u32 (pDataDest, 132, value);
+ vstrwq_scatter_offset_s32 (ret, vecOffs1, (int32x4_t) pDataDest);
+ vstrwq_scatter_offset_s32 (ret, vecOffs2, (int32x4_t) pDataDest);
+ return 0;
+}
+
+int
+foowf32(uint32x4_t pDataDest, float32x4_t value, int32_t * ret)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ vstrwq_scatter_base_f32 (pDataDest, 4, value);
+ vstrwq_scatter_base_f32 (pDataDest, 132, value);
+ vstrwq_scatter_offset_s32 (ret, vecOffs1, (int32x4_t) pDataDest);
+ vstrwq_scatter_offset_s32 (ret, vecOffs2, (int32x4_t) pDataDest);
+ return 0;
+}
+
+int
+foods64(uint64x2_t pDataDest, int64x2_t value, int32_t * ret)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ vstrdq_scatter_base_s64 (pDataDest, 256, value);
+ vstrdq_scatter_base_s64 (pDataDest, 512, value);
+ vstrwq_scatter_offset_s32 (ret, vecOffs1, (int32x4_t) pDataDest);
+ vstrwq_scatter_offset_s32 (ret, vecOffs2, (int32x4_t) pDataDest);
+ return 0;
+}
+
+int
+foodu64(uint64x2_t pDataDest, uint64x2_t value, int32_t * ret)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ vstrdq_scatter_base_u64 (pDataDest, 256, value);
+ vstrdq_scatter_base_u64 (pDataDest, 512, value);
+ vstrwq_scatter_offset_s32 (ret, vecOffs1, (int32x4_t) pDataDest);
+ vstrwq_scatter_offset_s32 (ret, vecOffs2, (int32x4_t) pDataDest);
+ return 0;
+}
+
+/* { dg-final { scan-assembler-times "vstr\[a-z\]" 20 } } */
--- /dev/null
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+mve_pred16_t __p;
+
+int
+foows32(uint32x4_t pDataDest, int32x4_t value, int32_t * ret)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ vstrwq_scatter_base_p_s32 (pDataDest, 4, value, __p);
+ vstrwq_scatter_base_p_s32 (pDataDest, 132, value, __p);
+ vstrwq_scatter_offset_p_s32 (ret, vecOffs1, (int32x4_t) pDataDest, __p);
+ vstrwq_scatter_offset_p_s32 (ret, vecOffs2, (int32x4_t) pDataDest, __p);
+ return 0;
+}
+
+int
+foowu32(uint32x4_t pDataDest, uint32x4_t value, int32_t * ret)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ vstrwq_scatter_base_p_u32 (pDataDest, 4, value, __p);
+ vstrwq_scatter_base_p_u32 (pDataDest, 132, value, __p);
+ vstrwq_scatter_offset_p_s32 (ret, vecOffs1, (int32x4_t) pDataDest, __p);
+ vstrwq_scatter_offset_p_s32 (ret, vecOffs2, (int32x4_t) pDataDest, __p);
+ return 0;
+}
+
+int
+foowf32(uint32x4_t pDataDest, float32x4_t value, int32_t * ret)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ vstrwq_scatter_base_p_f32 (pDataDest, 4, value, __p);
+ vstrwq_scatter_base_p_f32 (pDataDest, 132, value, __p);
+ vstrwq_scatter_offset_p_s32 (ret, vecOffs1, (int32x4_t) pDataDest, __p);
+ vstrwq_scatter_offset_p_s32 (ret, vecOffs2, (int32x4_t) pDataDest, __p);
+ return 0;
+}
+
+int
+foods64(uint64x2_t pDataDest, int64x2_t value, int32_t * ret)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ vstrdq_scatter_base_p_s64 (pDataDest, 256, value, __p);
+ vstrdq_scatter_base_p_s64 (pDataDest, 512, value, __p);
+ vstrwq_scatter_offset_p_s32 (ret, vecOffs1, (int32x4_t) pDataDest, __p);
+ vstrwq_scatter_offset_p_s32 (ret, vecOffs2, (int32x4_t) pDataDest, __p);
+ return 0;
+}
+
+int
+foodu64(uint64x2_t pDataDest, uint64x2_t value, int32_t * ret)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ vstrdq_scatter_base_p_u64 (pDataDest, 256, value, __p);
+ vstrdq_scatter_base_p_u64 (pDataDest, 512, value, __p);
+ vstrwq_scatter_offset_p_s32 (ret, vecOffs1, (int32x4_t) pDataDest, __p);
+ vstrwq_scatter_offset_p_s32 (ret, vecOffs2, (int32x4_t) pDataDest, __p);
+ return 0;
+}
+
+/* { dg-final { scan-assembler-times "vstr\[a-z\]t" 20 } } */
--- /dev/null
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+int
+foobu8( uint8_t * pDataSrc, uint8_t * pDataDest)
+{
+ const uint8x16_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5, 9, 11, 13, 10, 12, 15, 8, 14};
+ const uint8x16_t vecOffs2 = { 31, 29, 27, 25, 23, 28, 21, 26, 19, 24, 17, 22, 16, 20, 18, 30};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[16]);
+ vstrbq_scatter_offset_u8 (pDataDest, vecOffs1, (uint8x16_t) vecIn1);
+ vstrbq_scatter_offset_u8 (pDataDest, vecOffs2, (uint8x16_t) vecIn2);
+ pDataDest[32] = pDataSrc[32];
+ return 0;
+}
+
+int
+foobu16( uint8_t * pDataSrc, uint8_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[8]);
+ vstrbq_scatter_offset_u16 (pDataDest, vecOffs1, (uint16x8_t) vecIn1);
+ vstrbq_scatter_offset_u16 (pDataDest, vecOffs2, (uint16x8_t) vecIn2);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foobu32( uint8_t * pDataSrc, uint8_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
+ vstrbq_scatter_offset_u32 (pDataDest, vecOffs1, (uint32x4_t) vecIn1);
+ vstrbq_scatter_offset_u32 (pDataDest, vecOffs2, (uint32x4_t) vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foobs8( int8_t * pDataSrc, int8_t * pDataDest)
+{
+ const uint8x16_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5, 9, 11, 13, 10, 12, 15, 8, 14};
+ const uint8x16_t vecOffs2 = { 31, 29, 27, 25, 23, 28, 21, 26, 19, 24, 17, 22, 16, 20, 18, 30};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[16]);
+ vstrbq_scatter_offset_s8 (pDataDest, vecOffs1, (int8x16_t) vecIn1);
+ vstrbq_scatter_offset_s8 (pDataDest, vecOffs2, (int8x16_t) vecIn2);
+ pDataDest[32] = pDataSrc[32];
+ return 0;
+}
+
+int
+foobs16( int8_t * pDataSrc, int8_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[8]);
+ vstrbq_scatter_offset_s16 (pDataDest, vecOffs1, (int16x8_t) vecIn1);
+ vstrbq_scatter_offset_s16 (pDataDest, vecOffs2, (int16x8_t) vecIn2);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foobs32( uint8_t * pDataSrc, int8_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[4]);
+ vstrbq_scatter_offset_s32 (pDataDest, vecOffs1, (int32x4_t) vecIn1);
+ vstrbq_scatter_offset_s32 (pDataDest, vecOffs2, (int32x4_t) vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foohu16( uint16_t * pDataSrc, uint16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[8]);
+ vstrhq_scatter_offset_u16 (pDataDest, vecOffs1, (uint16x8_t) vecIn1);
+ vstrhq_scatter_offset_u16 (pDataDest, vecOffs2, (uint16x8_t) vecIn2);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foohu32( uint16_t * pDataSrc, uint16_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
+ vstrhq_scatter_offset_u32 (pDataDest, vecOffs1, (uint32x4_t) vecIn1);
+ vstrhq_scatter_offset_u32 (pDataDest, vecOffs2, (uint32x4_t) vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foohs16( int16_t * pDataSrc, int16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[8]);
+ vstrhq_scatter_offset_s16 (pDataDest, vecOffs1, (int16x8_t) vecIn1);
+ vstrhq_scatter_offset_s16 (pDataDest, vecOffs2, (int16x8_t) vecIn2);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foohs32( uint16_t * pDataSrc, int16_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[4]);
+ vstrhq_scatter_offset_s32 (pDataDest, vecOffs1, (int32x4_t) vecIn1);
+ vstrhq_scatter_offset_s32 (pDataDest, vecOffs2, (int32x4_t) vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foohf16( float16_t * pDataSrc, float16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[8]);
+ vstrhq_scatter_offset_f16 (pDataDest, vecOffs1, (float16x8_t) vecIn1);
+ vstrhq_scatter_offset_f16 (pDataDest, vecOffs2, (float16x8_t) vecIn2);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foowu32( uint32_t * pDataSrc, uint32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
+ vstrwq_scatter_offset_u32 (pDataDest, vecOffs1, (uint32x4_t) vecIn1);
+ vstrwq_scatter_offset_u32 (pDataDest, vecOffs2, (uint32x4_t) vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foows32( int32_t * pDataSrc, int32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
+ vstrwq_scatter_offset_s32 (pDataDest, vecOffs1, (int32x4_t) vecIn1);
+ vstrwq_scatter_offset_s32 (pDataDest, vecOffs2, (int32x4_t) vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+/* Two word scatter stores of float data through pDataDest, followed by a
+   scalar copy of element 8.  The two input vectors are loaded from
+   pDataSrc[0..3] and pDataSrc[4..7], the same layout as the other 32-bit
+   cases in this file (foowu32, foows32), so the scalar copy reads the
+   element just past the vector inputs.  */
+int
+foowf32( float32_t * pDataSrc, float32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
+ vstrwq_scatter_offset_f32 (pDataDest, vecOffs1, (float32x4_t) vecIn1);
+ vstrwq_scatter_offset_f32 (pDataDest, vecOffs2, (float32x4_t) vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foowu64( uint64_t * pDataSrc, uint64_t * pDataDest)
+{
+ const uint64x2_t vecOffs1 = { 0, 3};
+ const uint64x2_t vecOffs2 = { 1, 2};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[2]);
+ vstrdq_scatter_offset_u64 (pDataDest, vecOffs1, (uint64x2_t) vecIn1);
+ vstrdq_scatter_offset_u64 (pDataDest, vecOffs2, (uint64x2_t) vecIn2);
+ pDataDest[4] = pDataSrc[4];
+ return 0;
+}
+
+int
+foows64( int64_t * pDataSrc, int64_t * pDataDest)
+{
+ const uint64x2_t vecOffs1 = { 0, 3};
+ const uint64x2_t vecOffs2 = { 1, 2};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[2]);
+ vstrdq_scatter_offset_s64 (pDataDest, vecOffs1, (int64x2_t) vecIn1);
+ vstrdq_scatter_offset_s64 (pDataDest, vecOffs2, (int64x2_t) vecIn2);
+ pDataDest[4] = pDataSrc[4];
+ return 0;
+}
+
+/* { dg-final { scan-assembler-times "vstr\[a-z\]" 32 } } */
--- /dev/null
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+mve_pred16_t __p;
+int
+foobu8( uint8_t * pDataSrc, uint8_t * pDataDest)
+{
+ const uint8x16_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5, 9, 11, 13, 10, 12, 15, 8, 14};
+ const uint8x16_t vecOffs2 = { 31, 29, 27, 25, 23, 28, 21, 26, 19, 24, 17, 22, 16, 20, 18, 30};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[16]);
+ vstrbq_scatter_offset_p_u8(pDataDest, vecOffs1, (uint8x16_t) vecIn1, __p);
+ vstrbq_scatter_offset_p_u8(pDataDest, vecOffs2, (uint8x16_t) vecIn2, __p);
+ pDataDest[32] = pDataSrc[32];
+ return 0;
+}
+
+int
+foobu16( uint8_t * pDataSrc, uint8_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[8]);
+ vstrbq_scatter_offset_p_u16 (pDataDest, vecOffs1, (uint16x8_t) vecIn1, __p);
+ vstrbq_scatter_offset_p_u16 (pDataDest, vecOffs2, (uint16x8_t) vecIn2, __p);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foobu32( uint8_t * pDataSrc, uint8_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
+ vstrbq_scatter_offset_p_u32 (pDataDest, vecOffs1, (uint32x4_t) vecIn1, __p);
+ vstrbq_scatter_offset_p_u32 (pDataDest, vecOffs2, (uint32x4_t) vecIn2, __p);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foobs8( int8_t * pDataSrc, int8_t * pDataDest)
+{
+ const uint8x16_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5, 9, 11, 13, 10, 12, 15, 8, 14};
+ const uint8x16_t vecOffs2 = { 31, 29, 27, 25, 23, 28, 21, 26, 19, 24, 17, 22, 16, 20, 18, 30};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[16]);
+ vstrbq_scatter_offset_p_s8 (pDataDest, vecOffs1, (int8x16_t) vecIn1, __p);
+ vstrbq_scatter_offset_p_s8 (pDataDest, vecOffs2, (int8x16_t) vecIn2, __p);
+ pDataDest[32] = pDataSrc[32];
+ return 0;
+}
+
+int
+foobs16( int8_t * pDataSrc, int8_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[8]);
+ vstrbq_scatter_offset_p_s16 (pDataDest, vecOffs1, (int16x8_t) vecIn1, __p);
+ vstrbq_scatter_offset_p_s16 (pDataDest, vecOffs2, (int16x8_t) vecIn2, __p);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foobs32( uint8_t * pDataSrc, int8_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[4]);
+ vstrbq_scatter_offset_p_s32 (pDataDest, vecOffs1, (int32x4_t) vecIn1, __p);
+ vstrbq_scatter_offset_p_s32 (pDataDest, vecOffs2, (int32x4_t) vecIn2, __p);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foohu16( uint16_t * pDataSrc, uint16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[8]);
+ vstrhq_scatter_offset_p_u16 (pDataDest, vecOffs1, (uint16x8_t) vecIn1, __p);
+ vstrhq_scatter_offset_p_u16 (pDataDest, vecOffs2, (uint16x8_t) vecIn2, __p);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foohu32( uint16_t * pDataSrc, uint16_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
+ vstrhq_scatter_offset_p_u32 (pDataDest, vecOffs1, (uint32x4_t) vecIn1, __p);
+ vstrhq_scatter_offset_p_u32 (pDataDest, vecOffs2, (uint32x4_t) vecIn2, __p);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foohs16( int16_t * pDataSrc, int16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[8]);
+ vstrhq_scatter_offset_p_s16 (pDataDest, vecOffs1, (int16x8_t) vecIn1, __p);
+ vstrhq_scatter_offset_p_s16 (pDataDest, vecOffs2, (int16x8_t) vecIn2, __p);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foohs32( uint16_t * pDataSrc, int16_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[4]);
+ vstrhq_scatter_offset_p_s32 (pDataDest, vecOffs1, (int32x4_t) vecIn1, __p);
+ vstrhq_scatter_offset_p_s32 (pDataDest, vecOffs2, (int32x4_t) vecIn2, __p);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foohf16( float16_t * pDataSrc, float16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 11, 13, 10, 12, 15, 8, 14, 9};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[8]);
+ vstrhq_scatter_offset_p_f16 (pDataDest, vecOffs1, (float16x8_t) vecIn1, __p);
+ vstrhq_scatter_offset_p_f16 (pDataDest, vecOffs2, (float16x8_t) vecIn2, __p);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foowu32( uint32_t * pDataSrc, uint32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
+ vstrwq_scatter_offset_p_u32 (pDataDest, vecOffs1, (uint32x4_t) vecIn1, __p);
+ vstrwq_scatter_offset_p_u32 (pDataDest, vecOffs2, (uint32x4_t) vecIn2, __p);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foows32( int32_t * pDataSrc, int32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
+ vstrwq_scatter_offset_p_s32 (pDataDest, vecOffs1, (int32x4_t) vecIn1, __p);
+ vstrwq_scatter_offset_p_s32 (pDataDest, vecOffs2, (int32x4_t) vecIn2, __p);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+/* Two predicated word scatter stores of float data through pDataDest,
+   followed by a scalar copy of element 8.  The two input vectors are
+   loaded from pDataSrc[0..3] and pDataSrc[4..7], the same layout as the
+   other 32-bit cases in this file (foowu32, foows32), so the scalar copy
+   reads the element just past the vector inputs.  */
+int
+foowf32( float32_t * pDataSrc, float32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
+ vstrwq_scatter_offset_p_f32 (pDataDest, vecOffs1, (float32x4_t) vecIn1, __p);
+ vstrwq_scatter_offset_p_f32 (pDataDest, vecOffs2, (float32x4_t) vecIn2, __p);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foowu64( uint64_t * pDataSrc, uint64_t * pDataDest)
+{
+ const uint64x2_t vecOffs1 = { 0, 3};
+ const uint64x2_t vecOffs2 = { 1, 2};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[2]);
+ vstrdq_scatter_offset_p_u64 (pDataDest, vecOffs1, (uint64x2_t) vecIn1, __p);
+ vstrdq_scatter_offset_p_u64 (pDataDest, vecOffs2, (uint64x2_t) vecIn2, __p);
+ pDataDest[4] = pDataSrc[4];
+ return 0;
+}
+
+int
+foows64( int64_t * pDataSrc, int64_t * pDataDest)
+{
+ const uint64x2_t vecOffs1 = { 0, 3};
+ const uint64x2_t vecOffs2 = { 1, 2};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[2]);
+ vstrdq_scatter_offset_p_s64 (pDataDest, vecOffs1, (int64x2_t) vecIn1, __p);
+ vstrdq_scatter_offset_p_s64 (pDataDest, vecOffs2, (int64x2_t) vecIn2, __p);
+ pDataDest[4] = pDataSrc[4];
+ return 0;
+}
+
+/* { dg-final { scan-assembler-times "vstr\[a-z\]t" 32 } } */
--- /dev/null
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+int
+foowu32( uint32_t * pDataSrc, uint32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[4]);
+ vstrwq_scatter_shifted_offset_u32 (pDataDest, vecOffs1, vecIn1);
+ vstrwq_scatter_shifted_offset_u32 (pDataDest, vecOffs2, vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foowf32( float32_t * pDataSrc, float32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ float32x4_t vecIn1 = vldrwq_f32 ((float32_t const *) pDataSrc);
+ float32x4_t vecIn2 = vldrwq_f32 ((float32_t const *) &pDataSrc[4]);
+ vstrwq_scatter_shifted_offset_f32 (pDataDest, vecOffs1, vecIn1);
+ vstrwq_scatter_shifted_offset_f32 (pDataDest, vecOffs2, vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foohu16( uint16_t * pDataSrc, uint16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 9, 11, 13, 10, 12, 15, 8, 14};
+ uint16x8_t vecIn1 = vldrhq_u16 ((uint16_t const *) pDataSrc);
+ uint16x8_t vecIn2 = vldrhq_u16 ((uint16_t const *) &pDataSrc[8]);
+ vstrhq_scatter_shifted_offset_u16 (pDataDest, vecOffs1, vecIn1);
+ vstrhq_scatter_shifted_offset_u16 (pDataDest, vecOffs2, vecIn2);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foohu32( uint32_t * pDataSrc, uint32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrhq_u32 ((uint16_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrhq_u32 ((uint16_t const *) &pDataSrc[4]);
+ vstrhq_scatter_shifted_offset_u32 ((uint16_t *)pDataDest, vecOffs1, vecIn1);
+ vstrhq_scatter_shifted_offset_u32 ((uint16_t *)pDataDest, vecOffs2, vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foohf16( float16_t * pDataSrc, float16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 9, 11, 13, 10, 12, 15, 8, 14};
+ float16x8_t vecIn1 = vldrhq_f16 ((float16_t const *) pDataSrc);
+ float16x8_t vecIn2 = vldrhq_f16 ((float16_t const *) &pDataSrc[8]);
+ vstrhq_scatter_shifted_offset_f16 (pDataDest, vecOffs1, vecIn1);
+ vstrhq_scatter_shifted_offset_f16 (pDataDest, vecOffs2, vecIn2);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+/* Two doubleword scatter stores with shifted offsets through pDataDest,
+   followed by a scalar copy of element 2.
+   NOTE(review): the other tests in this file copy an element past the
+   offsets they scatter to (index 8 or 16); index 2 here is one of the
+   offsets in vecOffs2 -- confirm the overlap is intended.  */
+int
+foodu64( uint64_t * pDataSrc, uint64_t * pDataDest)
+{
+ const uint64x2_t vecOffs1 = { 0, 1};
+ const uint64x2_t vecOffs2 = { 2, 3};
+ uint32x4_t vecIn1 = vldrwq_u32 ((uint32_t const *) pDataSrc);
+ uint32x4_t vecIn2 = vldrwq_u32 ((uint32_t const *) &pDataSrc[2]);
+
+ vstrdq_scatter_shifted_offset_u64 (pDataDest, vecOffs1, (uint64x2_t) vecIn1);
+ vstrdq_scatter_shifted_offset_u64 (pDataDest, vecOffs2, (uint64x2_t) vecIn2);
+
+ pDataDest[2] = pDataSrc[2];
+ return 0;
+}
+
+int
+foows32( int32_t * pDataSrc, int32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[4]);
+ vstrwq_scatter_shifted_offset_s32 (pDataDest, vecOffs1, vecIn1);
+ vstrwq_scatter_shifted_offset_s32 (pDataDest, vecOffs2, vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foohs16( int16_t * pDataSrc, int16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 9, 11, 13, 10, 12, 15, 8, 14};
+ int16x8_t vecIn1 = vldrhq_s16 ((int16_t const *) pDataSrc);
+ int16x8_t vecIn2 = vldrhq_s16 ((int16_t const *) &pDataSrc[8]);
+ vstrhq_scatter_shifted_offset_s16 (pDataDest, vecOffs1, vecIn1);
+ vstrhq_scatter_shifted_offset_s16 (pDataDest, vecOffs2, vecIn2);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foohs32( int32_t * pDataSrc, int32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ int32x4_t vecIn1 = vldrhq_s32 ((int16_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrhq_s32 ((int16_t const *) &pDataSrc[4]);
+ vstrhq_scatter_shifted_offset_s32 ((int16_t *)pDataDest, vecOffs1, vecIn1);
+ vstrhq_scatter_shifted_offset_s32 ((int16_t *)pDataDest, vecOffs2, vecIn2);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+/* Two doubleword scatter stores with shifted offsets through pDataDest,
+   followed by a scalar copy of element 2.
+   NOTE(review): the other tests in this file copy an element past the
+   offsets they scatter to (index 8 or 16); index 2 here is one of the
+   offsets in vecOffs2 -- confirm the overlap is intended.  */
+int
+foods64( int64_t * pDataSrc, int64_t * pDataDest)
+{
+ const uint64x2_t vecOffs1 = { 0, 1};
+ const uint64x2_t vecOffs2 = { 2, 3};
+ int32x4_t vecIn1 = vldrwq_s32 ((int32_t const *) pDataSrc);
+ int32x4_t vecIn2 = vldrwq_s32 ((int32_t const *) &pDataSrc[2]);
+
+ vstrdq_scatter_shifted_offset_s64 (pDataDest, vecOffs1, (int64x2_t) vecIn1);
+ vstrdq_scatter_shifted_offset_s64 (pDataDest, vecOffs2, (int64x2_t) vecIn2);
+
+ pDataDest[2] = pDataSrc[2];
+ return 0;
+}
+
+/* { dg-final { scan-assembler-times "vstr\[a-z\]" 20 } } */
--- /dev/null
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+mve_pred16_t __p;
+int
+foowu32( uint32_t * pDataSrc, uint32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrwq_z_u32 ((uint32_t const *) pDataSrc, __p);
+ uint32x4_t vecIn2 = vldrwq_z_u32 ((uint32_t const *) &pDataSrc[4], __p);
+ vstrwq_scatter_shifted_offset_p_u32 (pDataDest, vecOffs1, vecIn1, __p);
+ vstrwq_scatter_shifted_offset_p_u32 (pDataDest, vecOffs2, vecIn2, __p);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foowf32( float32_t * pDataSrc, float32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ float32x4_t vecIn1 = vldrwq_z_f32 ((float32_t const *) pDataSrc, __p);
+ float32x4_t vecIn2 = vldrwq_z_f32 ((float32_t const *) &pDataSrc[4], __p);
+ vstrwq_scatter_shifted_offset_p_f32 (pDataDest, vecOffs1, vecIn1, __p);
+ vstrwq_scatter_shifted_offset_p_f32 (pDataDest, vecOffs2, vecIn2, __p);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foohu16( uint16_t * pDataSrc, uint16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 9, 11, 13, 10, 12, 15, 8, 14};
+ uint16x8_t vecIn1 = vldrhq_z_u16 ((uint16_t const *) pDataSrc, __p);
+ uint16x8_t vecIn2 = vldrhq_z_u16 ((uint16_t const *) &pDataSrc[8], __p);
+ vstrhq_scatter_shifted_offset_p_u16 (pDataDest, vecOffs1, vecIn1, __p);
+ vstrhq_scatter_shifted_offset_p_u16 (pDataDest, vecOffs2, vecIn2, __p);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+int
+foohu32( uint32_t * pDataSrc, uint32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5};
+ uint32x4_t vecIn1 = vldrhq_z_u32 ((uint16_t const *) pDataSrc, __p);
+ uint32x4_t vecIn2 = vldrhq_z_u32 ((uint16_t const *) &pDataSrc[4], __p);
+ vstrhq_scatter_shifted_offset_p_u32 ((uint16_t *)pDataDest, vecOffs1, vecIn1, __p);
+ vstrhq_scatter_shifted_offset_p_u32 ((uint16_t *)pDataDest, vecOffs2, vecIn2, __p);
+ pDataDest[8] = pDataSrc[8];
+ return 0;
+}
+
+int
+foohf16( float16_t * pDataSrc, float16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5};
+ const uint16x8_t vecOffs2 = { 9, 11, 13, 10, 12, 15, 8, 14};
+ float16x8_t vecIn1 = vldrhq_z_f16 ((float16_t const *) pDataSrc, __p);
+ float16x8_t vecIn2 = vldrhq_z_f16 ((float16_t const *) &pDataSrc[8], __p);
+ vstrhq_scatter_shifted_offset_p_f16 (pDataDest, vecOffs1, vecIn1, __p);
+ vstrhq_scatter_shifted_offset_p_f16 (pDataDest, vecOffs2, vecIn2, __p);
+ pDataDest[16] = pDataSrc[16];
+ return 0;
+}
+
+/* Two predicated doubleword scatter stores with shifted offsets through
+   pDataDest, followed by a scalar copy of element 2.
+   NOTE(review): the other tests in this file copy an element past the
+   offsets they scatter to (index 8 or 16); index 2 here is one of the
+   offsets in vecOffs2 -- confirm the overlap is intended.  */
+int
+foodu64( uint64_t * pDataSrc, uint64_t * pDataDest)
+{
+ const uint64x2_t vecOffs1 = { 0, 1};
+ const uint64x2_t vecOffs2 = { 2, 3};
+ uint32x4_t vecIn1 = vldrwq_z_u32 ((uint32_t const *) pDataSrc, __p);
+ uint32x4_t vecIn2 = vldrwq_z_u32 ((uint32_t const *) &pDataSrc[2], __p);
+
+ vstrdq_scatter_shifted_offset_p_u64 (pDataDest, vecOffs1, (uint64x2_t) vecIn1, __p);
+ vstrdq_scatter_shifted_offset_p_u64 (pDataDest, vecOffs2, (uint64x2_t) vecIn2, __p);
+
+ pDataDest[2] = pDataSrc[2];
+ return 0;
+}
+
+int /* PR target/94735 test: two predicated s32 scatter stores, then a scalar copy; returns 0.  */
+foows32( int32_t * pDataSrc, int32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5}; /* With vecOffs1, lists each of 0..7 exactly once.  */
+ int32x4_t vecIn1 = vldrwq_z_s32 ((int32_t const *) pDataSrc, __p); /* __p is declared outside this function.  */
+ int32x4_t vecIn2 = vldrwq_z_s32 ((int32_t const *) &pDataSrc[4], __p);
+ vstrwq_scatter_shifted_offset_p_s32 (pDataDest, vecOffs1, vecIn1, __p);
+ vstrwq_scatter_shifted_offset_p_s32 (pDataDest, vecOffs2, vecIn2, __p); /* Same base pointer as the store above.  */
+ pDataDest[8] = pDataSrc[8]; /* Plain scalar store issued after both scatter stores.  */
+ return 0;
+}
+
+int /* PR target/94735 test: two predicated s16 scatter stores, then a scalar copy; returns 0.  */
+foohs16( int16_t * pDataSrc, int16_t * pDataDest)
+{
+ const uint16x8_t vecOffs1 = { 0, 3, 6, 1, 4, 7, 2, 5}; /* Lists each of 0..7 exactly once.  */
+ const uint16x8_t vecOffs2 = { 9, 11, 13, 10, 12, 15, 8, 14}; /* Lists each of 8..15 exactly once.  */
+ int16x8_t vecIn1 = vldrhq_z_s16 ((int16_t const *) pDataSrc, __p); /* __p is declared outside this function.  */
+ int16x8_t vecIn2 = vldrhq_z_s16 ((int16_t const *) &pDataSrc[8], __p);
+ vstrhq_scatter_shifted_offset_p_s16 (pDataDest, vecOffs1, vecIn1, __p);
+ vstrhq_scatter_shifted_offset_p_s16 (pDataDest, vecOffs2, vecIn2, __p); /* Same base pointer as the store above.  */
+ pDataDest[16] = pDataSrc[16]; /* Plain scalar store issued after both scatter stores.  */
+ return 0;
+}
+
+int /* PR target/94735 test: two predicated scatter stores, then a scalar copy; returns 0.  */
+foohs32( int32_t * pDataSrc, int32_t * pDataDest)
+{
+ const uint32x4_t vecOffs1 = { 0, 3, 6, 1};
+ const uint32x4_t vecOffs2 = { 4, 7, 2, 5}; /* With vecOffs1, lists each of 0..7 exactly once.  */
+ int32x4_t vecIn1 = vldrhq_z_s32 ((int16_t const *) pDataSrc, __p); /* __p is declared outside this function.  */
+ int32x4_t vecIn2 = vldrhq_z_s32 ((int16_t const *) &pDataSrc[4], __p); /* Indexed as int32_t before the cast.  */
+ vstrhq_scatter_shifted_offset_p_s32 ((int16_t *)pDataDest, vecOffs1, vecIn1, __p);
+ vstrhq_scatter_shifted_offset_p_s32 ((int16_t *)pDataDest, vecOffs2, vecIn2, __p); /* Same base pointer as the store above.  */
+ pDataDest[8] = pDataSrc[8]; /* Plain scalar store issued after both scatter stores.  */
+ return 0;
+}
+
+int /* PR target/94735 test: two predicated s64 scatter stores, then a scalar copy; returns 0.  */
+foods64( int64_t * pDataSrc, int64_t * pDataDest)
+{
+ const uint64x2_t vecOffs1 = { 0, 1};
+ const uint64x2_t vecOffs2 = { 2, 3}; /* With vecOffs1, lists each of 0..3 exactly once.  */
+ int32x4_t vecIn1 = vldrwq_z_s32 ((int32_t const *) pDataSrc, __p); /* Loaded as four 32-bit lanes; __p is declared outside this function.  */
+ int32x4_t vecIn2 = vldrwq_z_s32 ((int32_t const *) &pDataSrc[2], __p); /* Indexed as int64_t before the cast.  */
+
+ vstrdq_scatter_shifted_offset_p_s64 (pDataDest, vecOffs1, (int64x2_t) vecIn1, __p); /* Loaded vector is cast to int64x2_t for the store.  */
+ vstrdq_scatter_shifted_offset_p_s64 (pDataDest, vecOffs2, (int64x2_t) vecIn2, __p); /* Same base pointer as the store above.  */
+
+ pDataDest[2] = pDataSrc[2]; /* NOTE(review): index 2 also appears in vecOffs2 -- confirm the overlap is intended.  */
+ return 0;
+}
+
+/* { dg-final { scan-assembler-times "vstr\[a-z\]t" 20 } } */