[SLP/AArch64] Fix unpack handling for big-endian SVE
author     Richard Sandiford <richard.sandiford@linaro.org>
Tue, 13 Mar 2018 15:13:37 +0000 (15:13 +0000)
committer  Richard Sandiford <rsandifo@gcc.gnu.org>
Tue, 13 Mar 2018 15:13:37 +0000 (15:13 +0000)
I hadn't realised that on big-endian targets, VEC_UNPACK*HI_EXPR unpacks
the low-numbered lanes and VEC_UNPACK*LO_EXPR unpacks the high-numbered
lanes.  This meant that both the SVE patterns and the handling of
fully-masked loops were wrong.
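
To make the lane numbering concrete, here is a small illustration (not part
of the patch); the helper below just models the convention in C, and its
name and signature are made up for the example:

    #include <stdbool.h>

    enum unpack_kind { UNPACK_LO, UNPACK_HI };

    /* Index of the first source lane that a vec_unpack*_{lo,hi} optab is
       defined to widen, for an NLANES-element vector: "hi" means the
       high-numbered lanes on little-endian but the low-numbered lanes on
       big-endian.  */
    static int
    first_unpacked_lane (enum unpack_kind kind, bool big_endian, int nlanes)
    {
      bool high_numbered = (kind == UNPACK_HI) != big_endian;
      return high_numbered ? nlanes / 2 : 0;
    }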

The patch deals with that by making sure that all vec_unpack* optabs
are define_expands, using BYTES_BIG_ENDIAN to choose the appropriate
define_insn.  This in turn means that we can get rid of the duplication
between the signed and unsigned patterns for predicates.  (We provide
implementations of both the signed and unsigned optabs because the sign
doesn't matter for predicates: every element contains only one
significant bit.)
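
As a rough sketch (not the patch itself), the C body of each new
define_expand reduces to the dispatch below; <hi_lanes_optab> is the new
iterators.md attribute (!BYTES_BIG_ENDIAN for the *_hi_* optabs,
BYTES_BIG_ENDIAN for the *_lo_* optabs), and gen_hi_insn/gen_lo_insn are
placeholders for the real mode-suffixed generators such as
gen_aarch64_sve_punpkhi_<mode>:

    /* Sketch only: choose the instruction that reads the lanes the optab
       is asking for.  */
    emit_insn ((<hi_lanes_optab>
                ? gen_hi_insn
                : gen_lo_insn) (operands[0], operands[1]));
    DONE;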

Also, the float unpacks need to unpack one half of the input vector,
but the unpacked upper bits are "don't care".  There are two obvious
ways of handling that: use an unpack (filling with zeros) or use a ZIP
(filling with a duplicate of the low bits).  The code previously used
unpacks, but the sequence involved a subreg that is semantically an
element reverse on big-endian targets.  Using the ZIP patterns avoids
that, and at the moment there's no performance reason to prefer one over
the other, so the patch switches to ZIP unconditionally.
As the comment says, it would be easy to optimise this later if UUNPK
turns out to be better for some implementations.
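
As an illustration (little-endian shown; big-endian just swaps which ZIP
is used, and the register names are arbitrary), unpacking and converting
the low half of a VNx4SF source z0 = { s0, s1, s2, s3, ... } now becomes:

    /* zip1  z1.s, z0.s, z0.s     ->  { s0, s0, s1, s1, ... }
       fcvt  z2.d, p0/m, z1.s     ->  { (double) s0, (double) s1, ... }
       Each 64-bit container holds a duplicate of the value rather than
       zeros in its unused half, and no bit-reinterpreting subreg (an
       element reverse on big-endian) is needed.  */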

2018-03-13  Richard Sandiford  <richard.sandiford@linaro.org>

gcc/
* tree-vect-loop-manip.c (vect_maybe_permute_loop_masks):
Reverse the choice between VEC_UNPACK_LO_EXPR and VEC_UNPACK_HI_EXPR
for big-endian.
* config/aarch64/iterators.md (hi_lanes_optab): New int attribute.
* config/aarch64/aarch64-sve.md
(*aarch64_sve_<perm_insn><perm_hilo><mode>): Rename to...
(aarch64_sve_<perm_insn><perm_hilo><mode>): ...this.
(*extend<mode><Vwide>2): Rename to...
(aarch64_sve_extend<mode><Vwide>2): ...this.
(vec_unpack<su>_<perm_hilo>_<mode>): Turn into a define_expand,
renaming the old pattern to...
(aarch64_sve_punpk<perm_hilo>_<mode>): ...this.  Only define
unsigned unpacks.
(vec_unpack<su>_<perm_hilo>_<SVE_BHSI:mode>): Turn into a
define_expand, renaming the old pattern to...
(aarch64_sve_<su>unpk<perm_hilo>_<SVE_BHSI:mode>): ...this.
(*vec_unpacku_<perm_hilo>_<mode>_no_convert): Delete.
(vec_unpacks_<perm_hilo>_<mode>): Take BYTES_BIG_ENDIAN into
account when deciding which SVE instruction the optab should use.
(vec_unpack<su_optab>_float_<perm_hilo>_vnx4si): Likewise.

gcc/testsuite/
* gcc.target/aarch64/sve/unpack_fcvt_signed_1.c: Expect zips rather
than unpacks.
* gcc.target/aarch64/sve/unpack_fcvt_unsigned_1.c: Likewise.
* gcc.target/aarch64/sve/unpack_float_1.c: Likewise.

From-SVN: r258489

gcc/ChangeLog
gcc/config/aarch64/aarch64-sve.md
gcc/config/aarch64/iterators.md
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_signed_1.c
gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_unsigned_1.c
gcc/testsuite/gcc.target/aarch64/sve/unpack_float_1.c
gcc/tree-vect-loop-manip.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 8e150de2a570ec4c4663e2b9a1ce6717cbcdc4b4..4a6f4d47b60d35cf62fc627f54836f6d528acd6d 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,26 @@
+2018-03-13  Richard Sandiford  <richard.sandiford@linaro.org>
+
+       * tree-vect-loop-manip.c (vect_maybe_permute_loop_masks):
+       Reverse the choice between VEC_UNPACK_LO_EXPR and VEC_UNPACK_HI_EXPR
+       for big-endian.
+       * config/aarch64/iterators.md (hi_lanes_optab): New int attribute.
+       * config/aarch64/aarch64-sve.md
+       (*aarch64_sve_<perm_insn><perm_hilo><mode>): Rename to...
+       (aarch64_sve_<perm_insn><perm_hilo><mode>): ...this.
+       (*extend<mode><Vwide>2): Rename to...
+       (aarch64_sve_extend<mode><Vwide>2): ...this.
+       (vec_unpack<su>_<perm_hilo>_<mode>): Turn into a define_expand,
+       renaming the old pattern to...
+       (aarch64_sve_punpk<perm_hilo>_<mode>): ...this.  Only define
+       unsigned unpacks.
+       (vec_unpack<su>_<perm_hilo>_<SVE_BHSI:mode>): Turn into a
+       define_expand, renaming the old pattern to...
+       (aarch64_sve_<su>unpk<perm_hilo>_<SVE_BHSI:mode>): ...this.
+       (*vec_unpacku_<perm_hilo>_<mode>_no_convert): Delete.
+       (vec_unpacks_<perm_hilo>_<mode>): Take BYTES_BIG_ENDIAN into
+       account when deciding which SVE instruction the optab should use.
+       (vec_unpack<su_optab>_float_<perm_hilo>_vnx4si): Likewise.
+
 2018-03-13  Richard Sandiford  <richard.sandiford@linaro.org>
 
        * config/aarch64/aarch64.md (V4_REGNUM, V8_REGNUM, V12_REGNUM)
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 2e7f0a45f793081d80e43a68929a07abc3b44117..d88553405094853ab58ba1c55cd212092a14116c 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
   "<perm_insn><perm_hilo>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>"
 )
 
-(define_insn "*aarch64_sve_<perm_insn><perm_hilo><mode>"
+(define_insn "aarch64_sve_<perm_insn><perm_hilo><mode>"
   [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
        (unspec:SVE_ALL [(match_operand:SVE_ALL 1 "register_operand" "w")
                         (match_operand:SVE_ALL 2 "register_operand" "w")]
 )
 
 ;; Conversion of DI or SI to DF, predicated with a PTRUE.
-(define_insn "*<optab><mode>vnx2df2"
+(define_insn "aarch64_sve_<optab><mode>vnx2df2"
   [(set (match_operand:VNx2DF 0 "register_operand" "=w")
        (unspec:VNx2DF
          [(match_operand:VNx2BI 1 "register_operand" "Upl")
 
 ;; Conversion of SFs to the same number of DFs, or HFs to the same number
 ;; of SFs.
-(define_insn "*extend<mode><Vwide>2"
+(define_insn "aarch64_sve_extend<mode><Vwide>2"
   [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
        (unspec:<VWIDE>
          [(match_operand:<VWIDE_PRED> 1 "register_operand" "Upl")
   "fcvt\t%0.<Vewtype>, %1/m, %2.<Vetype>"
 )
 
+;; Unpack the low or high half of a predicate, where "high" refers to
+;; the low-numbered lanes for big-endian and the high-numbered lanes
+;; for little-endian.
+(define_expand "vec_unpack<su>_<perm_hilo>_<mode>"
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (unspec:<VWIDE> [(match_operand:PRED_BHS 1 "register_operand")]
+                  UNPACK)]
+  "TARGET_SVE"
+  {
+    emit_insn ((<hi_lanes_optab>
+               ? gen_aarch64_sve_punpkhi_<PRED_BHS:mode>
+               : gen_aarch64_sve_punpklo_<PRED_BHS:mode>)
+              (operands[0], operands[1]));
+    DONE;
+  }
+)
+
 ;; PUNPKHI and PUNPKLO.
-(define_insn "vec_unpack<su>_<perm_hilo>_<mode>"
+(define_insn "aarch64_sve_punpk<perm_hilo>_<mode>"
   [(set (match_operand:<VWIDE> 0 "register_operand" "=Upa")
        (unspec:<VWIDE> [(match_operand:PRED_BHS 1 "register_operand" "Upa")]
-                       UNPACK))]
+                       UNPACK_UNSIGNED))]
   "TARGET_SVE"
   "punpk<perm_hilo>\t%0.h, %1.b"
 )
 
+;; Unpack the low or high half of a vector, where "high" refers to
+;; the low-numbered lanes for big-endian and the high-numbered lanes
+;; for little-endian.
+(define_expand "vec_unpack<su>_<perm_hilo>_<SVE_BHSI:mode>"
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (unspec:<VWIDE> [(match_operand:SVE_BHSI 1 "register_operand")] UNPACK)]
+  "TARGET_SVE"
+  {
+    emit_insn ((<hi_lanes_optab>
+               ? gen_aarch64_sve_<su>unpkhi_<SVE_BHSI:mode>
+               : gen_aarch64_sve_<su>unpklo_<SVE_BHSI:mode>)
+              (operands[0], operands[1]));
+    DONE;
+  }
+)
+
 ;; SUNPKHI, UUNPKHI, SUNPKLO and UUNPKLO.
-(define_insn "vec_unpack<su>_<perm_hilo>_<SVE_BHSI:mode>"
+(define_insn "aarch64_sve_<su>unpk<perm_hilo>_<SVE_BHSI:mode>"
   [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
        (unspec:<VWIDE> [(match_operand:SVE_BHSI 1 "register_operand" "w")]
                        UNPACK))]
   "<su>unpk<perm_hilo>\t%0.<Vewtype>, %1.<Vetype>"
 )
 
-;; Used by the vec_unpacks_<perm_hilo>_<mode> expander to unpack the bit
-;; representation of a VNx4SF or VNx8HF without conversion.  The choice
-;; between signed and unsigned isn't significant.
-(define_insn "*vec_unpacku_<perm_hilo>_<mode>_no_convert"
-  [(set (match_operand:SVE_HSF 0 "register_operand" "=w")
-       (unspec:SVE_HSF [(match_operand:SVE_HSF 1 "register_operand" "w")]
-                       UNPACK_UNSIGNED))]
-  "TARGET_SVE"
-  "uunpk<perm_hilo>\t%0.<Vewtype>, %1.<Vetype>"
-)
-
 ;; Unpack one half of a VNx4SF to VNx2DF, or one half of a VNx8HF to VNx4SF.
 ;; First unpack the source without conversion, then float-convert the
 ;; unpacked source.
 (define_expand "vec_unpacks_<perm_hilo>_<mode>"
-  [(set (match_dup 2)
-       (unspec:SVE_HSF [(match_operand:SVE_HSF 1 "register_operand")]
-                       UNPACK_UNSIGNED))
-   (set (match_operand:<VWIDE> 0 "register_operand")
-       (unspec:<VWIDE> [(match_dup 3)
-                        (unspec:<VWIDE> [(match_dup 2)] UNSPEC_FLOAT_CONVERT)]
-                       UNSPEC_MERGE_PTRUE))]
-  "TARGET_SVE"
-  {
-    operands[2] = gen_reg_rtx (<MODE>mode);
-    operands[3] = force_reg (<VWIDE_PRED>mode, CONSTM1_RTX (<VWIDE_PRED>mode));
+  [(match_operand:<VWIDE> 0 "register_operand")
+   (unspec:SVE_HSF [(match_operand:SVE_HSF 1 "register_operand")]
+                  UNPACK_UNSIGNED)]
+  "TARGET_SVE"
+  {
+    /* Use ZIP to do the unpack, since we don't care about the upper halves
+       and since it has the nice property of not needing any subregs.
+       If using UUNPK* turns out to be preferable, we could model it as
+       a ZIP whose first operand is zero.  */
+    rtx temp = gen_reg_rtx (<MODE>mode);
+    emit_insn ((<hi_lanes_optab>
+               ? gen_aarch64_sve_zip2<mode>
+               : gen_aarch64_sve_zip1<mode>)
+               (temp, operands[1], operands[1]));
+    rtx ptrue = force_reg (<VWIDE_PRED>mode, CONSTM1_RTX (<VWIDE_PRED>mode));
+    emit_insn (gen_aarch64_sve_extend<mode><Vwide>2 (operands[0],
+                                                    ptrue, temp));
+    DONE;
   }
 )
 
 ;; to VNx2DI, reinterpret the VNx2DI as a VNx4SI, then convert the
 ;; unpacked VNx4SI to VNx2DF.
 (define_expand "vec_unpack<su_optab>_float_<perm_hilo>_vnx4si"
-  [(set (match_dup 2)
-       (unspec:VNx2DI [(match_operand:VNx4SI 1 "register_operand")]
-                      UNPACK_UNSIGNED))
-   (set (match_operand:VNx2DF 0 "register_operand")
-       (unspec:VNx2DF [(match_dup 3)
-                       (FLOATUORS:VNx2DF (match_dup 4))]
-                      UNSPEC_MERGE_PTRUE))]
-  "TARGET_SVE"
-  {
-    operands[2] = gen_reg_rtx (VNx2DImode);
-    operands[3] = force_reg (VNx2BImode, CONSTM1_RTX (VNx2BImode));
-    operands[4] = gen_rtx_SUBREG (VNx4SImode, operands[2], 0);
+  [(match_operand:VNx2DF 0 "register_operand")
+   (FLOATUORS:VNx2DF
+     (unspec:VNx2DI [(match_operand:VNx4SI 1 "register_operand")]
+                   UNPACK_UNSIGNED))]
+  "TARGET_SVE"
+  {
+    /* Use ZIP to do the unpack, since we don't care about the upper halves
+       and since it has the nice property of not needing any subregs.
+       If using UUNPK* turns out to be preferable, we could model it as
+       a ZIP whose first operand is zero.  */
+    rtx temp = gen_reg_rtx (VNx4SImode);
+    emit_insn ((<hi_lanes_optab>
+               ? gen_aarch64_sve_zip2vnx4si
+               : gen_aarch64_sve_zip1vnx4si)
+              (temp, operands[1], operands[1]));
+    rtx ptrue = force_reg (VNx2BImode, CONSTM1_RTX (VNx2BImode));
+    emit_insn (gen_aarch64_sve_<FLOATUORS:optab>vnx4sivnx2df2 (operands[0],
+                                                              ptrue, temp));
+    DONE;
   }
 )
 
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index a2945a81848b326dcc317c771ca83822bdd64fe5..fa181794392d4dc48e9a6df5cf5db14a9824cd2d 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
                            (UNSPEC_UNPACKSHI "hi") (UNSPEC_UNPACKUHI "hi")
                            (UNSPEC_UNPACKSLO "lo") (UNSPEC_UNPACKULO "lo")])
 
+;; Return true if the associated optab refers to the high-numbered lanes,
+;; false if it refers to the low-numbered lanes.  The convention is for
+;; "hi" to refer to the low-numbered lanes (the first ones in memory)
+;; for big-endian.
+(define_int_attr hi_lanes_optab [(UNSPEC_UNPACKSHI "!BYTES_BIG_ENDIAN")
+                                (UNSPEC_UNPACKUHI "!BYTES_BIG_ENDIAN")
+                                (UNSPEC_UNPACKSLO "BYTES_BIG_ENDIAN")
+                                (UNSPEC_UNPACKULO "BYTES_BIG_ENDIAN")])
+
 (define_int_attr frecp_suffix  [(UNSPEC_FRECPE "e") (UNSPEC_FRECPX "x")])
 
 (define_int_attr crc_variant [(UNSPEC_CRC32B "crc32b") (UNSPEC_CRC32H "crc32h")
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 84c64b2e77b6456edf29686944d95ab248fa2d97..cee70dc0bc9b8f567c7bdfdfcbdb90c5907199d0 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,10 @@
+2018-03-13  Richard Sandiford  <richard.sandiford@linaro.org>
+
+       * gcc.target/aarch64/sve/unpack_fcvt_signed_1.c: Expect zips rather
+       than unpacks.
+       * gcc.target/aarch64/sve/unpack_fcvt_unsigned_1.c: Likewise.
+       * gcc.target/aarch64/sve/unpack_float_1.c: Likewise.
+
 2018-03-13  Richard Sandiford  <richard.sandiford@linaro.org>
 
        * gcc.target/aarch64/sve/tls_1.c: New test.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_signed_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_signed_1.c
index 83ffe8552c2091317191fbf550226fd09548bea6..0f96dc2ff007340541c2ba7d51e1ccfa0f3f2d39 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_signed_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_signed_1.c
@@ -10,6 +10,6 @@ unpack_double_int_plus8 (double *d, int32_t *s, int size)
     d[i] = s[i] + 8;
 }
 
-/* { dg-final { scan-assembler-times {\tuunpklo\tz[0-9]+\.d, z[0-9]+\.s\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tuunpkhi\tz[0-9]+\.d, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tzip2\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.s\n} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_unsigned_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_unsigned_1.c
index e2f6b1a45ce239ea700b9d50de59cd5f3ac7553d..70465f91eba4f80140b2059481eb8f06bbc9ace7 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_unsigned_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_unsigned_1.c
@@ -10,6 +10,6 @@ unpack_double_int_plus9 (double *d, uint32_t *s, int size)
     d[i] = (double) (s[i] + 9);
 }
 
-/* { dg-final { scan-assembler-times {\tuunpklo\tz[0-9]+\.d, z[0-9]+\.s\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tuunpkhi\tz[0-9]+\.d, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tzip2\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.s\n} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpack_float_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpack_float_1.c
index 14a636b5fda120243efc4f6a257aba0864c619c9..deb4cf5e940b186fb11468fea27b697cc2737010 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/unpack_float_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/unpack_float_1.c
@@ -8,6 +8,6 @@ unpack_float_plus_7point9 (double *d, float *s, int size)
     d[i] = s[i] + 7.9;
 }
 
-/* { dg-final { scan-assembler-times {\tuunpklo\tz[0-9]+\.d, z[0-9]+\.s\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tuunpkhi\tz[0-9]+\.d, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tzip2\tz[0-9]+\.s, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
 /* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.s\n} 2 } } */
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 96d40c8c4b328145a469e997addaf99862dd55a9..1c43ed0becd58abc81ca12c527672ec55c4a0809 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -334,7 +334,8 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
        {
          tree src = src_rgm->masks[i / 2];
          tree dest = dest_rgm->masks[i];
-         tree_code code = (i & 1 ? VEC_UNPACK_HI_EXPR
+         tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
+                           ? VEC_UNPACK_HI_EXPR
                            : VEC_UNPACK_LO_EXPR);
          gassign *stmt;
          if (dest_masktype == unpack_masktype)