[Aarch64][SVE] Dot product support

author Alejandro Martinez <alejandro.martinezvicente@arm.com>

Thu, 2 May 2019 09:58:00 +0000 (09:58 +0000)

committer Alejandro Martinez <alejandro@gcc.gnu.org>

Thu, 2 May 2019 09:58:00 +0000 (09:58 +0000)
author Alejandro Martinez <alejandro.martinezvicente@arm.com>
Thu, 2 May 2019 09:58:00 +0000 (09:58 +0000)
committer Alejandro Martinez <alejandro@gcc.gnu.org>
Thu, 2 May 2019 09:58:00 +0000 (09:58 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index e99c6a1158926c9015811408df00a9191dc23cab..720627bc70293f56e99485623b8d8720cb2ff943 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,16 @@
+2019-05-02  Alejandro Martinez  <alejandro.martinezvicente@arm.com>
+
+       * config/aarch64/aarch64-sve.md (<sur>dot_prod<vsi2qi>): Taken from SVE
+       ACLE branch.
+       * config/aarch64/iterators.md: Copied Vetype_fourth, VSI2QI and vsi2qi from
+       SVE ACLE branch.
+       * tree-vect-loop.c (use_mask_by_cond_expr_p): New function to check if a
+       VEC_COND_EXPR can be inserted to emulate a conditional internal function.
+       (build_vect_cond_expr): Emit the VEC_COND_EXPR.
+       (vectorizable_reduction): Use the functions above to vectorize, in a
+       fully masked loop, codes that don't have a conditional internal
+       function.
+
  2019-05-02  Martin Liska  <mliska@suse.cz>
  
         * cgraphclones.c: Call valid_attribute_p with 1 for
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md

index 3f39c4c5b63798515ed4c109836b036573de4aad..02d33b7276fb690ab97adcec623a65aabd5b8994 100644 (file)
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3132,3 +3132,19 @@
      DONE;
    }
  )
+
+;; Unpredicated dot product: operand 0 = DOTPROD (operands 1, 2) + operand 3.
+(define_insn "<sur>dot_prod<vsi2qi>"
+  [(set (match_operand:SVE_SDI 0 "register_operand" "=w, ?&w")
+       (plus:SVE_SDI
+         (unspec:SVE_SDI
+           [(match_operand:<VSI2QI> 1 "register_operand" "w, w")
+            (match_operand:<VSI2QI> 2 "register_operand" "w, w")]
+           DOTPROD)
+         (match_operand:SVE_SDI 3 "register_operand" "0, w")))]
+  "TARGET_SVE"
+  "@
+   <sur>dot\\t%0.<Vetype>, %1.<Vetype_fourth>, %2.<Vetype_fourth>
+   movprfx\t%0, %3\;<sur>dot\\t%0.<Vetype>, %1.<Vetype_fourth>, %2.<Vetype_fourth>"
+  [(set_attr "movprfx" "*,yes")]
+)
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md

index 6caeeac80867edda29b5438efdcee475ed609ff6..b3b2d6e470a81c9727c0e8452e14794ef594bf04 100644 (file)
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -663,6 +663,9 @@
                           (QI "b")   (HI "h")
                           (SI "s")   (DI "d")])
  
+;; Like Vetype, but map to types that are a quarter of the element size.
+(define_mode_attr Vetype_fourth [(VNx4SI "b") (VNx2DI "h")])
+
  ;; Equivalent of "size" for a vector element.
  (define_mode_attr Vesize [(VNx16QI "b")
                           (VNx8HI  "h") (VNx8HF  "h")
@@ -1029,8 +1032,10 @@
                       (V2SF "p") (V4SF  "v")
                       (V4HF "v") (V8HF  "v")])
  
-(define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")])
-(define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")])
+(define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")
+                         (VNx4SI "vnx16qi") (VNx2DI "vnx8hi")])
+(define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")
+                         (VNx4SI "VNx16QI") (VNx2DI "VNx8HI")])
  
  
  ;; Register suffix for DOTPROD input types from the return type.
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog

index 851993426150785f81d23f55d905b8f5cb198bc3..37edbeada5a2ba445f1895cd8910063987edd5f8 100644 (file)
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2019-05-02  Alejandro Martinez  <alejandro.martinezvicente@arm.com>
+
+       * gcc.target/aarch64/sve/dot_1.c: New test for dot product.
+
  2019-05-02  Martin Liska  <mliska@suse.cz>
  
         * gcc.target/i386/funcspec-4.c: Update scanned pattern.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/dot_1.c b/gcc/testsuite/gcc.target/aarch64/sve/dot_1.c

new file mode 100644 (file)

index 0000000..8ff6671
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/dot_1.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define DEF_DOT(TYPE1, TYPE2)                                          \
+TYPE1 __attribute__ ((noinline, noclone))                              \
+dot_##TYPE1##_##TYPE2 (TYPE2 *restrict x, TYPE2 *restrict y, int n)    \
+{                                                                      \
+  TYPE1 sum = 0;                                                       \
+  for (int i = 0; i < n; i++)                                          \
+    {                                                                  \
+      sum += x[i] * y[i];                                              \
+    }                                                                  \
+  return sum;                                                          \
+}
+
+DEF_DOT(uint32_t, uint8_t)
+DEF_DOT(int32_t, int8_t)
+DEF_DOT(int64_t, int16_t)
+
+/* The uint16_t->uint64_t dot product requires a cast: without it both
+   operands promote to int, so the multiplication would be signed.  */
+uint64_t __attribute__ ((noinline, noclone))
+dot_uint64_t_uint16_t (uint16_t *restrict x, uint16_t *restrict y, int n)
+{
+  uint64_t sum = 0;
+  for (int i = 0; i < n; i++)
+    {
+      sum += (unsigned int)x[i] * y[i];
+    }
+  return sum;
+}
+
+/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.s, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsdot\tz[0-9]+\.s, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tudot\tz[0-9]+\.d, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsdot\tz[0-9]+\.d, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\t} 8 } } */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c

index 0edcdc7ee5f35983e40ce69f1812686780fe95b0..493c1ab8c71ed1dce8f4e4c254498350d0c6ad3b 100644 (file)
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -5958,6 +5958,55 @@ is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
           <= TYPE_PRECISION (lhs_type));
  }
  
+/* Return true if masking for CODE can be done by inserting a VEC_COND_EXPR
+   on an input instead of calling COND_FN, the conditional internal function
+   for CODE (IFN_LAST if none).  VECTYPE_IN is the vector input type.  */
+static bool
+use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
+                        tree vectype_in)
+{
+  if (cond_fn != IFN_LAST
+      && direct_internal_fn_supported_p (cond_fn, vectype_in,
+                                        OPTIMIZE_FOR_SPEED))
+    return false;  /* COND_FN is supported, so no emulation is needed.  */
+
+  switch (code)
+    {
+    case DOT_PROD_EXPR:  /* Handled by build_vect_cond_expr.  */
+      return true;
+
+    default:
+      return false;
+    }
+}
+
+/* Insert a VEC_COND_EXPR before GSI to enable masked vectorization of CODE.
+   VOP is the array of operands and MASK is the loop mask.  For DOT_PROD_EXPR,
+   VOP[1] is replaced by a copy that is zero in the lanes where MASK is false.
+   Any other CODE is unreachable.  */
+static void
+build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
+                     gimple_stmt_iterator *gsi)
+{
+  switch (code)
+    {
+    case DOT_PROD_EXPR:
+      {
+       tree vectype = TREE_TYPE (vop[1]);
+       tree zero = build_zero_cst (vectype);
+       tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
+       gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
+                                              mask, vop[1], zero);
+       gsi_insert_before (gsi, select, GSI_SAME_STMT);
+       vop[1] = masked_op1;
+       break;
+      }
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
  /* Function vectorizable_reduction.
  
     Check if STMT_INFO performs a reduction operation that can be vectorized.
@@ -6931,6 +6980,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
  
    internal_fn cond_fn = get_conditional_internal_fn (code);
    vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
  
    if (!vec_stmt) /* transformation not required.  */
      {
@@ -6938,6 +6988,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
        if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
         {
           if (reduction_type != FOLD_LEFT_REDUCTION
+             && !mask_by_cond_expr
               && (cond_fn == IFN_LAST
                   || !direct_internal_fn_supported_p (cond_fn, vectype_in,
                                                       OPTIMIZE_FOR_SPEED)))
@@ -7101,7 +7152,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
        FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
          {
           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
-         if (masked_loop_p)
+         if (masked_loop_p && !mask_by_cond_expr)
             {
               /* Make sure that the reduction accumulator is vop[0].  */
               if (reduc_index == 1)
@@ -7125,6 +7176,14 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
               if (op_type == ternary_op)
                 vop[2] = vec_oprnds2[i];
  
+             if (masked_loop_p && mask_by_cond_expr)
+               {
+                 tree mask = vect_get_loop_mask (gsi, masks,
+                                                 vec_num * ncopies,
+                                                 vectype_in, i * ncopies + j);
+                 build_vect_cond_expr (code, vop, mask, gsi);
+               }
+
               gassign *new_stmt = gimple_build_assign (vec_dest, code,
                                                        vop[0], vop[1], vop[2]);
               new_temp = make_ssa_name (vec_dest, new_stmt);
author	Alejandro Martinez <alejandro.martinezvicente@arm.com>
	Thu, 2 May 2019 09:58:00 +0000 (09:58 +0000)
committer	Alejandro Martinez <alejandro@gcc.gnu.org>
	Thu, 2 May 2019 09:58:00 +0000 (09:58 +0000)
gcc/ChangeLog		patch \| blob \| history
gcc/config/aarch64/aarch64-sve.md		patch \| blob \| history
gcc/config/aarch64/iterators.md		patch \| blob \| history
gcc/testsuite/ChangeLog		patch \| blob \| history
gcc/testsuite/gcc.target/aarch64/sve/dot_1.c	[new file with mode: 0644]	patch \| blob
gcc/tree-vect-loop.c		patch \| blob \| history