2017-10-16 Tamar Christina <tamar.christina@arm.com>
authorTamar Christina <tamar.christina@arm.com>
Mon, 16 Oct 2017 09:56:41 +0000 (09:56 +0000)
committerTamar Christina <tnfchris@gcc.gnu.org>
Mon, 16 Oct 2017 09:56:41 +0000 (09:56 +0000)
* config/aarch64/aarch64-builtins.c
(aarch64_types_quadopu_lane_qualifiers): New.
(TYPES_QUADOPU_LANE): New.
* config/aarch64/aarch64-simd.md (aarch64_<sur>dot<vsi2qi>): New.
(<sur>dot_prod<vsi2qi>, aarch64_<sur>dot_lane<vsi2qi>): New.
(aarch64_<sur>dot_laneq<vsi2qi>): New.
* config/aarch64/aarch64-simd-builtins.def (sdot, udot): New.
(sdot_lane, udot_lane, sdot_laneq, udot_laneq): New.
* config/aarch64/iterators.md (UNSPEC_SDOT, UNSPEC_UDOT): New.
(Vdottype, DOTPROD): New.
(sur): Add SDOT and UDOT.

From-SVN: r253783

gcc/ChangeLog
gcc/config/aarch64/aarch64-builtins.c
gcc/config/aarch64/aarch64-simd-builtins.def
gcc/config/aarch64/aarch64-simd.md
gcc/config/aarch64/iterators.md

index f4228ff23856b552cb48b6e1d5816a5f65658dbe..bf6c3d82907112201361d7c39a14b3b1f48a188d 100644 (file)
@@ -1,3 +1,17 @@
+2017-10-16  Tamar Christina  <tamar.christina@arm.com>
+
+       * config/aarch64/aarch64-builtins.c
+       (aarch64_types_quadopu_lane_qualifiers): New.
+       (TYPES_QUADOPU_LANE): New.
+       * config/aarch64/aarch64-simd.md (aarch64_<sur>dot<vsi2qi>): New.
+       (<sur>dot_prod<vsi2qi>, aarch64_<sur>dot_lane<vsi2qi>): New.
+       (aarch64_<sur>dot_laneq<vsi2qi>): New.
+       * config/aarch64/aarch64-simd-builtins.def (sdot, udot): New.
+       (sdot_lane, udot_lane, sdot_laneq, udot_laneq): New.
+       * config/aarch64/iterators.md (UNSPEC_SDOT, UNSPEC_UDOT): New.
+       (Vdottype, DOTPROD): New.
+       (sur): Add SDOT and UDOT.
+
 2017-10-16  Tamar Christina  <tamar.christina@arm.com>
 
        * config/aarch64/aarch64.h (AARCH64_FL_DOTPROD): New.
index 7edf75c52effd3ef83e14ff6f2ed4b827fc99e54..242b2e3dc31e1792eac5106a2ddb639d3b3f7001 100644 (file)
@@ -168,6 +168,11 @@ aarch64_types_quadop_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_none, qualifier_none, qualifier_none,
       qualifier_none, qualifier_lane_index };
 #define TYPES_QUADOP_LANE (aarch64_types_quadop_lane_qualifiers)
+/* Unsigned counterpart of TYPES_QUADOP_LANE: the four leading entries are
+   qualifier_unsigned instead of qualifier_none, and the final entry is
+   still qualifier_lane_index.  */
+static enum aarch64_type_qualifiers
+aarch64_types_quadopu_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned,
+      qualifier_unsigned, qualifier_lane_index };
+#define TYPES_QUADOPU_LANE (aarch64_types_quadopu_lane_qualifiers)
 
 static enum aarch64_type_qualifiers
 aarch64_types_binop_imm_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
index d713d5d8b88837ec6f2dc51188fb252f8d5bc8bd..52d01342372e518b1238ea14097e8f0574e9a605 100644 (file)
   BUILTIN_VSDQ_I_DI (BINOP, srshl, 0)
   BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0)
 
+  /* Implemented by aarch64_<sur>dot{_lane}{q}<vsi2qi>.  */
+  BUILTIN_VB (TERNOP, sdot, 0)
+  BUILTIN_VB (TERNOPU, udot, 0)
+  BUILTIN_VB (QUADOP_LANE, sdot_lane, 0)
+  BUILTIN_VB (QUADOPU_LANE, udot_lane, 0)
+  BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0)
+  BUILTIN_VB (QUADOPU_LANE, udot_laneq, 0)
+
   BUILTIN_VDQ_I (SHIFTIMM, ashr, 3)
   VAR1 (SHIFTIMM, ashr_simd, 0, di)
   BUILTIN_VDQ_I (SHIFTIMM, lshr, 3)
index 12da8be73e83d0325879c23a86a28ad3f9c84e77..49f615cfdbf7083d17169e72330c8985fc8efde7 100644 (file)
 }
 )
 
+;; These instructions map to the __builtins for the Dot Product operations.
+;; Operand 1 is added to the DOTPROD unspec of operands 2 and 3 and is tied
+;; to the destination by its "0" constraint.  <sur> is "s" for UNSPEC_SDOT
+;; and "u" for UNSPEC_UDOT, so the output template yields sdot or udot.
+;; <VSI2QI> is the byte-vector mode of the two inputs (V8QI for V2SI, V16QI
+;; for V4SI) and <Vdottype> is their register suffix (8b or 16b).
+(define_insn "aarch64_<sur>dot<vsi2qi>"
+  [(set (match_operand:VS 0 "register_operand" "=w")
+       (plus:VS (match_operand:VS 1 "register_operand" "0")
+               (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
+                           (match_operand:<VSI2QI> 3 "register_operand" "w")]
+               DOTPROD)))]
+  "TARGET_DOTPROD"
+  "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.<Vdottype>"
+  [(set_attr "type" "neon_dot")]
+)
+
+;; These expands map to the Dot Product optab the vectorizer checks for.
+;; The auto-vectorizer expects a dot product builtin that also does an
+;; accumulation into the provided register.
+;; Given the following pattern
+;;
+;; for (i=0; i<len; i++) {
+;;     c = a[i] * b[i];
+;;     r += c;
+;; }
+;; return r;
+;;
+;; This can be auto-vectorized to
+;; r  = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
+;;
+;; given enough iterations.  However the vectorizer can keep unrolling the loop
+;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7];
+;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11];
+;; ...
+;;
+;; and so the vectorizer provides r, in which the result has to be accumulated.
+;; Operands 1 and 2 are the byte vectors to multiply, operand 3 is the
+;; accumulator supplied by the vectorizer and operand 0 receives the sum.
+(define_expand "<sur>dot_prod<vsi2qi>"
+  [(set (match_operand:VS 0 "register_operand")
+       (plus:VS (unspec:VS [(match_operand:<VSI2QI> 1 "register_operand")
+                           (match_operand:<VSI2QI> 2 "register_operand")]
+                DOTPROD)
+               (match_operand:VS 3 "register_operand")))]
+  "TARGET_DOTPROD"
+{
+  /* Operand 3 is an input of this optab and must not be written to, so
+     accumulate directly into operand 0.  The insn ties its accumulator
+     input to its destination with a "0" constraint; the register allocator
+     inserts any copy needed to satisfy that.  */
+  emit_insn (
+    gen_aarch64_<sur>dot<vsi2qi> (operands[0], operands[3], operands[1],
+                                   operands[2]));
+  DONE;
+})
+
+;; These instructions map to the __builtins for the Dot Product
+;; indexed operations.
+;; Operand 3 is always a V8QI register here, whatever the mode of operand 2,
+;; and operand 4 is the immediate index printed in the "%3.4b[%4]" operand.
+(define_insn "aarch64_<sur>dot_lane<vsi2qi>"
+  [(set (match_operand:VS 0 "register_operand" "=w")
+       (plus:VS (match_operand:VS 1 "register_operand" "0")
+               (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
+                           (match_operand:V8QI 3 "register_operand" "<h_con>")
+                           (match_operand:SI 4 "immediate_operand" "i")]
+               DOTPROD)))]
+  "TARGET_DOTPROD"
+  {
+    /* Run the index through ENDIAN_LANE_N for V8QImode before it is
+       printed.  */
+    operands[4]
+      = GEN_INT (ENDIAN_LANE_N (V8QImode, INTVAL (operands[4])));
+    return "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.4b[%4]";
+  }
+  [(set_attr "type" "neon_dot")]
+)
+
+;; Variant of the indexed Dot Product pattern in which operand 3 is a
+;; V16QI register.  Operand 4 is the immediate index printed in the
+;; "%3.4b[%4]" operand.
+(define_insn "aarch64_<sur>dot_laneq<vsi2qi>"
+  [(set (match_operand:VS 0 "register_operand" "=w")
+       (plus:VS (match_operand:VS 1 "register_operand" "0")
+               (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
+                           (match_operand:V16QI 3 "register_operand" "<h_con>")
+                           (match_operand:SI 4 "immediate_operand" "i")]
+               DOTPROD)))]
+  "TARGET_DOTPROD"
+  {
+    /* Run the index through ENDIAN_LANE_N for V16QImode before it is
+       printed.  */
+    operands[4]
+      = GEN_INT (ENDIAN_LANE_N (V16QImode, INTVAL (operands[4])));
+    return "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.4b[%4]";
+  }
+  [(set_attr "type" "neon_dot")]
+)
+
 (define_expand "copysign<mode>3"
   [(match_operand:VHSDF 0 "register_operand")
    (match_operand:VHSDF 1 "register_operand")
index 477dc35daf6a1184be15d942c62a111604f62f3c..48cedbe84a6b33a7df86c3a7424bf83d1fdcbb7b 100644 (file)
     UNSPEC_SQRDMLSH     ; Used in aarch64-simd.md.
     UNSPEC_FMAXNM       ; Used in aarch64-simd.md.
     UNSPEC_FMINNM       ; Used in aarch64-simd.md.
+    UNSPEC_SDOT                ; Used in aarch64-simd.md.
+    UNSPEC_UDOT                ; Used in aarch64-simd.md.
 ])
 
 ;; ------------------------------------------------------------------
 (define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")])
 (define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")])
 
+
+;; Register suffix for DOTPROD input types from the return type.
+(define_mode_attr Vdottype [(V2SI "8b") (V4SI "16b")])
+
 ;; Sum of lengths of instructions needed to move vector registers of a mode.
 (define_mode_attr insn_count [(OI "8") (CI "12") (XI "16")])
 
                              UNSPEC_SHSUB UNSPEC_UHSUB
                              UNSPEC_SRHSUB UNSPEC_URHSUB])
 
+(define_int_iterator DOTPROD [UNSPEC_SDOT UNSPEC_UDOT])
 
 (define_int_iterator ADDSUBHN [UNSPEC_ADDHN UNSPEC_RADDHN
                               UNSPEC_SUBHN UNSPEC_RSUBHN])
                      (UNSPEC_USHLL  "u")  (UNSPEC_SSHLL "s")
                      (UNSPEC_URSHL  "ur") (UNSPEC_SRSHL  "sr")
                      (UNSPEC_UQRSHL  "u") (UNSPEC_SQRSHL  "s")
+                     (UNSPEC_SDOT "s") (UNSPEC_UDOT "u")
 ])
 
 (define_int_attr r [(UNSPEC_SQDMULH "") (UNSPEC_SQRDMULH "r")