return gen_altivec_lvx_v4si_internal (dest, src);
}
+static rtx
+gen_lxvl_stxvl_move (rtx dest, rtx src, int length)
+{
+ gcc_assert (MEM_P (dest) ^ MEM_P (src));
+ gcc_assert (GET_MODE (dest) == V16QImode && GET_MODE (src) == V16QImode);
+ gcc_assert (length <= 16);
+
+ bool is_store = MEM_P (dest);
+ rtx addr;
+
+ /* If the address form is not a simple register, make it so. */
+ if (is_store)
+ addr = XEXP (dest, 0);
+ else
+ addr = XEXP (src, 0);
+
+ if (!REG_P (addr))
+ addr = force_reg (Pmode, addr);
+
+ rtx len = force_reg (DImode, gen_int_mode (length, DImode));
+ if (is_store)
+ return gen_stxvl (src, addr, len);
+ else
+ return gen_lxvl (dest, addr, len);
+}
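
The helper above wraps one length-controlled vector load/store pair. For reference, the same idea is available at user level through the altivec.h built-ins vec_xl_len/vec_xst_len, which map to lxvl/stxvl on ISA 3.0 and later; a minimal standalone sketch (not part of the patch, function name illustrative, requires -mcpu=power9 or newer):

  /* Copy a tail of len <= 16 bytes with a single lxvl/stxvl pair,
     mirroring what gen_lxvl_stxvl_move emits inside the expander.  */
  #include <altivec.h>
  #include <stddef.h>

  static void
  copy_tail (unsigned char *dst, const unsigned char *src, size_t len)
  {
    vector unsigned char v = vec_xl_len ((unsigned char *) src, len);
    vec_xst_len (v, dst, len);
  }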
+
/* Expand a block move operation, and return 1 if successful. Return 0
if we should let the compiler generate normal code.
if (bytes > rs6000_block_move_inline_limit)
return 0;
+ int orig_bytes = bytes;
for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
{
union {
- rtx (*movmemsi) (rtx, rtx, rtx, rtx);
rtx (*mov) (rtx, rtx);
+ rtx (*movlen) (rtx, rtx, int);
} gen_func;
machine_mode mode = BLKmode;
rtx src, dest;
-
- /* Altivec first, since it will be faster than a string move
- when it applies, and usually not significantly larger. */
- if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
+ bool move_with_length = false;
+
+ /* Use POImode for paired vsx load/store. Use V2DI for single
+ unaligned vsx load/store, for consistency with what other
+ expansions (compare) already do, and so we can use lxvd2x on
+ p8. Order is VSX pair unaligned, VSX unaligned, Altivec, VSX
+ with length < 16 (if allowed), then gpr load/store. */
+
+ if (TARGET_MMA && TARGET_BLOCK_OPS_UNALIGNED_VSX
+ && TARGET_BLOCK_OPS_VECTOR_PAIR
+ && bytes >= 32
+ && (align >= 256 || !STRICT_ALIGNMENT))
+ {
+ move_bytes = 32;
+ mode = POImode;
+ gen_func.mov = gen_movpoi;
+ }
+ else if (TARGET_POWERPC64 && TARGET_BLOCK_OPS_UNALIGNED_VSX
+ && VECTOR_MEM_VSX_P (V2DImode)
+ && bytes >= 16 && (align >= 128 || !STRICT_ALIGNMENT))
+ {
+ move_bytes = 16;
+ mode = V2DImode;
+ gen_func.mov = gen_vsx_movv2di_64bit;
+ }
+ else if (TARGET_BLOCK_OPS_UNALIGNED_VSX
+ && TARGET_POWER10 && bytes < 16
+ && orig_bytes > 16
+ && !(bytes == 1 || bytes == 2
+ || bytes == 4 || bytes == 8)
+ && (align >= 128 || !STRICT_ALIGNMENT))
+ {
+ /* Only use lxvl/stxvl if it could replace multiple ordinary
+ loads+stores. Also don't use it unless we likely already
+ did one vsx copy so we aren't mixing gpr and vsx. */
+ move_bytes = bytes;
+ mode = V16QImode;
+ gen_func.movlen = gen_lxvl_stxvl_move;
+ move_with_length = true;
+ }
+ else if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
{
move_bytes = 16;
mode = V4SImode;
 gen_func.mov = gen_movv4si;
}
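
To make the chunk-size selection above concrete: a 45-byte unaligned copy on a Power10 target with vector-pair block ops enabled would use one 32-byte POImode pair move followed by one 13-byte lxvl/stxvl move. A small host-side sketch (illustrative only; the 64-bit, Altivec and alignment conditions are simplified away) that traces the choices:

  #include <stdio.h>

  /* Mirrors the order: vector pair (32), single VSX (16), lxvl/stxvl
     for an odd tail of a larger copy, then gpr-sized pieces.  */
  static int
  next_chunk (int bytes, int orig_bytes)
  {
    if (bytes >= 32)
      return 32;			/* POImode vector pair  */
    if (bytes >= 16)
      return 16;			/* V2DImode VSX load/store  */
    if (orig_bytes > 16
	&& bytes != 1 && bytes != 2 && bytes != 4 && bytes != 8)
      return bytes;			/* lxvl/stxvl with explicit length  */
    return bytes >= 8 ? 8 : bytes >= 4 ? 4 : bytes >= 2 ? 2 : 1;
  }

  int
  main (void)
  {
    int orig = 45;
    for (int bytes = orig; bytes > 0; )
      {
	int n = next_chunk (bytes, orig);
	printf ("copy %d bytes\n", n);
	bytes -= n;
      }
    return 0;
  }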
- /* Mode is always set to something other than BLKmode by one of the
+ /* If we can't succeed in doing the move in one pass, we can't
+ do it in the might_overlap case. Bail out and return
+ failure. We test num_reg + 1 >= MAX_MOVE_REG here to check
+ the same condition as the test of num_reg >= MAX_MOVE_REG
+ that is done below after the increment of num_reg. */
+ if (might_overlap && num_reg + 1 >= MAX_MOVE_REG
+ && bytes > move_bytes)
+ return 0;
+
+ /* Mode is always set to something other than BLKmode by one of the
cases of the if statement above. */
gcc_assert (mode != BLKmode);
      src = adjust_address (orig_src, mode, offset);
      dest = adjust_address (orig_dest, mode, offset);
rtx tmp_reg = gen_reg_rtx (mode);
-
- loads[num_reg] = (*gen_func.mov) (tmp_reg, src);
- stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
- /* If we didn't succeed in doing it in one pass, we can't do it in the
- might_overlap case. Bail out and return failure. */
- if (might_overlap && num_reg >= MAX_MOVE_REG
- && bytes > move_bytes)
- return 0;
+ if (move_with_length)
+ {
+ loads[num_reg] = (*gen_func.movlen) (tmp_reg, src, move_bytes);
+ stores[num_reg++] = (*gen_func.movlen) (dest, tmp_reg, move_bytes);
+ }
+ else
+ {
+ loads[num_reg] = (*gen_func.mov) (tmp_reg, src);
+ stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
+ }
/* Emit loads and stores saved up. */
      if (num_reg >= MAX_MOVE_REG || bytes == move_bytes)
	{
	  int i;
	  for (i = 0; i < num_reg; i++)
	    emit_insn (loads[i]);
	  for (i = 0; i < num_reg; i++)
	    emit_insn (stores[i]);
	  num_reg = 0;
	}
    }

  if (!(rs6000_isa_flags_explicit & OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX))
    {
      if (TARGET_EFFICIENT_UNALIGNED_VSX)
	rs6000_isa_flags |= OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX;
      else
	rs6000_isa_flags &= ~OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX;
    }
+ if (!(rs6000_isa_flags_explicit & OPTION_MASK_BLOCK_OPS_VECTOR_PAIR))
+ {
+ if (TARGET_MMA && TARGET_EFFICIENT_UNALIGNED_VSX)
+ rs6000_isa_flags |= OPTION_MASK_BLOCK_OPS_VECTOR_PAIR;
+ else
+ rs6000_isa_flags &= ~OPTION_MASK_BLOCK_OPS_VECTOR_PAIR;
+ }
+
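With the defaulting above and the rs6000_opt_masks entry added below, the new mask is controllable via -mblock-ops-vector-pair / -mno-block-ops-vector-pair, and should also be usable per function through the target attribute. A hedged sketch (function names and the copy size are illustrative; whether a given memcpy actually uses lxvp/stxvp still depends on size and alignment):

  #include <string.h>

  /* Compile with e.g. -mcpu=power10 -O2 and compare the two expansions.  */
  __attribute__ ((target ("no-block-ops-vector-pair")))
  void
  copy_no_pair (char *dst, const char *src)
  {
    memcpy (dst, src, 64);	/* expected to avoid paired loads/stores  */
  }

  void
  copy_default (char *dst, const char *src)
  {
    memcpy (dst, src, 64);	/* may use lxvp/stxvp pairs  */
  }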
/* Use long double size to select the appropriate long double. We use
TYPE_PRECISION to differentiate the 3 different long double types. We map
128 into the precision used for TFmode. */
static struct rs6000_opt_mask const rs6000_opt_masks[] =
{
{ "altivec", OPTION_MASK_ALTIVEC, false, true },
- { "block-ops-unaligned-vsx", OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX,
- false, true },
+ { "block-ops-unaligned-vsx", OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX,
+ false, true },
+ { "block-ops-vector-pair", OPTION_MASK_BLOCK_OPS_VECTOR_PAIR,
+ false, true },
{ "cmpb", OPTION_MASK_CMPB, false, true },
{ "crypto", OPTION_MASK_CRYPTO, false, true },
{ "direct-move", OPTION_MASK_DIRECT_MOVE, false, true },