return gen_altivec_lvx_v4si_internal (dest, src);
}
+static rtx
+gen_lxvl_stxvl_move (rtx dest, rtx src, int length)
+{
+ gcc_assert (MEM_P (dest) ^ MEM_P (src));
+ gcc_assert (GET_MODE (dest) == V16QImode && GET_MODE (src) == V16QImode);
+ gcc_assert (length <= 16);
+
+ bool is_store = MEM_P (dest);
+ rtx addr;
+
+ /* If the address form is not a simple register, make it so. */
+ if (is_store)
+ addr = XEXP (dest, 0);
+ else
+ addr = XEXP (src, 0);
+
+ if (!REG_P (addr))
+ addr = force_reg (Pmode, addr);
+
+ rtx len = force_reg (DImode, gen_int_mode (length, DImode));
+ if (is_store)
+ return gen_stxvl (src, addr, len);
+ else
+ return gen_lxvl (dest, addr, len);
+}
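
The helper above wraps one length-controlled vector load/store pair. For reference, the same idea is available at user level through the altivec.h built-ins vec_xl_len/vec_xst_len, which map to lxvl/stxvl on ISA 3.0 and later; a minimal standalone sketch (not part of the patch, function name illustrative, requires -mcpu=power9 or newer):

  /* Copy a tail of len <= 16 bytes with a single lxvl/stxvl pair,
     mirroring what gen_lxvl_stxvl_move emits inside the expander.  */
  #include <altivec.h>
  #include <stddef.h>

  static void
  copy_tail (unsigned char *dst, const unsigned char *src, size_t len)
  {
    vector unsigned char v = vec_xl_len ((unsigned char *) src, len);
    vec_xst_len (v, dst, len);
  }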
+
/* Expand a block move operation, and return 1 if successful. Return 0
if we should let the compiler generate normal code.
if (bytes > rs6000_block_move_inline_limit)
return 0;
+ int orig_bytes = bytes;
for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
{
union {
- rtx (*movmemsi) (rtx, rtx, rtx, rtx);
rtx (*mov) (rtx, rtx);
+ rtx (*movlen) (rtx, rtx, int);
} gen_func;
machine_mode mode = BLKmode;
rtx src, dest;
-
- /* Altivec first, since it will be faster than a string move
- when it applies, and usually not significantly larger. */
- if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
+ bool move_with_length = false;
+
+ /* Use POImode for paired vsx load/store. Use V2DI for single
+ unaligned vsx load/store, for consistency with what other
+ expansions (compare) already do, and so we can use lxvd2x on
+ p8. Order is VSX pair unaligned, VSX unaligned, Altivec, VSX
+ with length < 16 (if allowed), then gpr load/store. */
+
+ if (TARGET_MMA && TARGET_BLOCK_OPS_UNALIGNED_VSX
+ && TARGET_BLOCK_OPS_VECTOR_PAIR
+ && bytes >= 32
+ && (align >= 256 || !STRICT_ALIGNMENT))
+ {
+ move_bytes = 32;
+ mode = POImode;
+ gen_func.mov = gen_movpoi;
+ }
+ else if (TARGET_POWERPC64 && TARGET_BLOCK_OPS_UNALIGNED_VSX
+ && VECTOR_MEM_VSX_P (V2DImode)
+ && bytes >= 16 && (align >= 128 || !STRICT_ALIGNMENT))
+ {
+ move_bytes = 16;
+ mode = V2DImode;
+ gen_func.mov = gen_vsx_movv2di_64bit;
+ }
+ else if (TARGET_BLOCK_OPS_UNALIGNED_VSX
+ && TARGET_POWER10 && bytes < 16
+ && orig_bytes > 16
+ && !(bytes == 1 || bytes == 2
+ || bytes == 4 || bytes == 8)
+ && (align >= 128 || !STRICT_ALIGNMENT))
+ {
+ /* Only use lxvl/stxvl if it could replace multiple ordinary
+ loads+stores. Also don't use it unless we likely already
+ did one vsx copy so we aren't mixing gpr and vsx. */
+ move_bytes = bytes;
+ mode = V16QImode;
+ gen_func.movlen = gen_lxvl_stxvl_move;
+ move_with_length = true;
+ }
+ else if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
{
move_bytes = 16;
mode = V4SImode;
 gen_func.mov = gen_movv4si;
}
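
To make the chunk-size selection above concrete: a 45-byte unaligned copy on a Power10 target with vector-pair block ops enabled would use one 32-byte POImode pair move followed by one 13-byte lxvl/stxvl move. A small host-side sketch (illustrative only; the 64-bit, Altivec and alignment conditions are simplified away) that traces the choices:

  #include <stdio.h>

  /* Mirrors the order: vector pair (32), single VSX (16), lxvl/stxvl
     for an odd tail of a larger copy, then gpr-sized pieces.  */
  static int
  next_chunk (int bytes, int orig_bytes)
  {
    if (bytes >= 32)
      return 32;			/* POImode vector pair  */
    if (bytes >= 16)
      return 16;			/* V2DImode VSX load/store  */
    if (orig_bytes > 16
	&& bytes != 1 && bytes != 2 && bytes != 4 && bytes != 8)
      return bytes;			/* lxvl/stxvl with explicit length  */
    return bytes >= 8 ? 8 : bytes >= 4 ? 4 : bytes >= 2 ? 2 : 1;
  }

  int
  main (void)
  {
    int orig = 45;
    for (int bytes = orig; bytes > 0; )
      {
	int n = next_chunk (bytes, orig);
	printf ("copy %d bytes\n", n);
	bytes -= n;
      }
    return 0;
  }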
- /* Mode is always set to something other than BLKmode by one of the
+ /* If we can't succeed in doing the move in one pass, we can't
+ do it in the might_overlap case. Bail out and return
+ failure. We test num_reg + 1 >= MAX_MOVE_REG here to check
+ the same condition as the test of num_reg >= MAX_MOVE_REG
+ that is done below after the increment of num_reg. */
+ if (might_overlap && num_reg + 1 >= MAX_MOVE_REG
+ && bytes > move_bytes)
+ return 0;
+
+ /* Mode is always set to something other than BLKmode by one of the
cases of the if statement above. */
gcc_assert (mode != BLKmode);
      src = adjust_address (orig_src, mode, offset);
      dest = adjust_address (orig_dest, mode, offset);
rtx tmp_reg = gen_reg_rtx (mode);
-
- loads[num_reg] = (*gen_func.mov) (tmp_reg, src);
- stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
- /* If we didn't succeed in doing it in one pass, we can't do it in the
- might_overlap case. Bail out and return failure. */
- if (might_overlap && num_reg >= MAX_MOVE_REG
- && bytes > move_bytes)
- return 0;
+ if (move_with_length)
+ {
+ loads[num_reg] = (*gen_func.movlen) (tmp_reg, src, move_bytes);
+ stores[num_reg++] = (*gen_func.movlen) (dest, tmp_reg, move_bytes);
+ }
+ else
+ {
+ loads[num_reg] = (*gen_func.mov) (tmp_reg, src);
+ stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
+ }
/* Emit loads and stores saved up. */
      if (num_reg >= MAX_MOVE_REG || bytes == move_bytes)
	{
	  int i;
	  for (i = 0; i < num_reg; i++)
	    emit_insn (loads[i]);
	  for (i = 0; i < num_reg; i++)
	    emit_insn (stores[i]);
	  num_reg = 0;
	}
    }

  if (!(rs6000_isa_flags_explicit & OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX))
    {
      if (TARGET_EFFICIENT_UNALIGNED_VSX)
	rs6000_isa_flags |= OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX;
      else
	rs6000_isa_flags &= ~OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX;
    }
+ if (!(rs6000_isa_flags_explicit & OPTION_MASK_BLOCK_OPS_VECTOR_PAIR))
+ {
+ if (TARGET_MMA && TARGET_EFFICIENT_UNALIGNED_VSX)
+ rs6000_isa_flags |= OPTION_MASK_BLOCK_OPS_VECTOR_PAIR;
+ else
+ rs6000_isa_flags &= ~OPTION_MASK_BLOCK_OPS_VECTOR_PAIR;
+ }
+
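With the defaulting above and the rs6000_opt_masks entry added below, the new mask is controllable via -mblock-ops-vector-pair / -mno-block-ops-vector-pair, and should also be usable per function through the target attribute. A hedged sketch (function names and the copy size are illustrative; whether a given memcpy actually uses lxvp/stxvp still depends on size and alignment):

  #include <string.h>

  /* Compile with e.g. -mcpu=power10 -O2 and compare the two expansions.  */
  __attribute__ ((target ("no-block-ops-vector-pair")))
  void
  copy_no_pair (char *dst, const char *src)
  {
    memcpy (dst, src, 64);	/* expected to avoid paired loads/stores  */
  }

  void
  copy_default (char *dst, const char *src)
  {
    memcpy (dst, src, 64);	/* may use lxvp/stxvp pairs  */
  }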
/* Use long double size to select the appropriate long double. We use
TYPE_PRECISION to differentiate the 3 different long double types. We map
128 into the precision used for TFmode. */
static struct rs6000_opt_mask const rs6000_opt_masks[] =
{
{ "altivec", OPTION_MASK_ALTIVEC, false, true },
- { "block-ops-unaligned-vsx", OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX,
- false, true },
+ { "block-ops-unaligned-vsx", OPTION_MASK_BLOCK_OPS_UNALIGNED_VSX,
+ false, true },
+ { "block-ops-vector-pair", OPTION_MASK_BLOCK_OPS_VECTOR_PAIR,
+ false, true },
{ "cmpb", OPTION_MASK_CMPB, false, true },
{ "crypto", OPTION_MASK_CRYPTO, false, true },
{ "direct-move", OPTION_MASK_DIRECT_MOVE, false, true },