+2014-09-04 Bill Schmidt <wschmidt@linux.vnet.ibm.com>
+
+ * config/rs6000/rs6000.c (special_handling_values): Add
+ SH_EXTRACT.
+ (rtx_is_swappable_p): Look for patterns with a VEC_SELECT, perhaps
+ wrapped in a VEC_DUPLICATE, representing an extract. Mark these
+ as swappable with special handling SH_EXTRACT. Remove
+ UNSPEC_VSX_XXSPLTW from the list of disallowed unspecs for the
+ optimization.
+ (adjust_extract): New function.
+ (handle_special_swappables): Add default to case statement; add
+ case for SH_EXTRACT that calls adjust_extract.
+ (dump_swap_insn_table): Handle SH_EXTRACT.
+
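As a sketch of what this enables (illustrative example, not part of the patch; the function name is made up): on little-endian, vec_extract becomes a VEC_SELECT in RTL, which the swap pass can now keep in a doubleword-swapped web by renumbering the lane instead of emitting a swap.

#include <altivec.h>

/* Illustrative only: extracting a lane from a vector loaded with
   lxvd2x.  With SH_EXTRACT handling, the pass can adjust the selected
   lane rather than keep an xxpermdi to swap the doublewords.  */
long long get_first (long long *p)
{
  vector long long v = vec_vsx_ld (0, (vector long long *) p);
  return vec_extract (v, 0);
}
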
2014-09-04 Bill Schmidt <wschmidt@linux.vnet.ibm.com>
* config/rs6000/vsx.md (*vsx_extract_<mode>_load): Always match
SH_CONST_VECTOR,
SH_SUBREG,
SH_NOSWAP_LD,
- SH_NOSWAP_ST
+ SH_NOSWAP_ST,
+ SH_EXTRACT
};
/* Union INSN with all insns containing definitions that reach USE.
{
enum rtx_code code = GET_CODE (op);
int i, j;
+ rtx parallel;
switch (code)
{
return 1;
case VEC_CONCAT:
- case VEC_SELECT:
case ASM_INPUT:
case ASM_OPERANDS:
return 0;
handling. */
if (GET_CODE (XEXP (op, 0)) == CONST_INT)
return 1;
+ else if (GET_CODE (XEXP (op, 0)) == REG
+ && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0)))
+ /* This catches V2DF and V2DI splat, at a minimum. */
+ return 1;
+ else if (GET_CODE (XEXP (op, 0)) == VEC_SELECT)
+ /* If the duplicated item is from a select, defer to the select
+ processing to see if we can change the lane for the splat. */
+ return rtx_is_swappable_p (XEXP (op, 0), special);
+ else
+ return 0;
+
+ case VEC_SELECT:
+ /* A vec_extract operation is ok if we change the lane. */
+ if (GET_CODE (XEXP (op, 0)) == REG
+ && GET_MODE_INNER (GET_MODE (XEXP (op, 0))) == GET_MODE (op)
+ && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL
+ && XVECLEN (parallel, 0) == 1
+ && GET_CODE (XVECEXP (parallel, 0, 0)) == CONST_INT)
+ {
+ *special = SH_EXTRACT;
+ return 1;
+ }
else
return 0;
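
The VEC_DUPLICATE-wrapped form accepted above typically arises from splatting a single extracted lane; a minimal illustrative source fragment (hypothetical function name, not part of the patch) is:

#include <altivec.h>

/* Illustrative only: vec_splats of one extracted element forms
   (vec_duplicate (vec_select ...)) in RTL (an xxspltw for V4SF),
   so swappability defers to the VEC_SELECT case above.  */
vector float splat_lane1 (vector float v)
{
  return vec_splats (v[1]);
}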
|| val == UNSPEC_VSX_CVSPDPN
|| val == UNSPEC_VSX_SET
|| val == UNSPEC_VSX_SLDWI
- || val == UNSPEC_VSX_XXSPLTW
|| val == UNSPEC_VUNPACK_HI_SIGN
|| val == UNSPEC_VUNPACK_HI_SIGN_DIRECT
|| val == UNSPEC_VUNPACK_LO_SIGN
INSN_UID (insn));
}
+/* Given INSN that contains a vector extract operation, change the index
+   of the extracted lane to count from the other side of the vector.  */
+static void
+adjust_extract (rtx_insn *insn)
+{
+ rtx body = PATTERN (insn);
+ /* The vec_select may be wrapped in a vec_duplicate for a splat, so
+ account for that. */
+  rtx sel = (GET_CODE (XEXP (body, 1)) == VEC_DUPLICATE
+             ? XEXP (XEXP (body, 1), 0)
+             : XEXP (body, 1));
+ rtx par = XEXP (sel, 1);
+ int nunits = GET_MODE_NUNITS (GET_MODE (XEXP (sel, 0)));
+ XVECEXP (par, 0, 0) = GEN_INT (nunits - 1 - INTVAL (XVECEXP (par, 0, 0)));
+ INSN_CODE (insn) = -1; /* Force re-recognition. */
+ df_insn_rescan (insn);
+
+ if (dump_file)
+ fprintf (dump_file, "Changing lane for extract %d\n", INSN_UID (insn));
+}
+
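A host-runnable sketch of the index arithmetic in adjust_extract (the helper name is made up): once the doubleword swap is eliminated, the selected lane must be counted from the other end of the vector.

#include <assert.h>

/* Mirrors GEN_INT (nunits - 1 - INTVAL (XVECEXP (par, 0, 0))).  */
static int adjusted_lane (int nunits, int lane)
{
  return nunits - 1 - lane;
}

int main (void)
{
  assert (adjusted_lane (2, 0) == 1);   /* V2DI: lane 0 -> lane 1.  */
  assert (adjusted_lane (2, 1) == 0);   /* V2DI: lane 1 -> lane 0.  */
  assert (adjusted_lane (4, 1) == 2);   /* V4SI: lane 1 -> lane 2.  */
  return 0;
}
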
/* The insn described by INSN_ENTRY[I] can be swapped, but only
with special handling. Take care of that here. */
static void
switch (insn_entry[i].special_handling)
{
+ default:
+ gcc_unreachable ();
case SH_CONST_VECTOR:
{
/* A CONST_VECTOR will only show up somewhere in the RHS of a SET. */
/* Convert a non-permuting store to a permuting one. */
permute_store (insn);
break;
+ case SH_EXTRACT:
+ /* Change the lane on an extract operation. */
+ adjust_extract (insn);
}
}
fputs ("special:load ", dump_file);
else if (insn_entry[i].special_handling == SH_NOSWAP_ST)
fputs ("special:store ", dump_file);
+ else if (insn_entry[i].special_handling == SH_EXTRACT)
+ fputs ("special:extract ", dump_file);
}
if (insn_entry[i].web_not_optimizable)
fputs ("unoptimizable ", dump_file);
+2014-09-04 Bill Schmidt <wschmidt@linux.vnet.ibm.com>
+
+ * gcc.target/powerpc/swaps-p8-13.c: New test.
+ * gcc.target/powerpc/swaps-p8-14.c: New test.
+ * gcc.target/powerpc/swaps-p8-15.c: New test.
+
2014-09-04 Bill Schmidt <wschmidt@linux.vnet.ibm.com>
* gcc.target/powerpc/vsx-extract-1.c: Test 0th doubleword
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-13.c
+/* { dg-do run { target { powerpc64le-*-* } } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+
+#include <altivec.h>
+void abort ();
+
+#define N 4096
+long long ca[N] __attribute__((aligned(16)));
+long long cb[N] __attribute__((aligned(16)));
+long long cc[N] __attribute__((aligned(16)));
+long long cd[N] __attribute__((aligned(16)));
+long long x;
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ vector long long va, vb, vc, vd, tmp;
+ volatile unsigned long long three = 3;
+ vector unsigned long long threes = vec_splats (three);
+ for (i = 0; i < N; i+=2) {
+ vb = vec_vsx_ld (0, (vector long long *)&cb[i]);
+ vc = vec_vsx_ld (0, (vector long long *)&cc[i]);
+ vd = vec_vsx_ld (0, (vector long long *)&cd[i]);
+ tmp = vec_add (vb, vc);
+ tmp = vec_sub (tmp, vd);
+ tmp = vec_sra (tmp, threes);
+ x = vec_extract (tmp, 0);
+ vec_vsx_st (tmp, 0, (vector long long *)&ca[i]);
+ }
+}
+
+__attribute__((noinline)) void init ()
+{
+ int i;
+ for (i = 0; i < N; ++i) {
+ cb[i] = 3 * i - 2048;
+ cc[i] = -5 * i + 93;
+ cd[i] = i + 14;
+ }
+}
+
+int main ()
+{
+ int i;
+ init ();
+ foo ();
+ for (i = 0; i < N; ++i)
+ if (ca[i] != (-3 * i - 1969) >> 3)
+ abort ();
+  if (x != ca[N-2])
+ abort ();
+ return 0;
+}
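
The expected values in main can be cross-checked on any host: each element is ((cb + cc - cd) >> 3), which simplifies to ((-3*i - 1969) >> 3), and x is written in the final iteration (i == N-2), where vec_extract (tmp, 0) on little-endian yields the element stored to ca[i]. A standalone scalar check (a sketch, not part of the test):

#include <stdlib.h>

int main (void)
{
  int i;
  for (i = 0; i < 4096; ++i)
    {
      long long cb = 3 * i - 2048, cc = -5 * i + 93, cd = i + 14;
      if (((cb + cc - cd) >> 3) != ((-3 * i - 1969) >> 3))
        abort ();
    }
  return 0;
}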
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-14.c
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler "stxsdx" } } */
+/* { dg-final { scan-assembler-times "xxpermdi" 1 } } */
+
+/* The only xxpermdi expected is for the vec_splats. */
+
+#include <altivec.h>
+void abort ();
+
+#define N 4096
+long long ca[N] __attribute__((aligned(16)));
+long long cb[N] __attribute__((aligned(16)));
+long long cc[N] __attribute__((aligned(16)));
+long long cd[N] __attribute__((aligned(16)));
+long long x;
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ vector long long va, vb, vc, vd, tmp;
+ volatile unsigned long long three = 3;
+ vector unsigned long long threes = vec_splats (three);
+ for (i = 0; i < N; i+=2) {
+ vb = vec_vsx_ld (0, (vector long long *)&cb[i]);
+ vc = vec_vsx_ld (0, (vector long long *)&cc[i]);
+ vd = vec_vsx_ld (0, (vector long long *)&cd[i]);
+ tmp = vec_add (vb, vc);
+ tmp = vec_sub (tmp, vd);
+ tmp = vec_sra (tmp, threes);
+ x = vec_extract (tmp, 0);
+ vec_vsx_st (tmp, 0, (vector long long *)&ca[i]);
+ }
+}
+
+int main ()
+{
+ foo ();
+ return 0;
+}
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/swaps-p8-15.c
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-options "-mcpu=power8 -O3" } */
+/* { dg-final { scan-assembler "lxvd2x" } } */
+/* { dg-final { scan-assembler "stxvd2x" } } */
+/* { dg-final { scan-assembler "xxspltw" } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */
+
+#include <altivec.h>
+void abort();
+
+typedef struct xx {vector double l; vector double h;} xx;
+
+#define N 4096
+#define M 10000000
+vector float ca[N][4] = {0};
+vector float cb[N][4] = {0};
+vector float cc[N][4] = {0};
+
+__attribute__((noinline)) void foo ()
+{
+ int i;
+ for (i = 0; i < N; i++) {
+ cc[i][0] = vec_mul(vec_splats(cb[i][0][0]), ca[i][0]);
+ cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][1]), ca[i][1]);
+ cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][2]), ca[i][2]);
+ cc[i][0] = vec_madd(cc[i][0],vec_splats(cb[i][0][3]), ca[i][3]);
+
+ cc[i][1] = vec_mul(vec_splats(cb[i][1][0]), ca[i][0]);
+    cc[i][1] = vec_madd(cc[i][1],vec_splats(cb[i][1][1]), ca[i][1]);
+    cc[i][1] = vec_madd(cc[i][1],vec_splats(cb[i][1][2]), ca[i][2]);
+    cc[i][1] = vec_madd(cc[i][1],vec_splats(cb[i][1][3]), ca[i][3]);
+
+ cc[i][2] = vec_mul(vec_splats(cb[i][2][0]), ca[i][0]);
+    cc[i][2] = vec_madd(cc[i][2],vec_splats(cb[i][2][1]), ca[i][1]);
+    cc[i][2] = vec_madd(cc[i][2],vec_splats(cb[i][2][2]), ca[i][2]);
+    cc[i][2] = vec_madd(cc[i][2],vec_splats(cb[i][2][3]), ca[i][3]);
+
+ cc[i][3] = vec_mul(vec_splats(cb[i][3][0]), ca[i][0]);
+    cc[i][3] = vec_madd(cc[i][3],vec_splats(cb[i][3][1]), ca[i][1]);
+    cc[i][3] = vec_madd(cc[i][3],vec_splats(cb[i][3][2]), ca[i][2]);
+    cc[i][3] = vec_madd(cc[i][3],vec_splats(cb[i][3][3]), ca[i][3]);
+ }
+}
+
+int main ()
+{
+ foo ();
+ return 0;
+}