+2008-05-15 H.J. Lu <hongjiu.lu@intel.com>
+
+ * config/i386/i386.c (ix86_expand_vector_init_general): Optimize
+ V8HImode for SSE2 and V16QImode for SSE4.1.
+
2008-05-15 Kenneth Zadeck <zadeck@naturalbridge.com>
* cgraph.h (compute_inline_parameters): Made public.
break;
case V8HImode:
+ if (TARGET_SSE2)
+ {
+ rtx ops[4];
+ unsigned int i, j;
+
+ for (i = 0; i < ARRAY_SIZE (ops); i++)
+ {
+ /* Extend the odd element from HImode to SImode using
+ a paradoxical SUBREG. */
+ op0 = gen_reg_rtx (SImode);
+ emit_move_insn (op0, gen_lowpart (SImode,
+ XVECEXP (vals, 0,
+ i + i)));
+
+ /* Insert the SImode value as low element of V4SImode
+ vector. */
+ op1 = gen_reg_rtx (V4SImode);
+ op0 = gen_rtx_VEC_MERGE (V4SImode,
+ gen_rtx_VEC_DUPLICATE (V4SImode,
+ op0),
+ CONST0_RTX (V4SImode),
+ const1_rtx);
+ emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
+
+ /* Cast the V4SImode vector back to a V8HImode vector. */
+ op0 = gen_reg_rtx (mode);
+ emit_move_insn (op0, gen_lowpart (mode, op1));
+
+ /* Load even HI elements into the second position. */
+ emit_insn (gen_vec_setv8hi (op0, XVECEXP (vals, 0,
+ i + i + 1),
+ const1_rtx));
+
+ /* Cast V8HImode vector to V4SImode vector. */
+ ops[i] = gen_reg_rtx (V4SImode);
+ emit_move_insn (ops[i], gen_lowpart (V4SImode, op0));
+ }
+
+ /* Interleave low V4SIs. */
+ for (i = j = 0; i < ARRAY_SIZE (ops); i += 2, j++)
+ {
+ op0 = gen_reg_rtx (V4SImode);
+ emit_insn (gen_vec_interleave_lowv4si (op0, ops[i],
+ ops[i + 1]));
+
+ /* Cast V4SImode vectors to V2DImode vectors. */
+ op1 = gen_reg_rtx (V2DImode);
+ emit_move_insn (op1, gen_lowpart (V2DImode, op0));
+ ops[j] = op1;
+ }
+
+ /* Interleave low V2DIs. */
+ op0 = gen_reg_rtx (V2DImode);
+ emit_insn (gen_vec_interleave_lowv2di (op0, ops[0], ops[1]));
+
+ /* Cast the V2DImode vector back to a V8HImode vector. */
+ emit_insn (gen_rtx_SET (VOIDmode, target,
+ gen_lowpart (mode, op0)));
+ return;
+ }
+
case V16QImode:
+ if (TARGET_SSE4_1)
+ {
+ rtx ops[8];
+ unsigned int i, j;
+
+ for (i = 0; i < ARRAY_SIZE (ops); i++)
+ {
+ /* Extend the odd element from QImode to SImode using
+ a paradoxical SUBREG. */
+ op0 = gen_reg_rtx (SImode);
+ emit_move_insn (op0, gen_lowpart (SImode,
+ XVECEXP (vals, 0,
+ i + i)));
+
+ /* Insert the SImode value as low element of V4SImode
+ vector. */
+ op1 = gen_reg_rtx (V4SImode);
+ op0 = gen_rtx_VEC_MERGE (V4SImode,
+ gen_rtx_VEC_DUPLICATE (V4SImode,
+ op0),
+ CONST0_RTX (V4SImode),
+ const1_rtx);
+ emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
+
+ /* Cast the V4SImode vector back to a V16QImode vector. */
+ op0 = gen_reg_rtx (mode);
+ emit_move_insn (op0, gen_lowpart (mode, op1));
+
+ /* Load even QI elements into the second position. */
+ emit_insn (gen_vec_setv16qi (op0, XVECEXP (vals, 0,
+ i + i + 1),
+ const1_rtx));
+
+ /* Cast V16QImode vector to V8HImode vector. */
+ ops[i] = gen_reg_rtx (V8HImode);
+ emit_move_insn (ops[i], gen_lowpart (V8HImode, op0));
+ }
+
+ /* Interleave low V8HIs. */
+ for (i = j = 0; i < ARRAY_SIZE (ops); i += 2, j++)
+ {
+ op0 = gen_reg_rtx (V8HImode);
+ emit_insn (gen_vec_interleave_lowv8hi (op0, ops[i],
+ ops[i + 1]));
+
+ /* Cast V8HImode vector to V4SImode vector. */
+ op1 = gen_reg_rtx (V4SImode);
+ emit_move_insn (op1, gen_lowpart (V4SImode, op0));
+ ops[j] = op1;
+ }
+
+ /* Interleave low V4SIs. */
+ for (i = j = 0; i < ARRAY_SIZE (ops) / 2; i += 2, j++)
+ {
+ op0 = gen_reg_rtx (V4SImode);
+ emit_insn (gen_vec_interleave_lowv4si (op0, ops[i],
+ ops[i + 1]));
+
+ /* Cast V4SImode vectors to V2DImode vectors. */
+ op1 = gen_reg_rtx (V2DImode);
+ emit_move_insn (op1, gen_lowpart (V2DImode, op0));
+ ops[j] = op1;
+ }
+
+ /* Interleave low V2DIs. */
+ op0 = gen_reg_rtx (V2DImode);
+ emit_insn (gen_vec_interleave_lowv2di (op0, ops[0], ops[1]));
+
+ /* Cast the V2DImode vector back to a V16QImode vector. */
+ emit_insn (gen_rtx_SET (VOIDmode, target,
+ gen_lowpart (mode, op0)));
+ return;
+ }
+
case V4HImode:
case V8QImode:
break;
+2008-05-15 H.J. Lu <hongjiu.lu@intel.com>
+
+ * gcc.target/i386/m128-check.h: New.
+ * gcc.target/i386/set-v16qi-1.h: Likewise.
+ * gcc.target/i386/set-v16qi-2.h: Likewise.
+ * gcc.target/i386/set-v8hi-1.h: Likewise.
+ * gcc.target/i386/set-v8hi-2.h: Likewise.
+ * gcc.target/i386/sse2-set-v16qi-1.c: Likewise.
+ * gcc.target/i386/sse2-set-v16qi-2.c: Likewise.
+ * gcc.target/i386/sse2-set-v8hi-1.c: Likewise.
+ * gcc.target/i386/sse2-set-v8hi-2.c: Likewise.
+ * gcc.target/i386/sse4_1-set-v16qi-1.c: Likewise.
+ * gcc.target/i386/sse4_1-set-v16qi-2.c: Likewise.
+
+ * gcc.target/i386/sse2-check.h: Include m128-check.h. Don't
+ include <stdio.h>.
+ * gcc.target/i386/sse4_1-check.h: Likewise.
+
2008-05-15 Adam Nemet <anemet@caviumnetworks.com>
PR middle-end/36194
--- /dev/null
+#include <stdio.h>
+#include <emmintrin.h>
+
+typedef union /* A __m128i overlaid with an array of 16 char. */
+{
+ __m128i x;
+ char a[16];
+} union128i_b;
+
+typedef union /* A __m128i overlaid with an array of 8 short. */
+{
+ __m128i x;
+ short a[8];
+} union128i_w;
+
+typedef union /* A __m128i overlaid with an array of 4 int. */
+{
+ __m128i x;
+ int a[4];
+} union128i_d;
+
+typedef union /* A __m128i overlaid with an array of 2 long long. */
+{
+ __m128i x;
+ long long a[2];
+} union128i_q;
+
+typedef union /* A __m128 overlaid with an array of 4 float. */
+{
+ __m128 x;
+ float a[4];
+} union128;
+
+typedef union /* A __m128d overlaid with an array of 2 double. */
+{
+ __m128d x;
+ double a[2];
+} union128d;
+
+#ifdef DEBUG /* With DEBUG defined, PRINTF forwards to printf. */
+#define PRINTF printf
+#else /* Otherwise every PRINTF (...) call expands to nothing. */
+#define PRINTF(...)
+#endif
+
+/* Define a checker named check_<UNION_TYPE>.  It compares every element
+   of U.a with the corresponding entry of the expected array V, reports
+   each difference through PRINTF (printing both values with FMT), and
+   returns the number of differing elements, so zero means U matches V.  */
+#define CHECK_EXP(UNION_TYPE, VALUE_TYPE, FMT) \
+static int \
+__attribute__((noinline, unused)) \
+check_##UNION_TYPE (UNION_TYPE u, const VALUE_TYPE *v) \
+{ \
+  int mismatches = 0; \
+  int i; \
+ \
+  for (i = 0; i < sizeof (u.a) / sizeof (u.a[0]); i++) \
+    { \
+      /* Matching elements need no report.  */ \
+      if (u.a[i] == v[i]) \
+        continue; \
+ \
+      mismatches++; \
+      PRINTF ("%i: " FMT " != " FMT "\n", \
+              i, v[i], u.a[i]); \
+    } \
+  return mismatches; \
+}
+
+CHECK_EXP (union128i_b, char, "%d") /* Defines check_union128i_b. */
+CHECK_EXP (union128i_w, short, "%d") /* Defines check_union128i_w. */
+CHECK_EXP (union128i_d, int, "0x%x") /* Defines check_union128i_d. */
+CHECK_EXP (union128i_q, long long, "0x%llx") /* Defines check_union128i_q. */
+CHECK_EXP (union128, float, "%f") /* Defines check_union128. */
+CHECK_EXP (union128d, double, "%f") /* Defines check_union128d. */
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#include CHECK_H
+
+static __m128i /* Build a __m128i from the 16 chars at V; the elements are
+ handed to _mm_set_epi8 from v[15] down to v[0]. */
+__attribute__((noinline))
+foo (char *v)
+{
+ return _mm_set_epi8 (v[15], v[14], v[13], v[12],
+ v[11], v[10], v[9], v[8],
+ v[7], v[6], v[5], v[4],
+ v[3], v[2], v[1], v[0]);
+}
+
+static void /* Test body.  TEST is a macro supplied by the including file
+ (the drivers in this patch define it as sse2_test or sse4_1_test). */
+TEST (void)
+{
+ char v[16] =
+ {
+ -3, 60, 48, 104, -90, 37, -48, 78,
+ 4, 33, 81, 4, -89, 17, 8, 68
+ };
+ union128i_b u;
+
+ u.x = foo (v); /* Vector built from the array by the noinline helper. */
+ if (check_union128i_b (u, v)) /* Non-zero: some u.a[i] differs from v[i]. */
+ abort ();
+}
--- /dev/null
+#include CHECK_H
+
+static __m128i /* Forward the 16 char arguments to _mm_set_epi8 in the
+ order received, x1 first. */
+__attribute__((noinline))
+foo (char x1, char x2, char x3, char x4,
+ char x5, char x6, char x7, char x8,
+ char x9, char x10, char x11, char x12,
+ char x13, char x14, char x15, char x16)
+{
+ return _mm_set_epi8 (x1, x2, x3, x4, x5, x6, x7, x8,
+ x9, x10, x11, x12, x13, x14, x15, x16);
+}
+
+static void /* Test body.  TEST is a macro supplied by the including file
+ (the drivers in this patch define it as sse2_test or sse4_1_test). */
+TEST (void)
+{
+ char v[16] =
+ {
+ -3, 60, 48, 104, -90, 37, -48, 78,
+ 4, 33, 81, 4, -89, 17, 8, 68
+ };
+ union128i_b u;
+
+ u.x = foo (v[15], v[14], v[13], v[12], /* Scalars passed from v[15] down to v[0]. */
+ v[11], v[10], v[9], v[8],
+ v[7], v[6], v[5], v[4],
+ v[3], v[2], v[1], v[0]);
+ if (check_union128i_b (u, v)) /* Non-zero: some u.a[i] differs from v[i]. */
+ abort ();
+}
--- /dev/null
+#include CHECK_H
+
+static __m128i /* Build a __m128i from the 8 shorts at V; the elements are
+ handed to _mm_set_epi16 from v[7] down to v[0]. */
+__attribute__((noinline))
+foo (short *v)
+{
+ return _mm_set_epi16 (v[7], v[6], v[5], v[4], v[3], v[2], v[1], v[0]);
+}
+
+static void /* Test body.  TEST is a macro supplied by the including file
+ (the driver in this patch defines it as sse2_test). */
+TEST (void)
+{
+ /* NOTE(review): 34567 and 34678 exceed the range of a 16-bit short, so
+ the stored values depend on the implementation-defined conversion;
+ presumably intentional, confirm. */
+ short v[8] = { -3, 6000, 48, 104, -90, 34567, -1248, 34678 };
+ union128i_w u;
+
+ u.x = foo (v); /* Vector built from the array by the noinline helper. */
+ if (check_union128i_w (u, v)) /* Non-zero: some u.a[i] differs from v[i]. */
+ abort ();
+}
--- /dev/null
+#include CHECK_H
+
+__m128i /* Forward the 8 short arguments to _mm_set_epi16 in the order
+ received, x1 first.  NOTE(review): unlike the foo helpers in the other
+ set-*.h files of this patch, this one is not declared static; confirm
+ whether that is intended. */
+__attribute__((noinline))
+foo (short x1, short x2, short x3, short x4,
+ short x5, short x6, short x7, short x8)
+{
+ return _mm_set_epi16 (x1, x2, x3, x4, x5, x6, x7, x8);
+}
+
+static void /* Test body.  TEST is a macro supplied by the including file
+ (the driver in this patch defines it as sse2_test). */
+TEST (void)
+{
+ short v[8] = { -3, 2, 1, 9, 23, -173, -13, 69 };
+ union128i_w u;
+
+ u.x = foo (v[7], v[6], v[5], v[4], v[3], v[2], v[1], v[0]); /* Scalars passed from v[7] down to v[0]. */
+
+ if (check_union128i_w (u, v)) /* Non-zero: some u.a[i] differs from v[i]. */
+ abort ();
+}
-#include <stdio.h>
#include <stdlib.h>
-
#include "cpuid.h"
+#include "m128-check.h"
static void sse2_test (void);
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2" } */
+
+#define CHECK_H "sse2-check.h"
+#define TEST sse2_test
+
+#include "set-v16qi-1.h"
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2" } */
+
+#define CHECK_H "sse2-check.h"
+#define TEST sse2_test
+
+#include "set-v16qi-2.h"
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2" } */
+
+#define CHECK_H "sse2-check.h"
+#define TEST sse2_test
+
+#include "set-v8hi-1.h"
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2" } */
+
+#define CHECK_H "sse2-check.h"
+#define TEST sse2_test
+
+#include "set-v8hi-2.h"
-#include <stdio.h>
#include <stdlib.h>
#include "cpuid.h"
+#include "m128-check.h"
static void sse4_1_test (void);
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#define CHECK_H "sse4_1-check.h"
+#define TEST sse4_1_test
+
+#include "set-v16qi-1.h"
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -msse4.1" } */
+
+#define CHECK_H "sse4_1-check.h"
+#define TEST sse4_1_test
+
+#include "set-v16qi-2.h"