From 287cc750b0887e86cb309d976b17c7ee95f7ad48 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Mon, 19 Oct 2020 16:04:39 +0800
Subject: [PATCH] Support variable index vec_set.

gcc/ChangeLog:

	PR target/97194
	* config/i386/i386-expand.c (ix86_expand_vector_set_var): New function.
	* config/i386/i386-protos.h (ix86_expand_vector_set_var): New Decl.
	* config/i386/predicates.md (vec_setm_operand): New predicate,
	true for const_int_operand or register_operand under TARGET_AVX2.
	* config/i386/sse.md (vec_set<mode>): Support both constant
	and variable index vec_set.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx2-vec-set-1.c: New test.
	* gcc.target/i386/avx2-vec-set-2.c: New test.
	* gcc.target/i386/avx512bw-vec-set-1.c: New test.
	* gcc.target/i386/avx512bw-vec-set-2.c: New test.
	* gcc.target/i386/avx512f-vec-set-2.c: New test.
	* gcc.target/i386/avx512vl-vec-set-2.c: New test.
---
 gcc/config/i386/i386-expand.c                 | 106 ++++++++++++++++++
 gcc/config/i386/i386-protos.h                 |   1 +
 gcc/config/i386/predicates.md                 |   6 +
 gcc/config/i386/sse.md                        |   9 +-
 .../gcc.target/i386/avx2-vec-set-1.c          |  49 ++++++++
 .../gcc.target/i386/avx2-vec-set-2.c          |  50 +++++++++
 .../gcc.target/i386/avx512bw-vec-set-1.c      |  20 ++++
 .../gcc.target/i386/avx512bw-vec-set-2.c      |  44 ++++++++
 .../gcc.target/i386/avx512f-vec-set-2.c       |  42 +++++++
 .../gcc.target/i386/avx512vl-vec-set-2.c      |  55 +++++++++
 10 files changed, 379 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-vec-set-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-vec-set-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-vec-set-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-vec-set-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-vec-set-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-vec-set-2.c
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 044faf3423f..73e3358b290 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -14564,6 +14564,112 @@ ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
   ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
 }
 
+/* Set element IDX (a non-constant index) of TARGET to VAL.  Implemented as
+   V setg (V v, int idx, T val)
+   {
+     V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
+     V valv = (V){val, val, val, val, val, val, val, val};
+     V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
+     v = (v & ~mask) | (valv & mask);
+     return v;
+   }, shown here for an 8-element vector.  */
+void
+ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
+{
+  rtx vec[64];
+  machine_mode mode = GET_MODE (target);
+  machine_mode cmp_mode = mode;
+  int n_elts = GET_MODE_NUNITS (mode);
+  rtx valv,idxv,constv,idx_tmp;
+  bool ok = false;
+
+  /* 512-bit byte/word vector broadcast and comparison are only available
+     under TARGET_AVX512BW; without it, split the 512-bit vector into two
+     256-bit halves, apply the set to each half, and concatenate them.  */
+  if ((mode == V32HImode || mode == V64QImode) && !TARGET_AVX512BW)
+    {
+      gcc_assert (TARGET_AVX512F);
+      rtx vhi, vlo, idx_hi;
+      machine_mode half_mode;
+      rtx (*extract_hi)(rtx, rtx);
+      rtx (*extract_lo)(rtx, rtx);
+
+      if (mode == V32HImode)
+	{
+	  half_mode = V16HImode;
+	  extract_hi = gen_vec_extract_hi_v32hi;
+	  extract_lo = gen_vec_extract_lo_v32hi;
+	}
+      else
+	{
+	  half_mode = V32QImode;
+	  extract_hi = gen_vec_extract_hi_v64qi;
+	  extract_lo = gen_vec_extract_lo_v64qi;
+	}
+
+      vhi = gen_reg_rtx (half_mode);
+      vlo = gen_reg_rtx (half_mode);
+      idx_hi = gen_reg_rtx (GET_MODE (idx));
+      emit_insn (extract_hi (vhi, target));
+      emit_insn (extract_lo (vlo, target));
+      vec[0] = idx_hi;
+      vec[1] = idx;
+      vec[2] = GEN_INT (n_elts/2);
+      ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
+      ix86_expand_vector_set_var (vhi, val, idx_hi);
+      ix86_expand_vector_set_var (vlo, val, idx);
+      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
+      return;
+    }
+
+  if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
+    {
+      switch (mode)
+	{
+	case E_V2DFmode:
+	  cmp_mode = V2DImode;
+	  break;
+	case E_V4DFmode:
+	  cmp_mode = V4DImode;
+	  break;
+	case E_V8DFmode:
+	  cmp_mode = V8DImode;
+	  break;
+	case E_V4SFmode:
+	  cmp_mode = V4SImode;
+	  break;
+	case E_V8SFmode:
+	  cmp_mode = V8SImode;
+	  break;
+	case E_V16SFmode:
+	  cmp_mode = V16SImode;
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+    }
+
+  for (int i = 0; i != n_elts; i++)
+    vec[i] = GEN_INT (i);
+  constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
+  valv = gen_reg_rtx (mode);
+  idxv = gen_reg_rtx (cmp_mode);
+  idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
+
+  ok = ix86_expand_vector_init_duplicate (false, mode, valv, val);
+  gcc_assert (ok);
+  ok = ix86_expand_vector_init_duplicate (false, cmp_mode, idxv, idx_tmp);
+  gcc_assert (ok);
+  vec[0] = target;
+  vec[1] = valv;
+  vec[2] = target;
+  vec[3] = gen_rtx_EQ (mode, idxv, constv);
+  vec[4] = idxv;
+  vec[5] = constv;
+  ok = ix86_expand_int_vcond (vec);
+  gcc_assert (ok);
+}
+
 void
 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
 {
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index a3d9f9eaf14..65347a59b79 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -245,6 +245,7 @@ extern rtx ix86_rewrite_tls_address (rtx);
 
 extern void ix86_expand_vector_init (bool, rtx, rtx);
 extern void ix86_expand_vector_set (bool, rtx, rtx, int);
+extern void ix86_expand_vector_set_var (rtx, rtx, rtx);
 extern void ix86_expand_vector_extract (bool, rtx, rtx, int);
 extern void ix86_expand_reduc (rtx (*)(rtx, rtx, rtx), rtx, rtx);
 
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 36f9dfcc586..be5aaa4d76f 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1023,6 +1023,12 @@
   return op == const1_rtx || op == constm1_rtx;
 })
 
+;; True for const_int, or a register under TARGET_AVX2; used by vec_set<mode>.
+(define_predicate "vec_setm_operand"
+  (ior (and (match_operand 0 "register_operand")
+	    (match_test "TARGET_AVX2"))
+       (match_code "const_int")))
+
 ;; True for registers, or 1 or -1.  Used to optimize double-word shifts.
 (define_predicate "reg_or_pm1_operand"
   (ior (match_operand 0 "register_operand")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 8437ad27087..11936809561 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -8098,11 +8098,14 @@
 (define_expand "vec_set<mode>"
   [(match_operand:V 0 "register_operand")
    (match_operand:<ssescalarmode> 1 "register_operand")
-   (match_operand 2 "const_int_operand")]
+   (match_operand 2 "vec_setm_operand")]
   "TARGET_SSE"
 {
-  ix86_expand_vector_set (false, operands[0], operands[1],
-			  INTVAL (operands[2]));
+  if (CONST_INT_P (operands[2]))
+    ix86_expand_vector_set (false, operands[0], operands[1],
+			    INTVAL (operands[2]));
+  else
+    ix86_expand_vector_set_var (operands[0], operands[1], operands[2]);
   DONE;
 })
 
diff --git a/gcc/testsuite/gcc.target/i386/avx2-vec-set-1.c b/gcc/testsuite/gcc.target/i386/avx2-vec-set-1.c
new file mode 100644
index 00000000000..4c16ec5dfc4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-vec-set-1.c
@@ -0,0 +1,49 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2 -mno-avx512f" } */
+/* { dg-final { scan-assembler-times {(?n)vpcmpeq[bwdq]} 12 } } */
+/* { dg-final { scan-assembler-times {(?n)vp?blendv} 12 } } */
+
+typedef char v32qi __attribute__ ((vector_size (32)));
+typedef char v16qi __attribute__ ((vector_size (16)));
+
+typedef short v16hi __attribute__ ((vector_size (32)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+
+typedef int v8si __attribute__ ((vector_size (32)));
+typedef int v4si __attribute__ ((vector_size (16)));
+
+typedef long long v4di __attribute__ ((vector_size (32)));
+typedef long long v2di __attribute__ ((vector_size (16)));
+
+typedef float v8sf __attribute__ ((vector_size (32)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+
+typedef double v4df __attribute__ ((vector_size (32)));
+typedef double v2df __attribute__ ((vector_size (16)));
+/* Define foo_VTYPE: return A with element C (a variable index) set to B.  */
+#define FOO(VTYPE, TYPE)			\
+  VTYPE						\
+  __attribute__ ((noipa))			\
+  foo_##VTYPE (VTYPE a, TYPE b, unsigned int c)	\
+  {						\
+    a[c] = b;					\
+    return a;					\
+  }						\
+
+FOO (v16qi, char);
+FOO (v32qi, char);
+
+FOO (v8hi, short);
+FOO (v16hi, short);
+
+FOO (v4si, int);
+FOO (v8si, int);
+
+FOO (v2di, long long);
+FOO (v4di, long long);
+
+FOO (v4sf, float);
+FOO (v8sf, float);
+
+FOO (v2df, double);
+FOO (v4df, double);
diff --git a/gcc/testsuite/gcc.target/i386/avx2-vec-set-2.c b/gcc/testsuite/gcc.target/i386/avx2-vec-set-2.c
new file mode 100644
index 00000000000..9086ef406f1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-vec-set-2.c
@@ -0,0 +1,50 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx2 } */
+/* { dg-options "-O2 -mavx2" } */
+
+
+#ifndef CHECK
+#define CHECK "avx2-check.h"
+#endif
+
+#ifndef TEST
+#define TEST avx2_test
+#endif
+
+#include CHECK
+
+#include "avx2-vec-set-1.c"
+/* Set element IDX via foo_VTYPE and verify all N elements of the result.  */
+#define CALC_TEST(vtype, type, N, idx)				\
+do								\
+  {								\
+    int i,val = idx * idx - idx * 3 + 16;			\
+    type res[N],exp[N];						\
+    vtype resv;							\
+    for (i = 0; i < N; i++)					\
+      {								\
+	res[i] = i * i - i * 3 + 15;				\
+	exp[i] = res[i];					\
+      }								\
+    exp[idx] = val;						\
+    resv = foo_##vtype (*(vtype *)&res[0], val, idx);		\
+    for (i = 0; i < N; i++)					\
+      {								\
+	if (resv[i] != exp[i])					\
+	  abort ();						\
+      }								\
+  }								\
+while (0)
+
+static void
+TEST (void)
+{
+  CALC_TEST (v32qi, char, 32, 17);
+  CALC_TEST (v16qi, char, 16, 5);
+  CALC_TEST (v16hi, short, 16, 9);
+  CALC_TEST (v8hi, short, 8, 6);
+  CALC_TEST (v8si, int, 8, 3);
+  CALC_TEST (v4si, int, 4, 2);
+  CALC_TEST (v4di, long long, 4, 1);
+  CALC_TEST (v2di, long long, 2, 0);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-vec-set-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-vec-set-1.c
new file mode 100644
index 00000000000..5cfbc85732e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-vec-set-1.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -mavx512bw -O2" } */
+/* { dg-final { scan-assembler-times {(?n)(?:vp?broadcast|vmovddup)} 36 } } */
+/* { dg-final { scan-assembler-times {(?n)vpcmp[bwdq][ \t]+\$0} 18 } } */
+
+typedef char v64qi __attribute__ ((vector_size (64)));
+typedef short v32hi __attribute__ ((vector_size (64)));
+typedef int v16si __attribute__ ((vector_size (64)));
+typedef long long v8di __attribute__ ((vector_size (64)));
+typedef float v16sf __attribute__ ((vector_size (64)));
+typedef double v8df __attribute__ ((vector_size (64)));
+
+#include "avx2-vec-set-1.c"
+
+FOO (v64qi, char);
+FOO (v32hi, short);
+FOO (v16si, int);
+FOO (v8di, long long);
+FOO (v16sf, float);
+FOO (v8df, double);
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-vec-set-2.c b/gcc/testsuite/gcc.target/i386/avx512bw-vec-set-2.c
new file mode 100644
index 00000000000..22e64183ebd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-vec-set-2.c
@@ -0,0 +1,44 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512bw } */
+/* { dg-options "-O2 -mavx512bw" } */
+
+
+#ifndef CHECK
+#define CHECK "avx512f-check.h"
+#endif
+
+#define AVX512BW
+
+#include CHECK
+
+#include "avx512bw-vec-set-1.c"
+/* Set element IDX via foo_VTYPE and verify all N elements of the result.  */
+#define CALC_TEST(vtype, type, N, idx)				\
+do								\
+  {								\
+    int i,val = idx * idx - idx * 3 + 16;			\
+    type res[N],exp[N];						\
+    vtype resv;							\
+    for (i = 0; i < N; i++)					\
+      {								\
+	res[i] = i * i - i * 3 + 15;				\
+	exp[i] = res[i];					\
+      }								\
+    exp[idx] = val;						\
+    resv = foo_##vtype (*(vtype *)&res[0], val, idx);		\
+    for (i = 0; i < N; i++)					\
+      {								\
+	if (resv[i] != exp[i])					\
+	  abort ();						\
+      }								\
+  }								\
+while (0)
+
+static void
+test_512 (void)
+{
+  CALC_TEST (v64qi, char, 64, 50);
+  CALC_TEST (v32hi, short, 32, 30);
+  CALC_TEST (v16si, int, 16, 15);
+  CALC_TEST (v8di, long long, 8, 7);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vec-set-2.c b/gcc/testsuite/gcc.target/i386/avx512f-vec-set-2.c
new file mode 100644
index 00000000000..8f2aa03ec11
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512f-vec-set-2.c
@@ -0,0 +1,42 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-options "-O2 -mavx512f -mno-avx512bw" } */
+
+
+#ifndef CHECK
+#define CHECK "avx512f-check.h"
+#endif
+
+#define AVX512F
+
+#include CHECK
+
+#include "avx512bw-vec-set-1.c"
+/* Set element IDX via foo_VTYPE and verify all N elements of the result.  */
+#define CALC_TEST(vtype, type, N, idx)				\
+do								\
+  {								\
+    int i,val = idx * idx - idx * 3 + 16;			\
+    type res[N],exp[N];						\
+    vtype resv;							\
+    for (i = 0; i < N; i++)					\
+      {								\
+	res[i] = i * i - i * 3 + 15;				\
+	exp[i] = res[i];					\
+      }								\
+    exp[idx] = val;						\
+    resv = foo_##vtype (*(vtype *)&res[0], val, idx);		\
+    for (i = 0; i < N; i++)					\
+      {								\
+	if (resv[i] != exp[i])					\
+	  abort ();						\
+      }								\
+  }								\
+while (0)
+
+static void
+test_512 (void)
+{
+  CALC_TEST (v64qi, char, 64, 50);
+  CALC_TEST (v32hi, short, 32, 30);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vec-set-2.c b/gcc/testsuite/gcc.target/i386/avx512vl-vec-set-2.c
new file mode 100644
index 00000000000..4f327427a64
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-vec-set-2.c
@@ -0,0 +1,55 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512bw } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-options "-O2 -mavx512bw -mavx512vl" } */
+
+
+#ifndef CHECK
+#define CHECK "avx512f-check.h"
+#endif
+
+#define AVX512VL
+#define AVX512BW
+
+#include CHECK
+
+#include "avx512bw-vec-set-1.c"
+/* Set element IDX via foo_VTYPE and verify all N elements of the result.  */
+#define CALC_TEST(vtype, type, N, idx)				\
+do								\
+  {								\
+    int i,val = idx * idx - idx * 3 + 16;			\
+    type res[N],exp[N];						\
+    vtype resv;							\
+    for (i = 0; i < N; i++)					\
+      {								\
+	res[i] = i * i - i * 3 + 15;				\
+	exp[i] = res[i];					\
+      }								\
+    exp[idx] = val;						\
+    resv = foo_##vtype (*(vtype *)&res[0], val, idx);		\
+    for (i = 0; i < N; i++)					\
+      {								\
+	if (resv[i] != exp[i])					\
+	  abort ();						\
+      }								\
+  }								\
+while (0)
+
+static void
+test_256 (void)
+{
+  CALC_TEST (v32qi, char, 32, 17);
+  CALC_TEST (v16hi, short, 16, 9);
+  CALC_TEST (v8si, int, 8, 3);
+  CALC_TEST (v4di, long long, 4, 1);
+}
+
+static void
+test_128 (void)
+{
+  CALC_TEST (v16qi, char, 16, 5);
+  CALC_TEST (v8hi, short, 8, 6);
+  CALC_TEST (v4si, int, 4, 2);
+  CALC_TEST (v2di, long long, 2, 0);
+}
-- 
2.30.2