From: Jakub Jelinek <jakub@redhat.com>
Date: Wed, 13 Jan 2021 10:28:48 +0000 (+0100)
Subject: i386, expand: Optimize also 256-bit and 512-bit permutatations as vpmovzx if possible... 
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=b1d1e2b54c6b9cf13f021176ba37d24cc4dc2fe1;p=gcc.git

i386, expand: Optimize also 256-bit and 512-bit permutatations as vpmovzx if possible [PR95905]

The following patch implements what I've talked about, i.e. to no longer
force operands of vec_perm_const into registers in the generic code, but let
each of the (currently 8) targets force it into registers individually,
giving the targets better control on if it does that and when and allowing
them to do something special with some particular operands.
And then defines the define_insn_and_split for the 256-bit and 512-bit
permutations into vpmovzx* (only the bw, wd and dq cases, in theory we could
add define_insn_and_split patterns also for the bd, bq and wq).

2021-01-13  Jakub Jelinek  <jakub@redhat.com>

	PR target/95905
	* optabs.c (expand_vec_perm_const): Don't force v0 and v1 into
	registers before calling targetm.vectorize.vec_perm_const, only after
	that.
	* config/i386/i386-expand.c (ix86_vectorize_vec_perm_const): Handle
	two argument permutation when one operand is zero vector and only
	after that force operands into registers.
	* config/i386/sse.md (*avx2_zero_extendv16qiv16hi2_1): New
	define_insn_and_split pattern.
	(*avx512bw_zero_extendv32qiv32hi2_1): Likewise.
	(*avx512f_zero_extendv16hiv16si2_1): Likewise.
	(*avx2_zero_extendv8hiv8si2_1): Likewise.
	(*avx512f_zero_extendv8siv8di2_1): Likewise.
	(*avx2_zero_extendv4siv4di2_1): Likewise.
	* config/mips/mips.c (mips_vectorize_vec_perm_const): Force operands
	into registers.
	* config/arm/arm.c (arm_vectorize_vec_perm_const): Likewise.
	* config/sparc/sparc.c (sparc_vectorize_vec_perm_const): Likewise.
	* config/ia64/ia64.c (ia64_vectorize_vec_perm_const): Likewise.
	* config/aarch64/aarch64.c (aarch64_vectorize_vec_perm_const): Likewise.
	* config/rs6000/rs6000.c (rs6000_vectorize_vec_perm_const): Likewise.
	* config/gcn/gcn.c (gcn_vectorize_vec_perm_const): Likewise.  Use std::swap.

	* gcc.target/i386/pr95905-2.c: Use scan-assembler-times instead of
	scan-assembler.  Add tests with zero vector as first __builtin_shuffle
	operand.
	* gcc.target/i386/pr95905-3.c: New test.
	* gcc.target/i386/pr95905-4.c: New test.
---

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 7536b75003b..c19dc6cd895 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -21084,8 +21084,11 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
   d.vmode = vmode;
   d.vec_flags = aarch64_classify_vector_mode (d.vmode);
   d.target = target;
-  d.op0 = op0;
-  d.op1 = op1;
+  d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
+  if (op0 == op1)
+    d.op1 = d.op0;
+  else
+    d.op1 = op1 ? force_reg (vmode, op1) : NULL_RTX;
   d.testing_p = !target;
 
   if (!d.testing_p)
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index c8e25714386..bebccc13456 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -31482,6 +31482,15 @@ arm_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, rtx op1,
     return false;
 
   d.target = target;
+  if (op0)
+    {
+      rtx nop0 = force_reg (vmode, op0);
+      if (op0 == op1)
+        op1 = nop0;
+      op0 = nop0;
+    }
+  if (op1)
+    op1 = force_reg (vmode, op1);
   d.op0 = op0;
   d.op1 = op1;
 
diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c
index b08f4b32c9c..3b1762efd3a 100644
--- a/gcc/config/gcn/gcn.c
+++ b/gcc/config/gcn/gcn.c
@@ -3982,13 +3982,14 @@ gcn_vectorize_vec_perm_const (machine_mode vmode, rtx dst,
   for (unsigned int i = 0; i < nelt; ++i)
     perm[i] = sel[i] & (2 * nelt - 1);
 
+  src0 = force_reg (vmode, src0);
+  src1 = force_reg (vmode, src1);
+
   /* Make life a bit easier by swapping operands if necessary so that
      the first element always comes from src0.  */
   if (perm[0] >= nelt)
     {
-      rtx temp = src0;
-      src0 = src1;
-      src1 = temp;
+      std::swap (src0, src1);
 
       for (unsigned int i = 0; i < nelt; ++i)
 	if (perm[i] < nelt)
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index d793e5a5bce..280645f60d5 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -19929,6 +19929,32 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
 
   two_args = canonicalize_perm (&d);
 
+  /* If one of the operands is a zero vector, try to match pmovzx.  */
+  if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
+    {
+      struct expand_vec_perm_d dzero = d;
+      if (d.op0 == CONST0_RTX (vmode))
+	{
+	  d.op1 = dzero.op1 = force_reg (vmode, d.op1);
+	  std::swap (dzero.op0, dzero.op1);
+	  for (i = 0; i < nelt; ++i)
+	    dzero.perm[i] ^= nelt;
+	}
+      else
+	d.op0 = dzero.op0 = force_reg (vmode, d.op0);
+
+      if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
+				  dzero.perm, nelt, dzero.testing_p))
+	return true;
+    }
+
+  /* Force operands into registers.  */
+  rtx nop0 = force_reg (vmode, d.op0);
+  if (d.op0 == d.op1)
+    d.op1 = nop0;
+  d.op0 = nop0;
+  d.op1 = force_reg (vmode, d.op1);
+
   if (ix86_expand_vec_perm_const_1 (&d))
     return true;
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 2a260c1cfbd..7f03fc491c3 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -17611,6 +17611,23 @@
    (set_attr "prefix" "maybe_evex")
    (set_attr "mode" "OI")])
 
+(define_insn_and_split "*avx2_zero_extendv16qiv16hi2_1"
+  [(set (match_operand:V32QI 0 "register_operand" "=v")
+	(vec_select:V32QI
+	  (vec_concat:V64QI
+	    (match_operand:V32QI 1 "nonimmediate_operand" "vm")
+	    (match_operand:V32QI 2 "const0_operand" "C"))
+	  (match_parallel 3 "pmovzx_parallel"
+	    [(match_operand 4 "const_int_operand" "n")])))]
+  "TARGET_AVX2"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V16HI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V16HImode, operands[0], V32QImode);
+  operands[1] = lowpart_subreg (V16QImode, operands[1], V32QImode);
+})
+
 (define_expand "<insn>v16qiv16hi2"
   [(set (match_operand:V16HI 0 "register_operand")
 	(any_extend:V16HI
@@ -17628,6 +17645,23 @@
    (set_attr "prefix" "evex")
    (set_attr "mode" "XI")])
 
+(define_insn_and_split "*avx512bw_zero_extendv32qiv32hi2_1"
+  [(set (match_operand:V64QI 0 "register_operand" "=v")
+	(vec_select:V64QI
+	  (vec_concat:V128QI
+	    (match_operand:V64QI 1 "nonimmediate_operand" "vm")
+	    (match_operand:V64QI 2 "const0_operand" "C"))
+	  (match_parallel 3 "pmovzx_parallel"
+	    [(match_operand 4 "const_int_operand" "n")])))]
+  "TARGET_AVX512BW"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V32HI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V32HImode, operands[0], V64QImode);
+  operands[1] = lowpart_subreg (V32QImode, operands[1], V64QImode);
+})
+
 (define_expand "<insn>v32qiv32hi2"
   [(set (match_operand:V32HI 0 "register_operand")
 	(any_extend:V32HI
@@ -17883,6 +17917,23 @@
 	  (match_operand:V16HI 1 "nonimmediate_operand")))]
   "TARGET_AVX512F")
 
+(define_insn_and_split "avx512f_zero_extendv16hiv16si2_1"
+  [(set (match_operand:V32HI 0 "register_operand" "=v")
+	(vec_select:V32HI
+	  (vec_concat:V64HI
+	    (match_operand:V32HI 1 "nonimmediate_operand" "vm")
+	    (match_operand:V32HI 2 "const0_operand" "C"))
+	  (match_parallel 3 "pmovzx_parallel"
+	    [(match_operand 4 "const_int_operand" "n")])))]
+  "TARGET_AVX512F"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V16SI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V16SImode, operands[0], V32HImode);
+  operands[1] = lowpart_subreg (V16HImode, operands[1], V32HImode);
+})
+
 (define_insn "avx2_<code>v8hiv8si2<mask_name>"
   [(set (match_operand:V8SI 0 "register_operand" "=v")
 	(any_extend:V8SI
@@ -17900,6 +17951,23 @@
 	  (match_operand:V8HI 1 "nonimmediate_operand")))]
   "TARGET_AVX2")
 
+(define_insn_and_split "avx2_zero_extendv8hiv8si2_1"
+  [(set (match_operand:V16HI 0 "register_operand" "=v")
+	(vec_select:V16HI
+	  (vec_concat:V32HI
+	    (match_operand:V16HI 1 "nonimmediate_operand" "vm")
+	    (match_operand:V16HI 2 "const0_operand" "C"))
+	  (match_parallel 3 "pmovzx_parallel"
+	    [(match_operand 4 "const_int_operand" "n")])))]
+  "TARGET_AVX2"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V8SI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V8SImode, operands[0], V16HImode);
+  operands[1] = lowpart_subreg (V8HImode, operands[1], V16HImode);
+})
+
 (define_insn "sse4_1_<code>v4hiv4si2<mask_name>"
   [(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v")
 	(any_extend:V4SI
@@ -18275,6 +18343,23 @@
    (set_attr "prefix" "evex")
    (set_attr "mode" "XI")])
 
+(define_insn_and_split "*avx512f_zero_extendv8siv8di2_1"
+  [(set (match_operand:V16SI 0 "register_operand" "=v")
+	(vec_select:V16SI
+	  (vec_concat:V32SI
+	    (match_operand:V16SI 1 "nonimmediate_operand" "vm")
+	    (match_operand:V16SI 2 "const0_operand" "C"))
+	  (match_parallel 3 "pmovzx_parallel"
+	    [(match_operand 4 "const_int_operand" "n")])))]
+  "TARGET_AVX512F"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V8DI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V8DImode, operands[0], V16SImode);
+  operands[1] = lowpart_subreg (V8SImode, operands[1], V16SImode);
+})
+
 (define_expand "<insn>v8siv8di2"
   [(set (match_operand:V8DI 0 "register_operand" "=v")
 	(any_extend:V8DI
@@ -18292,6 +18377,23 @@
    (set_attr "prefix_extra" "1")
    (set_attr "mode" "OI")])
 
+(define_insn_and_split "*avx2_zero_extendv4siv4di2_1"
+  [(set (match_operand:V8SI 0 "register_operand" "=v")
+	(vec_select:V8SI
+	  (vec_concat:V16SI
+	    (match_operand:V8SI 1 "nonimmediate_operand" "vm")
+	    (match_operand:V8SI 2 "const0_operand" "C"))
+	  (match_parallel 3 "pmovzx_parallel"
+	    [(match_operand 4 "const_int_operand" "n")])))]
+  "TARGET_AVX2"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (zero_extend:V4DI (match_dup 1)))]
+{
+  operands[0] = lowpart_subreg (V4DImode, operands[0], V8SImode);
+  operands[1] = lowpart_subreg (V4SImode, operands[1], V8SImode);
+})
+
 (define_expand "<insn>v4siv4di2"
   [(set (match_operand:V4DI 0 "register_operand" "=v")
 	(any_extend:V4DI
diff --git a/gcc/config/ia64/ia64.c b/gcc/config/ia64/ia64.c
index 8ddaccef75e..f1a6de14cdd 100644
--- a/gcc/config/ia64/ia64.c
+++ b/gcc/config/ia64/ia64.c
@@ -11759,6 +11759,15 @@ ia64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
   unsigned int i, nelt, which;
 
   d.target = target;
+  if (op0)
+    {
+      rtx nop0 = force_reg (vmode, op0);
+      if (op0 == op1)
+        op1 = nop0;
+      op0 = nop0;
+    }
+  if (op1)
+    op1 = force_reg (vmode, op1);
   d.op0 = op0;
   d.op1 = op1;
 
diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c
index 8556ebba320..ebb04b72b2b 100644
--- a/gcc/config/mips/mips.c
+++ b/gcc/config/mips/mips.c
@@ -21624,6 +21624,15 @@ mips_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
   bool ok;
 
   d.target = target;
+  if (op0)
+    {
+      rtx nop0 = force_reg (vmode, op0);
+      if (op0 == op1)
+        op1 = nop0;
+      op0 = nop0;
+    }
+  if (op1)
+    op1 = force_reg (vmode, op1);
   d.op0 = op0;
   d.op1 = op1;
 
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 67681d18150..b9e90ae0468 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -22946,6 +22946,16 @@ rs6000_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
   if (TARGET_ALTIVEC && testing_p)
     return true;
 
+  if (op0)
+    {
+      rtx nop0 = force_reg (vmode, op0);
+      if (op0 == op1)
+        op1 = nop0;
+      op0 = nop0;
+    }
+  if (op1)
+    op1 = force_reg (vmode, op1);
+
   /* Check for ps_merge* or xxpermdi insns.  */
   if ((vmode == V2DFmode || vmode == V2DImode) && VECTOR_MEM_VSX_P (vmode))
     {
diff --git a/gcc/config/sparc/sparc.c b/gcc/config/sparc/sparc.c
index 8a5a269e621..f3557936114 100644
--- a/gcc/config/sparc/sparc.c
+++ b/gcc/config/sparc/sparc.c
@@ -12942,6 +12942,12 @@ sparc_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
   if (vmode != V8QImode)
     return false;
 
+  rtx nop0 = force_reg (vmode, op0);
+  if (op0 == op1)
+    op1 = nop0;
+  op0 = nop0;
+  op1 = force_reg (vmode, op1);
+
   unsigned int i, mask;
   for (i = mask = 0; i < 8; ++i)
     mask |= (sel[i] & 0xf) << (28 - i*4);
diff --git a/gcc/optabs.c b/gcc/optabs.c
index 6f671fd5c66..f4614a39458 100644
--- a/gcc/optabs.c
+++ b/gcc/optabs.c
@@ -6070,11 +6070,8 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
 
   if (targetm.vectorize.vec_perm_const != NULL)
     {
-      v0 = force_reg (mode, v0);
       if (single_arg_p)
 	v1 = v0;
-      else
-	v1 = force_reg (mode, v1);
 
       if (targetm.vectorize.vec_perm_const (mode, target, v0, v1, indices))
 	return target;
@@ -6095,6 +6092,11 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1,
 	return gen_lowpart (mode, target_qi);
     }
 
+  v0 = force_reg (mode, v0);
+  if (single_arg_p)
+    v1 = v0;
+  v1 = force_reg (mode, v1);
+
   /* Otherwise expand as a fully variable permuation.  */
 
   /* The optabs are only defined for selectors with the same width
diff --git a/gcc/testsuite/gcc.target/i386/pr95905-2.c b/gcc/testsuite/gcc.target/i386/pr95905-2.c
index 7cd20a3654a..231335c4dad 100644
--- a/gcc/testsuite/gcc.target/i386/pr95905-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr95905-2.c
@@ -1,9 +1,9 @@
 /* PR target/95905 */
 /* { dg-do compile } */
 /* { dg-options "-O2 -msse4.1" } */
-/* { dg-final { scan-assembler "\tv?pmovzxbw\t" } } */
-/* { dg-final { scan-assembler "\tv?pmovzxwd\t" } } */
-/* { dg-final { scan-assembler "\tv?pmovzxdq\t" } } */
+/* { dg-final { scan-assembler-times "\tv?pmovzxbw\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tv?pmovzxwd\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tv?pmovzxdq\t" 4 } } */
 
 typedef unsigned char V1 __attribute__((vector_size (16)));
 typedef unsigned short V2 __attribute__((vector_size (16)));
@@ -44,3 +44,39 @@ f6 (V3 *x)
 {
   return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 4, 1, 5 });
 }
+
+V1
+f7 (V1 x)
+{
+  return __builtin_shuffle ((V1) {}, x, (V1) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}
+
+V2
+f8 (V2 x)
+{
+  return __builtin_shuffle ((V2) {}, x, (V2) { 8, 0, 9, 1, 10, 2, 11, 3 });
+}
+
+V3
+f9 (V3 x)
+{
+  return __builtin_shuffle ((V3) {}, x, (V3) { 4, 0, 5, 1 });
+}
+
+V1
+f10 (V1 *x)
+{
+  return __builtin_shuffle ((V1) {}, *x, (V1) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}
+
+V2
+f11 (V2 *x)
+{
+  return __builtin_shuffle ((V2) {}, *x, (V2) { 8, 0, 9, 1, 10, 2, 11, 3 });
+}
+
+V3
+f12 (V3 *x)
+{
+  return __builtin_shuffle ((V3) {}, *x, (V3) { 4, 0, 5, 1 });
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr95905-3.c b/gcc/testsuite/gcc.target/i386/pr95905-3.c
new file mode 100644
index 00000000000..b7b4bc5c4f2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95905-3.c
@@ -0,0 +1,82 @@
+/* PR target/95905 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2" } */
+/* { dg-final { scan-assembler-times "\tvpmovzxbw\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tvpmovzxwd\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tvpmovzxdq\t" 4 } } */
+
+typedef unsigned char V1 __attribute__((vector_size (32)));
+typedef unsigned short V2 __attribute__((vector_size (32)));
+typedef unsigned int V3 __attribute__((vector_size (32)));
+
+V1
+f1 (V1 x)
+{
+  return __builtin_shuffle (x, (V1) {}, (V1) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
+}
+
+V2
+f2 (V2 x)
+{
+  return __builtin_shuffle (x, (V2) {}, (V2) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+}
+
+V3
+f3 (V3 x)
+{
+  return __builtin_shuffle (x, (V3) {}, (V3) { 0, 8, 1, 9, 2, 10, 3, 11 });
+}
+
+V1
+f4 (V1 *x)
+{
+  return __builtin_shuffle (*x, (V1) {}, (V1) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
+}
+
+V2
+f5 (V2 *x)
+{
+  return __builtin_shuffle (*x, (V2) {}, (V2) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+}
+
+V3
+f6 (V3 *x)
+{
+  return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 8, 1, 9, 2, 10, 3, 11 });
+}
+
+V1
+f7 (V1 x)
+{
+  return __builtin_shuffle ((V1) {}, x, (V1) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
+}
+
+V2
+f8 (V2 x)
+{
+  return __builtin_shuffle ((V2) {}, x, (V2) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}
+
+V3
+f9 (V3 x)
+{
+  return __builtin_shuffle ((V3) {}, x, (V3) { 8, 0, 9, 1, 10, 2, 11, 3 });
+}
+
+V1
+f10 (V1 *x)
+{
+  return __builtin_shuffle ((V1) {}, *x, (V1) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
+}
+
+V2
+f11 (V2 *x)
+{
+  return __builtin_shuffle ((V2) {}, *x, (V2) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}
+
+V3
+f12 (V3 *x)
+{
+  return __builtin_shuffle ((V3) {}, *x, (V3) { 8, 0, 9, 1, 10, 2, 11, 3 });
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr95905-4.c b/gcc/testsuite/gcc.target/i386/pr95905-4.c
new file mode 100644
index 00000000000..43cdf7f7a78
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95905-4.c
@@ -0,0 +1,82 @@
+/* PR target/95905 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw" } */
+/* { dg-final { scan-assembler-times "\tvpmovzxbw\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tvpmovzxwd\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tvpmovzxdq\t" 4 } } */
+
+typedef unsigned char V1 __attribute__((vector_size (64)));
+typedef unsigned short V2 __attribute__((vector_size (64)));
+typedef unsigned int V3 __attribute__((vector_size (64)));
+
+V1
+f1 (V1 x)
+{
+  return __builtin_shuffle (x, (V1) {}, (V1) { 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71, 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95 });
+}
+
+V2
+f2 (V2 x)
+{
+  return __builtin_shuffle (x, (V2) {}, (V2) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
+}
+
+V3
+f3 (V3 x)
+{
+  return __builtin_shuffle (x, (V3) {}, (V3) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+}
+
+V1
+f4 (V1 *x)
+{
+  return __builtin_shuffle (*x, (V1) {}, (V1) { 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71, 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95 });
+}
+
+V2
+f5 (V2 *x)
+{
+  return __builtin_shuffle (*x, (V2) {}, (V2) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
+}
+
+V3
+f6 (V3 *x)
+{
+  return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+}
+
+V1
+f7 (V1 x)
+{
+  return __builtin_shuffle ((V1) {}, x, (V1) { 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31 });
+}
+
+V2
+f8 (V2 x)
+{
+  return __builtin_shuffle ((V2) {}, x, (V2) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
+}
+
+V3
+f9 (V3 x)
+{
+  return __builtin_shuffle ((V3) {}, x, (V3) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}
+
+V1
+f10 (V1 *x)
+{
+  return __builtin_shuffle ((V1) {}, *x, (V1) { 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31 });
+}
+
+V2
+f11 (V2 *x)
+{
+  return __builtin_shuffle ((V2) {}, *x, (V2) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
+}
+
+V3
+f12 (V3 *x)
+{
+  return __builtin_shuffle ((V3) {}, *x, (V3) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}