Fix incorrect replacement of vmovdqu32 with vpblendd which can cause fault.
author liuhongt <hongtao.liu@intel.com>
Tue, 3 Nov 2020 09:26:43 +0000 (17:26 +0800)
committer liuhongt <hongtao.liu@intel.com>
Thu, 3 Dec 2020 05:34:05 +0000 (13:34 +0800)
gcc/ChangeLog:

PR target/97642
* config/i386/i386-expand.c
(ix86_expand_special_args_builtin): Don't move all-ones mask
operands into register.
* config/i386/sse.md (UNSPEC_MASKLOAD): New unspec.
(*<avx512>_load<mode>_mask): New define_insns for masked load
instructions.
(<avx512>_load<mode>_mask): Changed to define_expands which
specifically handle memory or all-ones mask operands.
(<avx512>_blendm<mode>): Changed to define_insns which are same
as original <avx512>_load<mode>_mask with adjustment of
operands order.
(*<avx512>_load<mode>): New define_insn_and_split which is
used to optimize masked loads with an all-ones mask.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512bw-vmovdqu16-1.c: Adjust testcase to
make sure only masked load instruction is generated.
* gcc.target/i386/avx512bw-vmovdqu8-1.c: Ditto.
* gcc.target/i386/avx512f-vmovapd-1.c: Ditto.
* gcc.target/i386/avx512f-vmovaps-1.c: Ditto.
* gcc.target/i386/avx512f-vmovdqa32-1.c: Ditto.
* gcc.target/i386/avx512f-vmovdqa64-1.c: Ditto.
* gcc.target/i386/avx512vl-vmovapd-1.c: Ditto.
* gcc.target/i386/avx512vl-vmovaps-1.c: Ditto.
* gcc.target/i386/avx512vl-vmovdqa32-1.c: Ditto.
* gcc.target/i386/avx512vl-vmovdqa64-1.c: Ditto.
* gcc.target/i386/pr97642-1.c: New test.
* gcc.target/i386/pr97642-2.c: New test.

14 files changed:
gcc/config/i386/i386-expand.c
gcc/config/i386/sse.md
gcc/testsuite/gcc.target/i386/avx512bw-vmovdqu16-1.c
gcc/testsuite/gcc.target/i386/avx512bw-vmovdqu8-1.c
gcc/testsuite/gcc.target/i386/avx512f-vmovapd-1.c
gcc/testsuite/gcc.target/i386/avx512f-vmovaps-1.c
gcc/testsuite/gcc.target/i386/avx512f-vmovdqa32-1.c
gcc/testsuite/gcc.target/i386/avx512f-vmovdqa64-1.c
gcc/testsuite/gcc.target/i386/avx512vl-vmovapd-1.c
gcc/testsuite/gcc.target/i386/avx512vl-vmovaps-1.c
gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa32-1.c
gcc/testsuite/gcc.target/i386/avx512vl-vmovdqa64-1.c
gcc/testsuite/gcc.target/i386/pr97642-1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/pr97642-2.c [new file with mode: 0644]

index bf775a38fc3c2deeaa86f6779f949a5e94b64dba..7c31cc7daac41c78c2403eddfe7a4d85aafa47ce 100644 (file)
@@ -10832,7 +10832,13 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
 
          op = fixup_modeless_constant (op, mode);
 
-         if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
+         /* NB: a 3-operand load implies it's a mask load,
+            and the mask operand should be at the end.
+            Keep an all-ones mask, which will be simplified by the expander.  */
+         if (nargs == 3 && i == 2 && klass == load
+             && constm1_operand (op, mode))
+           ;
+         else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
            op = copy_to_mode_reg (mode, op);
          else
            {
index 559f8426283ac1b9321dc48018b17b6e6ce64171..78f73676e880075d0ebb98e749eea5acf1f704a5 100644 (file)
   UNSPEC_MASKOP
   UNSPEC_KORTEST
   UNSPEC_KTEST
+  ;; Mask load
+  UNSPEC_MASKLOAD
 
   ;; For embed. rounding feature
   UNSPEC_EMBEDDED_ROUNDING
              ]
              (symbol_ref "true")))])
 
-(define_insn "<avx512>_load<mode>_mask"
-  [(set (match_operand:V48_AVX512VL 0 "register_operand" "=v,v")
+;; If mem_addr points to a memory region with fewer than a whole vector's worth
+;; of accessible bytes and k is a mask that would prevent reading the inaccessible
+;; bytes from mem_addr, wrap the load in UNSPEC_MASKLOAD so that it is not
+;; transformed into vpblendd, which can fault.  See PR97642.
+(define_expand "<avx512>_load<mode>_mask"
+  [(set (match_operand:V48_AVX512VL 0 "register_operand")
        (vec_merge:V48_AVX512VL
-         (match_operand:V48_AVX512VL 1 "nonimmediate_operand" "vm,vm")
-         (match_operand:V48_AVX512VL 2 "nonimm_or_0_operand" "0C,v")
-         (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk,Yk")))]
+         (match_operand:V48_AVX512VL 1 "nonimmediate_operand")
+         (match_operand:V48_AVX512VL 2 "nonimm_or_0_operand")
+         (match_operand:<avx512fmaskmode> 3 "register_or_constm1_operand")))]
   "TARGET_AVX512F"
 {
-  if (REG_P (operands[2])
-     && REGNO (operands[2]) != REGNO (operands[0]))
-    return "v<sseintprefix>blendm<ssemodesuffix>\t{%1, %2, %0%{%3%}|%0%{%3%}, %2, %1}";
+  if (CONST_INT_P (operands[3]))
+    {
+      emit_insn (gen_rtx_SET (operands[0], operands[1]));
+      DONE;
+    }
+  else if (MEM_P (operands[1]))
+    operands[1] = gen_rtx_UNSPEC (<MODE>mode,
+                                gen_rtvec(1, operands[1]),
+                                UNSPEC_MASKLOAD);
+})
 
+(define_insn "*<avx512>_load<mode>_mask"
+  [(set (match_operand:V48_AVX512VL 0 "register_operand" "=v")
+       (vec_merge:V48_AVX512VL
+         (unspec:V48_AVX512VL
+           [(match_operand:V48_AVX512VL 1 "memory_operand" "m")]
+           UNSPEC_MASKLOAD)
+         (match_operand:V48_AVX512VL 2 "nonimm_or_0_operand" "0C")
+         (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk")))]
+  "TARGET_AVX512F"
+{
   if (FLOAT_MODE_P (GET_MODE_INNER (<MODE>mode)))
     {
       if (misaligned_operand (operands[1], <MODE>mode))
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn "<avx512>_load<mode>_mask"
-  [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v,v")
+(define_insn_and_split "*<avx512>_load<mode>"
+  [(set (match_operand:V48_AVX512VL 0 "register_operand")
+       (unspec:V48_AVX512VL
+         [(match_operand:V48_AVX512VL 1 "memory_operand")]
+         UNSPEC_MASKLOAD))]
+  "TARGET_AVX512F"
+  "#"
+  "&& 1"
+  [(set (match_dup 0) (match_dup 1))])
+
+(define_expand "<avx512>_load<mode>_mask"
+  [(set (match_operand:VI12_AVX512VL 0 "register_operand")
        (vec_merge:VI12_AVX512VL
-         (match_operand:VI12_AVX512VL 1 "nonimmediate_operand" "vm,vm")
-         (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand" "0C,v")
-         (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk,Yk")))]
+         (match_operand:VI12_AVX512VL 1 "nonimmediate_operand")
+         (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand")
+         (match_operand:<avx512fmaskmode> 3 "register_or_constm1_operand")))]
   "TARGET_AVX512BW"
-  "@
-    vmovdqu<ssescalarsize>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}
-    vpblendm<ssemodesuffix>\t{%1, %2, %0%{%3%}|%0%{%3%}, %2, %1}"
+{
+  if (CONST_INT_P (operands[3]))
+    {
+      emit_insn (gen_rtx_SET (operands[0], operands[1]));
+      DONE;
+    }
+  else if (MEM_P (operands[1]))
+    operands[1] = gen_rtx_UNSPEC (<MODE>mode,
+                                gen_rtvec(1, operands[1]),
+                                UNSPEC_MASKLOAD);
+
+})
+
+(define_insn "*<avx512>_load<mode>_mask"
+  [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
+       (vec_merge:VI12_AVX512VL
+         (unspec:VI12_AVX512VL
+           [(match_operand:VI12_AVX512VL 1 "memory_operand" "m")]
+           UNSPEC_MASKLOAD)
+         (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand" "0C")
+         (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk")))]
+  "TARGET_AVX512BW"
+  "vmovdqu<ssescalarsize>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"
   [(set_attr "type" "ssemov")
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn_and_split "*<avx512>_load<mode>"
+  [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
+       (unspec:VI12_AVX512VL
+         [(match_operand:VI12_AVX512VL 1 "memory_operand" "m")]
+         UNSPEC_MASKLOAD))]
+  "TARGET_AVX512BW"
+  "#"
+  "&& 1"
+  [(set (match_dup 0) (match_dup 1))])
+
 (define_insn "avx512f_mov<ssescalarmodelower>_mask"
   [(set (match_operand:VF_128 0 "register_operand" "=v")
        (vec_merge:VF_128
    (set_attr "memory" "store")
    (set_attr "mode" "<MODE>")])
 
-(define_expand "<avx512>_blendm<mode>"
-  [(set (match_operand:V48_AVX512VL 0 "register_operand" "=v")
+(define_insn "<avx512>_blendm<mode>"
+  [(set (match_operand:V48_AVX512VL 0 "register_operand" "=v,v")
        (vec_merge:V48_AVX512VL
-         (match_operand:V48_AVX512VL 2 "nonimmediate_operand" "vm")
-         (match_operand:V48_AVX512VL 1 "register_operand" "v")
-         (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk")))]
-  "TARGET_AVX512F")
+         (match_operand:V48_AVX512VL 2 "nonimmediate_operand" "vm,vm")
+         (match_operand:V48_AVX512VL 1 "nonimm_or_0_operand" "0C,v")
+         (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk,Yk")))]
+  "TARGET_AVX512F"
+{
+  if (REG_P (operands[1])
+     && REGNO (operands[1]) != REGNO (operands[0]))
+    return "v<sseintprefix>blendm<ssemodesuffix>\t{%2, %1, %0%{%3%}|%0%{%3%}, %1, %2}";
 
-(define_expand "<avx512>_blendm<mode>"
-  [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
+  if (FLOAT_MODE_P (GET_MODE_INNER (<MODE>mode)))
+    {
+      if (misaligned_operand (operands[2], <MODE>mode))
+       return "vmovu<ssemodesuffix>\t{%2, %0%{%3%}%N1|%0%{%3%}%N1, %2}";
+      else
+       return "vmova<ssemodesuffix>\t{%2, %0%{%3%}%N1|%0%{%3%}%N1, %2}";
+    }
+  else
+    {
+      if (misaligned_operand (operands[2], <MODE>mode))
+       return "vmovdqu<ssescalarsize>\t{%2, %0%{%3%}%N1|%0%{%3%}%N1, %2}";
+      else
+       return "vmovdqa<ssescalarsize>\t{%2, %0%{%3%}%N1|%0%{%3%}%N1, %2}";
+    }
+}
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<avx512>_blendm<mode>"
+  [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v,v")
        (vec_merge:VI12_AVX512VL
-         (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
-         (match_operand:VI12_AVX512VL 1 "register_operand" "v")
-         (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk")))]
-  "TARGET_AVX512BW")
+         (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm,vm")
+         (match_operand:VI12_AVX512VL 1 "nonimm_or_0_operand" "0C,v")
+         (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk,Yk")))]
+  "TARGET_AVX512BW"
+  "@
+    vmovdqu<ssescalarsize>\t{%2, %0%{%3%}%N1|%0%{%3%}%N1, %2}
+    vpblendm<ssemodesuffix>\t{%2, %1, %0%{%3%}|%0%{%3%}, %1, %2}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
 
 (define_insn "<avx512>_store<mode>_mask"
   [(set (match_operand:V48_AVX512VL 0 "memory_operand" "=m")
index dcb8caaa73e2bf37b29f9ab2cc404a7e33c027b1..8603a1909c792eea004c003f33e4cc8f6224a74d 100644 (file)
@@ -1,8 +1,8 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512bw -mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "(?:vmovdqu16|vpblendmw)\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "(?:vmovdqu16|vpblendmw)\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "(?:vmovdqu16|vpblendmw)\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqu16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
index a335bcab3b2782f51b48cf50652443b9507d5a5f..d1e33926c81fdc6a8806751057a0692b2666b44c 100644 (file)
@@ -1,8 +1,8 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512bw -mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "(?:vmovdqu8|vpblendmb)\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "(?:vmovdqu8|vpblendmb)\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "(?:vmovdqu8|vpblendmb)\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu8\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu8\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu8\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqu8\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqu8\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqu8\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
index 7fc84b16e2b50fe49705315b9cb0f77b60ac1959..e869f70665aaddba48cacc970cb0e5c8b8feabf1 100644 (file)
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512f -O2" } */
-/* { dg-final { scan-assembler-times "(?:vmovapd|vblendmpd)\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovapd\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovapd\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovapd\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovapd\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
index c2e2655fda6e96aefce1031fe59dba9bb7c23cec..a7635a3ebf257eb225a230f0ce100bd15e06b2f6 100644 (file)
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512f -O2" } */
-/* { dg-final { scan-assembler-times "(?:vmovaps|vblendmps)\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovaps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovaps\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovaps\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovaps\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
index 8fb816c13176f73640667af9cfa7fa4a97fd52d7..b93727d9ef2635efe614098280c5422e9ef0ae86 100644 (file)
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512f -O2" } */
-/* { dg-final { scan-assembler-times "(?:vmovdqa32|vpblendmd)\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa32\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa32\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa32\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa32\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
index 4352b12b6e7f4c7e3496b0de9f5cd2e8ae1f1bd6..1c372c4f92a6cfba3f02a1b94fa6c42d04881d22 100644 (file)
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512f -O2" } */
-/* { dg-final { scan-assembler-times "(?:vmovdqa64|vpblendmq)\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
index fd59660f93227f0ba5e8fee000b240aab112f2e0..89c3ebefe35bc3389351224acb1fab53c687608b 100644 (file)
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "(?:vmovapd|vblendmpd)\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "(?:vmovapd|vblendmpd)\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovapd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovapd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovapd\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovapd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovapd\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
index 455b1a9dc37efa7b04bf6860f10f5186165c4a88..2196ebb55d903de1e595883a3a2e082bc8db8fc1 100644 (file)
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "(?:vmovaps|vblendmps)\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "(?:vmovaps|vblendmps)\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovaps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovaps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovaps\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovaps\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovaps\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
index 5c6a3d0bfb40f9d61c33c7310deb05bccff2d5bb..9f991dbaca2028c72c4eaee5194e2b6a506d7f1d 100644 (file)
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "(?:vmovdqa32|vpblendmd)\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "(?:vmovdqa32|vpblendmd)\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa32\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa32\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa32\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa32\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa32\[ \\t\]+\[^\{\n\]*\\)\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
index 592541aeb8e93e24ffcc246b4f74a1a2ed6bae83..d20b4a7b9973619a40a3da37e0345306de91d53a 100644 (file)
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512vl -O2" } */
-/* { dg-final { scan-assembler-times "(?:vmovdqa64|vpblendmq)\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
-/* { dg-final { scan-assembler-times "(?:vmovdqa64|vpblendmq)\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa64\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } } */
 /* { dg-final { scan-assembler-times "vmovdqa\[ \\t\]+\\(\[^\n\]*%ymm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 { target nonpic } } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr97642-1.c b/gcc/testsuite/gcc.target/i386/pr97642-1.c
new file mode 100644 (file)
index 0000000..f511440
--- /dev/null
@@ -0,0 +1,41 @@
+/* PR target/97642 */
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -O2" } */
+/* { dg-final { scan-assembler-not { k[0-8] } } } */
+
+#include <immintrin.h>
+__m128i
+foo1 (__m128i src, void const* P)
+{
+  return _mm_mask_loadu_epi32 (src, 15, P);
+}
+
+__m256i
+foo2 (__m256i src, void const* P)
+{
+  return _mm256_mask_loadu_epi32 (src, 255, P);
+}
+
+__m512i
+foo3 (__m512i src, void const* P)
+{
+  return _mm512_mask_loadu_epi32 (src, 65535 , P);
+}
+
+__m128i
+foo4 (__m128i src, void const* P)
+{
+  return _mm_mask_loadu_epi32 (src, -1, P);
+}
+
+__m256i
+foo5 (__m256i src, void const* P)
+{
+  return _mm256_mask_loadu_epi32 (src, -1, P);
+}
+
+__m512i
+foo6 (__m512i src, void const* P)
+{
+  return _mm512_mask_loadu_epi32 (src, -1 , P);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr97642-2.c b/gcc/testsuite/gcc.target/i386/pr97642-2.c
new file mode 100644 (file)
index 0000000..53a6154
--- /dev/null
@@ -0,0 +1,77 @@
+/* PR target/97642 */
+/* { dg-do run { target *-*-linux* } } */
+/* { dg-options "-O2 -mavx512dq -mavx512vl -mavx512bw" } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-require-effective-target avx512dq } */
+/* { dg-require-effective-target avx512bw } */
+
+#include <assert.h>
+#include <immintrin.h>
+#include <stdint.h>
+#include <sys/mman.h>
+
+#define N 5
+
+// Faults with GCC because of the use of vpblendd
+__m256i __attribute__((noinline)) mask_load(uint32_t * arr) {
+  __m256i tmp;
+  return _mm256_mask_loadu_epi32(tmp, (1 << N) - 1, arr);
+}
+
+// Faults
+__m256i __attribute__((noinline)) blend_load_asm(uint32_t * arr) {
+  __m256i tmp = _mm256_set1_epi64x(0);
+  asm volatile("vpblendd %[m], (%[arr]), %[tmp], %[tmp]\n\t"
+              : [ tmp ] "+x"(tmp)
+              : [ arr ] "r"(arr), [ m ] "i"(((1 << N) - 1))
+              :);
+  return tmp;
+}
+
+// Does not fault
+__m256i __attribute__((noinline)) mask_load_asm(uint32_t * arr) {
+  __m256i           tmp;
+  asm volatile(
+              "movb %[m], %%al\n\t"
+              "kmovb %%eax, %%k1\n\t"
+              "vmovdqu32 (%[arr]), %[tmp] %{%%k1} %{z%}\n\t"
+              : [ tmp ] "+x"(tmp)
+              : [ arr ] "r"(arr), [ m ] "i"(((1 << N) - 1))
+              : "eax", "k1");
+  return tmp;
+}
+
+
+void __attribute__((noinline)) mask_store(uint32_t * arr, __m256i v) {
+  return _mm256_mask_storeu_epi32(arr, (1 << N) - 1, v);
+}
+
+
+#define NPAGES      (2)
+#define END_OF_PAGE (1024 - N)
+
+#ifndef LOAD_METHOD
+#define LOAD_METHOD mask_load // mask_load_asm does not fault
+#endif
+
+
+int
+main() {
+  if (!(__builtin_cpu_supports ("avx512dq")
+       && __builtin_cpu_supports ("avx512vl")
+       && __builtin_cpu_supports ("avx512bw")))
+    return 0;
+
+  uint32_t * addr =
+    (uint32_t *)mmap(NULL, NPAGES * 4096, PROT_READ | PROT_WRITE,
+                    MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+
+  for (uint32_t i = 0; i < NPAGES; i += 2) {
+
+    uint32_t page_offset      = 1024 * i + END_OF_PAGE;
+    uint32_t next_page_offset = 1024 * (i + 1);
+
+    assert(!mprotect(addr + next_page_offset, 4096, PROT_NONE));
+    mask_store(addr + page_offset, LOAD_METHOD(addr + page_offset));
+  }
+}