Fix TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL handling.
author Uros Bizjak <ubizjak@gmail.com>
Fri, 31 Jan 2020 15:44:36 +0000 (16:44 +0100)
committer Uros Bizjak <ubizjak@gmail.com>
Fri, 31 Jan 2020 15:44:36 +0000 (16:44 +0100)
The only reason for TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL on AMD targets
is insn size, as advised in e.g. the Software Optimization Guide for
AMD Family 15h Processors [1], section 7.1.2, which says:

--quote--
7.1.2 Reduce Instruction Size

Optimization

Reduce the size of instructions when possible.

Rationale

Using smaller instruction sizes improves instruction fetch throughput.
Specific examples include the following:

* In SIMD code, use the single-precision (PS) form of instructions
instead of the double-precision (PD) form. For example, for register
to register moves, MOVAPS achieves the same result as MOVAPD, but uses
one less byte to encode the instruction and has no prefix byte. Other
examples in which single-precision forms can be substituted for
double-precision forms include MOVUPS, MOVNTPS, XORPS, ORPS, ANDPS,
and SHUFPS.
...
--/quote--

Please note that this optimization applies only to non-AVX forms, as
demonstrated by:

   0:   0f 28 c8                movaps %xmm0,%xmm1
   3:   66 0f 28 c8             movapd %xmm0,%xmm1
   7:   c5 f8 28 d1             vmovaps %xmm1,%xmm2
   b:   c5 f9 28 d1             vmovapd %xmm1,%xmm2

Also note that MOVDQA is missing in the above optimization. It is
harmful to substitute MOVDQA with MOVAPS, as it can (and does)
introduce +1 cycle forwarding penalty between FLT (FPA/FPM) and INT
(VALU) FP clusters.

[1] https://www.amd.com/system/files/TechDocs/47414_15h_sw_opt_guide.pdf

gcc/ChangeLog
gcc/config/i386/i386.md
gcc/config/i386/mmx.md
gcc/config/i386/sse.md
gcc/config/i386/x86-tune.def

index 78a831086775c96ef071dd3a24a590722523959a..0e99b72043762dfdfcce25a711c8344d7b521c04 100644 (file)
@@ -1,3 +1,32 @@
+2020-01-31  Uroš Bizjak  <ubizjak@gmail.com>
+
+       * config/i386/i386.md (*movoi_internal_avx): Do not check for
+       TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL.  Remove MODE_V8SF handling.
+       (*movti_internal): Do not check for
+       TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL.
+       (*movtf_internal): Move check for TARGET_SSE2 and size optimization
+       just after check for TARGET_AVX.
+       (*movdf_internal): Ditto.
+       * config/i386/mmx.md (*mov<mode>_internal): Do not check for
+       TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL.
+       * config/i386/sse.md (mov<mode>_internal): Only check
+       TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL with V2DFmode.  Move check
+       for TARGET_SSE2 and size optimization just after check for TARGET_AVX.
+       (<sse>_andnot<mode>3<mask_name>): Move check for
+       TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL after check for TARGET_AVX.
+       (<code><mode>3<mask_name>): Ditto.
+       (*andnot<mode>3): Ditto.
+       (*andnottf3): Ditto.
+       (*<code><mode>3): Ditto.
+       (*<code>tf3): Ditto.
+       (*andnot<VI:mode>3): Remove
+       TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL handling.
+       (<mask_codefor><code><VI48_AVX_AVX512F:mode>3<mask_name>): Ditto.
+       (*<code><VI12_AVX_AVX512F:mode>3): Ditto.
+       (sse4_1_blendv<ssemodesuffix>): Ditto.
+       * config/i386/x86-tune.def (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL):
+       Explain that tune applies to 128bit instructions only.
+
 2020-01-31  Kwok Cheung Yeung  <kcy@codesourcery.com>
 
        * config/gcn/mkoffload.c (process_asm): Add sgpr_count and vgpr_count
index f5c8d552af8ddef6f0e564617f767d60991974bb..46b442dae51c66bc956ce5125c0e8143a48a524d 100644 (file)
       if (misaligned_operand (operands[0], OImode)
          || misaligned_operand (operands[1], OImode))
        {
-         if (get_attr_mode (insn) == MODE_V8SF)
-           return "vmovups\t{%1, %0|%0, %1}";
-         else if (get_attr_mode (insn) == MODE_XI)
+         if (get_attr_mode (insn) == MODE_XI)
            return "vmovdqu32\t{%1, %0|%0, %1}";
          else
            return "vmovdqu\t{%1, %0|%0, %1}";
        }
       else
        {
-         if (get_attr_mode (insn) == MODE_V8SF)
-           return "vmovaps\t{%1, %0|%0, %1}";
-         else if (get_attr_mode (insn) == MODE_XI)
+         if (get_attr_mode (insn) == MODE_XI)
            return "vmovdqa32\t{%1, %0|%0, %1}";
          else
            return "vmovdqa\t{%1, %0|%0, %1}";
               (and (eq_attr "alternative" "1")
                    (match_test "TARGET_AVX512VL"))
                 (const_string "XI")
-              (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
-                (const_string "V8SF")
              ]
              (const_string "OI")))])
 
               (match_test "TARGET_AVX")
                 (const_string "TI")
               (ior (not (match_test "TARGET_SSE2"))
-                   (ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
-                        (and (eq_attr "alternative" "5")
-                             (match_test "TARGET_SSE_TYPELESS_STORES"))))
+                   (match_test "optimize_function_for_size_p (cfun)"))
                 (const_string "V4SF")
-              (match_test "optimize_function_for_size_p (cfun)")
+              (and (eq_attr "alternative" "5")
+                   (match_test "TARGET_SSE_TYPELESS_STORES"))
                 (const_string "V4SF")
               ]
               (const_string "TI")))
              (cond [(ior (match_operand 0 "ext_sse_reg_operand")
                          (match_operand 1 "ext_sse_reg_operand"))
                       (const_string "TI")
-                    (ior (not (match_test "TARGET_SSE2"))
-                         (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
-                      (const_string "V4SF")
                     (match_test "TARGET_AVX")
                       (const_string "TI")
-                    (match_test "optimize_function_for_size_p (cfun)")
+                    (ior (not (match_test "TARGET_SSE2"))
+                         (match_test "optimize_function_for_size_p (cfun)"))
                       (const_string "V4SF")
                    ]
                    (const_string "TI"))
              (cond [(ior (match_operand 0 "ext_sse_reg_operand")
                          (match_operand 1 "ext_sse_reg_operand"))
                       (const_string "XI")
-                    (ior (not (match_test "TARGET_SSE2"))
-                         (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
-                      (const_string "V4SF")
                     (match_test "TARGET_AVX")
                       (const_string "TI")
-                    (match_test "optimize_function_for_size_p (cfun)")
+                    (ior (not (match_test "TARGET_SSE2"))
+                         (match_test "optimize_function_for_size_p (cfun)"))
                       (const_string "V4SF")
                    ]
                    (const_string "TI"))
                 (const_string "DI")
               (match_test "TARGET_AVX")
                 (const_string "TI")
+              (ior (not (match_test "TARGET_SSE2"))
+                   (match_test "optimize_function_for_size_p (cfun)"))
+                (const_string "V4SF")
               (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
                 (const_string "V4SF")
               (and (eq_attr "alternative" "2")
                    (match_test "TARGET_SSE_TYPELESS_STORES"))
                 (const_string "V4SF")
-              (ior (not (match_test "TARGET_SSE2"))
-                   (match_test "optimize_function_for_size_p (cfun)"))
-                (const_string "V4SF")
               ]
               (const_string "TI")))])
 
 
               /* xorps is one byte shorter for non-AVX targets.  */
               (eq_attr "alternative" "12,16")
-                (cond [(not (match_test "TARGET_SSE2"))
-                         (const_string "V4SF")
-                       (and (match_test "TARGET_AVX512F")
-                         (not (match_test "TARGET_PREFER_AVX256")))
+                (cond [(and (match_test "TARGET_AVX512F")
+                            (not (match_test "TARGET_PREFER_AVX256")))
                          (const_string "XI")
                        (match_test "TARGET_AVX")
                          (const_string "V2DF")
-                       (match_test "optimize_function_for_size_p (cfun)")
+                       (ior (not (match_test "TARGET_SSE2"))
+                            (match_test "optimize_function_for_size_p (cfun)"))
                          (const_string "V4SF")
                        (match_test "TARGET_SSE_LOAD0_BY_PXOR")
                          (const_string "TI")
                             (ior (match_operand 0 "ext_sse_reg_operand")
                                  (match_operand 1 "ext_sse_reg_operand")))
                          (const_string "V8DF")
+                       (match_test "TARGET_AVX")
+                         (const_string "DF")
                        (ior (not (match_test "TARGET_SSE2"))
-                            (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
+                            (match_test "optimize_function_for_size_p (cfun)"))
+                         (const_string "V4SF")
+                       (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
                          (const_string "V4SF")
                        (match_test "TARGET_SSE_PARTIAL_REG_DEPENDENCY")
                          (const_string "V2DF")
-                       (match_test "TARGET_AVX")
-                         (const_string "DF")
-                       (match_test "optimize_function_for_size_p (cfun)")
-                         (const_string "V4SF")
                       ]
                       (const_string "DF"))
 
               (eq_attr "alternative" "11")
                 (const_string "DI")
               (eq_attr "alternative" "5")
-                (cond [(not (match_test "TARGET_SSE2"))
-                         (const_string "V4SF")
-                       (and (match_test "TARGET_AVX512F")
-                         (not (match_test "TARGET_PREFER_AVX256")))
+                (cond [(and (match_test "TARGET_AVX512F")
+                            (not (match_test "TARGET_PREFER_AVX256")))
                          (const_string "V16SF")
                        (match_test "TARGET_AVX")
                          (const_string "V4SF")
-                       (match_test "optimize_function_for_size_p (cfun)")
+                       (ior (not (match_test "TARGET_SSE2"))
+                            (match_test "optimize_function_for_size_p (cfun)"))
                          (const_string "V4SF")
-                       (match_test "TARGET_SSE_LOAD0_BY_PXOR")
+                       (match_test "TARGET_SSE_LOAD0_BY_PXOR")
                          (const_string "TI")
                       ]
                       (const_string "V4SF"))
index d06e3b1fcb49989098e76d1cdf77219f3b430601..f695831b5b90d012ad6efa922a80cf00c2c4ab05 100644 (file)
                     (match_test "<MODE>mode == V2SFmode")
                       (const_string "V4SF")
                     (ior (not (match_test "TARGET_SSE2"))
-                         (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
-                      (const_string "V4SF")
-                    (match_test "TARGET_AVX")
-                      (const_string "TI")
-                    (match_test "optimize_function_for_size_p (cfun)")
+                         (match_test "optimize_function_for_size_p (cfun)"))
                       (const_string "V4SF")
                    ]
                    (const_string "TI"))
index f5ff2e930213b771704ed7d3dc74dbc10dc52026..46f00e3d007996373ae3783ef8a5d6b865da87bc 100644 (file)
                 (const_string "<sseinsnmode>")
               (match_test "TARGET_AVX")
                 (const_string "<sseinsnmode>")
-              (ior (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
-                   (and (eq_attr "alternative" "3")
-                        (match_test "TARGET_SSE_TYPELESS_STORES")))
-                (const_string "<ssePSmode>")
               (ior (not (match_test "TARGET_SSE2"))
                    (match_test "optimize_function_for_size_p (cfun)"))
                 (const_string "V4SF")
+              (and (match_test "<MODE>mode == V2DFmode")
+                   (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
+                (const_string "V4SF")
+              (and (eq_attr "alternative" "3")
+                   (match_test "TARGET_SSE_TYPELESS_STORES"))
+                (const_string "V4SF")
               (and (eq_attr "alternative" "0")
                    (match_test "TARGET_SSE_LOAD0_BY_PXOR"))
                 (const_string "TI")
                 (const_string "<sseintvecmode2>")
               (eq_attr "alternative" "3")
                 (const_string "<sseintvecmode2>")
-              (and (match_test "<MODE_SIZE> == 16")
-                   (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
-                (const_string "<ssePSmode>")
               (match_test "TARGET_AVX")
                 (const_string "<MODE>")
               (match_test "optimize_function_for_size_p (cfun)")
                 (const_string "V4SF")
-              ]
-              (const_string "<MODE>")))])
-
+              (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+                (const_string "V4SF")
+             ]
+             (const_string "<MODE>")))])
 
 (define_insn "<sse>_andnot<mode>3<mask_name>"
   [(set (match_operand:VF_512 0 "register_operand" "=v")
                 (const_string "<sseintvecmode2>")
               (eq_attr "alternative" "3")
                 (const_string "<sseintvecmode2>")
-              (and (match_test "<MODE_SIZE> == 16")
-                   (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
-                (const_string "<ssePSmode>")
               (match_test "TARGET_AVX")
                 (const_string "<MODE>")
               (match_test "optimize_function_for_size_p (cfun)")
                 (const_string "V4SF")
-              ]
-              (const_string "<MODE>")))])
+              (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+                (const_string "V4SF")
+             ]
+             (const_string "<MODE>")))])
 
 (define_insn "*<code><mode>3<mask_name>"
   [(set (match_operand:VF_512 0 "register_operand" "=v")
                 (if_then_else (match_test "TARGET_AVX512DQ")
                               (const_string "<avx512fvecmode>")
                               (const_string "XI"))
-              (and (match_test "<MODE_SIZE> == 16")
-                   (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
-                (const_string "V4SF")
               (match_test "TARGET_AVX")
                 (const_string "<ssevecmode>")
               (match_test "optimize_function_for_size_p (cfun)")
                 (const_string "V4SF")
-              ]
-              (const_string "<ssevecmode>")))])
+              (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+                (const_string "V4SF")
+             ]
+             (const_string "<ssevecmode>")))])
 
 (define_insn "*andnottf3"
   [(set (match_operand:TF 0 "register_operand" "=x,x,v,v")
                 (const_string "TI")
               (eq_attr "alternative" "3")
                 (const_string "XI")
-              (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
-                (const_string "V4SF")
               (match_test "TARGET_AVX")
                 (const_string "TI")
               (ior (not (match_test "TARGET_SSE2"))
                    (match_test "optimize_function_for_size_p (cfun)"))
                 (const_string "V4SF")
-              ]
-              (const_string "TI")))])
+              (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+                (const_string "V4SF")
+             ]
+             (const_string "TI")))])
 
 (define_insn "*<code><mode>3"
   [(set (match_operand:MODEF 0 "register_operand" "=x,x,v,v")
                 (if_then_else (match_test "TARGET_AVX512DQ")
                               (const_string "<avx512fvecmode>")
                               (const_string "XI"))
-              (and (match_test "<MODE_SIZE> == 16")
-                   (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
-                (const_string "V4SF")
               (match_test "TARGET_AVX")
                 (const_string "<ssevecmode>")
               (match_test "optimize_function_for_size_p (cfun)")
                 (const_string "V4SF")
-              ]
-              (const_string "<ssevecmode>")))])
+              (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+                (const_string "V4SF")
+             ]
+             (const_string "<ssevecmode>")))])
 
 (define_expand "<code>tf3"
   [(set (match_operand:TF 0 "register_operand")
                 (const_string "TI")
               (eq_attr "alternative" "3")
                 (const_string "QI")
-              (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
-                (const_string "V4SF")
               (match_test "TARGET_AVX")
                 (const_string "TI")
               (ior (not (match_test "TARGET_SSE2"))
                    (match_test "optimize_function_for_size_p (cfun)"))
                 (const_string "V4SF")
-              ]
-              (const_string "TI")))])
+              (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+                (const_string "V4SF")
+             ]
+             (const_string "TI")))])
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
        (const_string "*")))
    (set_attr "prefix" "orig,vex,evex")
    (set (attr "mode")
-       (cond [(and (match_test "<MODE_SIZE> == 16")
-                   (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
-                (const_string "<ssePSmode>")
-              (match_test "TARGET_AVX2")
+       (cond [(match_test "TARGET_AVX2")
                 (const_string "<sseinsnmode>")
               (match_test "TARGET_AVX")
                 (if_then_else
        (const_string "*")))
    (set_attr "prefix" "<mask_prefix3>,evex")
    (set (attr "mode")
-       (cond [(and (match_test "<MODE_SIZE> == 16")
-                   (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
-                (const_string "<ssePSmode>")
-              (match_test "TARGET_AVX2")
+       (cond [(match_test "TARGET_AVX2")
                 (const_string "<sseinsnmode>")
               (match_test "TARGET_AVX")
                 (if_then_else
        (const_string "*")))
    (set_attr "prefix" "orig,vex,evex")
    (set (attr "mode")
-       (cond [(and (match_test "<MODE_SIZE> == 16")
-                   (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL"))
-                (const_string "<ssePSmode>")
-              (match_test "TARGET_AVX2")
+       (cond [(match_test "TARGET_AVX2")
                 (const_string "<sseinsnmode>")
               (match_test "TARGET_AVX")
                 (if_then_else
    (set_attr "prefix" "orig,orig,vex")
    (set_attr "btver2_decode" "vector,vector,vector") 
    (set (attr "mode")
-       (cond [(match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
-                (const_string "V4SF")
-              (match_test "TARGET_AVX")
+       (cond [(match_test "TARGET_AVX")
                 (const_string "<ssevecmode>")
               (match_test "optimize_function_for_size_p (cfun)")
                 (const_string "V4SF")
-              ]
-              (const_string "<ssevecmode>")))])
+              (match_test "TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL")
+                (const_string "V4SF")
+             ]
+             (const_string "<ssevecmode>")))])
 
 (define_insn_and_split "*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt"
   [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x")
index 212547405a3f20f87d133a2db9884ce687fa8704..41b3d52653d91c42d4593e063624b34eb3c230c7 100644 (file)
@@ -366,15 +366,15 @@ DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
          | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS
          | m_TREMONT | m_AMDFAM10 | m_BDVER | m_BTVER | m_ZNVER | m_GENERIC)
 
-/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores instead
-   of a sequence loading registers by parts.  */
+/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores
+   instead of a sequence loading registers by parts.  */
 DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM
          | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS
          | m_TREMONT | m_BDVER | m_ZNVER | m_GENERIC)
 
-/* Use packed single precision instructions where posisble.  I.e. movups instead
-   of movupd.  */
+/* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL: Use packed single
+   precision 128bit instructions instead of double where possible.   */
 DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
          m_BDVER | m_ZNVER)