i386.c (ix86_multiplication_cost, [...]): Break out from ...

author Jan Hubicka <hubicka@ucw.cz>

Fri, 17 Nov 2017 17:53:49 +0000 (18:53 +0100)

committer Jan Hubicka <hubicka@gcc.gnu.org>

Fri, 17 Nov 2017 17:53:49 +0000 (17:53 +0000)
author Jan Hubicka <hubicka@ucw.cz>
Fri, 17 Nov 2017 17:53:49 +0000 (18:53 +0100)
committer Jan Hubicka <hubicka@gcc.gnu.org>
Fri, 17 Nov 2017 17:53:49 +0000 (17:53 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index 3864ff2fa05afdffc60358ae756a1f4830deab0d..9d96f9564ef621c7def3e44e4617871763f6e7f1 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,11 @@
+2017-11-17  Jan Hubicka  <hubicka@ucw.cz>
+
+       * i386.c (ix86_multiplication_cost, ix86_division_cost,
+       ix86_shift_rotate_cost): Break out from ...
+       (ix86_rtx_costs): ... here.
+       (ix86_add_stmt_cost): Use rtx cost machinery to compute cost of
+       vector operations.
+
  2017-11-17  Jan Hubicka  <hubicka@ucw.cz>
  
         * predict.c (determine_unlikely_bbs): Set cgraph node count to 0
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c

index a53666966d6ca356269fdf62b870c5d3fca1281b..835c73db9d23c6b23c991c4a5c6a5d7ba6e8c396 100644 (file)
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -39285,6 +39285,168 @@ ix86_vec_cost (machine_mode mode, int cost, bool parallel)
    return cost;
  }
  
+/* Return cost of multiplication in MODE.  */
+
+static int
+ix86_multiplication_cost (const struct processor_costs *cost,
+                         enum machine_mode mode)
+{
+  machine_mode inner_mode = mode;
+  if (VECTOR_MODE_P (mode))
+    inner_mode = GET_MODE_INNER (mode);
+
+  if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+    return inner_mode == DFmode ? cost->mulsd : cost->mulss;
+  else if (X87_FLOAT_MODE_P (mode))
+    return cost->fmul;
+  else if (FLOAT_MODE_P (mode))
+    return  ix86_vec_cost (mode,
+                          inner_mode == DFmode
+                          ? cost->mulsd : cost->mulss, true);
+  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+    {
+      /* V*QImode is emulated with 7-13 insns.  */
+      if (mode == V16QImode || mode == V32QImode)
+       {
+         int extra = 11;
+         if (TARGET_XOP && mode == V16QImode)
+           extra = 5;
+         else if (TARGET_SSSE3)
+           extra = 6;
+         return ix86_vec_cost (mode,
+                               cost->mulss * 2 + cost->sse_op * extra,
+                               true);
+       }
+      /* V*DImode is emulated with 5-8 insns.  */
+      else if (mode == V2DImode || mode == V4DImode)
+       {
+         if (TARGET_XOP && mode == V2DImode)
+           return ix86_vec_cost (mode,
+                                 cost->mulss * 2 + cost->sse_op * 3,
+                                 true);
+         else
+           return ix86_vec_cost (mode,
+                                 cost->mulss * 3 + cost->sse_op * 5,
+                                 true);
+       }
+      /* Without sse4.1, we don't have PMULLD; it's emulated with 7
+        insns, including two PMULUDQ.  */
+      else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
+       return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5,
+                               true);
+      else
+       return ix86_vec_cost (mode, cost->mulss, true);
+    }
+  else
+    return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7);
+}
+
+/* Return cost of division in MODE.  */
+
+static int
+ix86_division_cost (const struct processor_costs *cost,
+                         enum machine_mode mode)
+{
+  machine_mode inner_mode = mode;
+  if (VECTOR_MODE_P (mode))
+    inner_mode = GET_MODE_INNER (mode);
+
+  if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+    return inner_mode == DFmode ? cost->divsd : cost->divss;
+  else if (X87_FLOAT_MODE_P (mode))
+    return cost->fdiv;
+  else if (FLOAT_MODE_P (mode))
+    return ix86_vec_cost (mode,
+                           inner_mode == DFmode ? cost->divsd : cost->divss,
+                           true);
+  else
+    return cost->divide[MODE_INDEX (mode)];
+}
+
+/* Return cost of shift or rotate in MODE.
+   If CONSTANT_OP1 is true, op1 is a constant and OP1_VAL holds its value
+   (callers pass -1 when the value is not available).  AND_IN_OP1 is true
+   if op1 is the result of an AND; SHIFT_AND_TRUNCATE is true if op1 is a
+   SUBREG of an AND.
+   SKIP_OP0/1 (may be NULL) is set to true if OP0/1 cost should be ignored.  */
+
+static int
+ix86_shift_rotate_cost (const struct processor_costs *cost,
+                       enum machine_mode mode, bool constant_op1,
+                       HOST_WIDE_INT op1_val,
+                       bool speed,
+                       bool and_in_op1,
+                       bool shift_and_truncate,
+                       bool *skip_op0, bool *skip_op1)
+{
+  if (skip_op0)
+    *skip_op0 = *skip_op1 = false;
+  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+    {
+      /* V*QImode is emulated with 1-11 insns.  */
+      if (mode == V16QImode || mode == V32QImode)
+       {
+         int count = 11;
+         if (TARGET_XOP && mode == V16QImode)
+           {
+             /* For XOP we use vpshab, which requires a broadcast of the
+                value to the variable shift insn.  For constants this
+                means a V16Q const in mem; even when we can perform the
+                shift with one insn set the cost to prefer paddb.  */
+             if (constant_op1)
+               {
+                 if (skip_op1)
+                   *skip_op1 = true;
+                 return ix86_vec_cost (mode,
+                           cost->sse_op
+                           + (speed
+                              ? 2
+                              : COSTS_N_BYTES
+                                (GET_MODE_UNIT_SIZE (mode))), true);
+               }
+             count = 3;
+           }
+         else if (TARGET_SSSE3)
+           count = 7;
+         return ix86_vec_cost (mode, cost->sse_op * count, true);
+       }
+      else
+       return ix86_vec_cost (mode, cost->sse_op, true);
+    }
+  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
+    {
+      if (constant_op1)
+       {
+         if (op1_val > 32)
+           return cost->shift_const + COSTS_N_INSNS (2);
+         else
+           return cost->shift_const * 2;
+       }
+      else
+       {
+         if (and_in_op1)
+           return cost->shift_var * 2;
+         else
+           return cost->shift_var * 6 + COSTS_N_INSNS (2);
+       }
+    }
+  else
+    {
+      if (constant_op1)
+       return cost->shift_const;
+      else if (shift_and_truncate)
+       {
+         if (skip_op0)
+           *skip_op0 = *skip_op1 = true;
+         /* Return the cost after shift-and truncation.  */
+         return cost->shift_var;
+       }
+      else
+       return cost->shift_var;
+    }
+  return cost->shift_const;
+}
+
  /* Compute a (partial) cost for rtx X.  Return true if the complete
     cost has been computed, and false if subexpressions should be
     scanned.  In either case, *TOTAL contains the cost result.  */
@@ -39298,9 +39460,6 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
    enum rtx_code outer_code = (enum rtx_code) outer_code_i;
    const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
    int src_cost;
-  machine_mode inner_mode = mode;
-  if (VECTOR_MODE_P (mode))
-    inner_mode = GET_MODE_INNER (mode);
  
    switch (code)
      {
@@ -39428,68 +39587,22 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
      case ASHIFTRT:
      case LSHIFTRT:
      case ROTATERT:
-      if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
-       {
-         /* ??? Should be SSE vector operation cost.  */
-         /* At least for published AMD latencies, this really is the same
-            as the latency for a simple fpu operation like fabs.  */
-         /* V*QImode is emulated with 1-11 insns.  */
-         if (mode == V16QImode || mode == V32QImode)
-           {
-             int count = 11;
-             if (TARGET_XOP && mode == V16QImode)
-               {
-                 /* For XOP we use vpshab, which requires a broadcast of the
-                    value to the variable shift insn.  For constants this
-                    means a V16Q const in mem; even when we can perform the
-                    shift with one insn set the cost to prefer paddb.  */
-                 if (CONSTANT_P (XEXP (x, 1)))
-                   {
-                     *total = ix86_vec_cost (mode,
-                               cost->sse_op
-                               + rtx_cost (XEXP (x, 0), mode, code, 0, speed)
-                               + (speed ? 2 : COSTS_N_BYTES (16)), true);
-                     return true;
-                   }
-                 count = 3;
-               }
-             else if (TARGET_SSSE3)
-               count = 7;
-             *total = ix86_vec_cost (mode, cost->sse_op * count, true);
-           }
-         else
-           *total = ix86_vec_cost (mode, cost->sse_op, true);
-       }
-      else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
-       {
-         if (CONST_INT_P (XEXP (x, 1)))
-           {
-             if (INTVAL (XEXP (x, 1)) > 32)
-               *total = cost->shift_const + COSTS_N_INSNS (2);
-             else
-               *total = cost->shift_const * 2;
-           }
-         else
-           {
-             if (GET_CODE (XEXP (x, 1)) == AND)
-               *total = cost->shift_var * 2;
-             else
-               *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
-           }
-       }
-      else
-       {
-         if (CONST_INT_P (XEXP (x, 1)))
-           *total = cost->shift_const;
-         else if (SUBREG_P (XEXP (x, 1))
-                  && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
-           {
-             /* Return the cost after shift-and truncation.  */
-             *total = cost->shift_var;
-             return true;
-           }
-         else
-           *total = cost->shift_var;
+      bool skip_op0, skip_op1;
+      *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)),
+                                      CONST_INT_P (XEXP (x, 1))
+                                        ? INTVAL (XEXP (x, 1)) : -1,
+                                      speed,
+                                      GET_CODE (XEXP (x, 1)) == AND,
+                                      SUBREG_P (XEXP (x, 1))
+                                      && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND,
+                                      &skip_op0, &skip_op1);
+      if (skip_op0 || skip_op1)
+       {
+         if (!skip_op0)
+           *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
+         if (!skip_op1)
+           *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed);
+         return true;
         }
        return false;
  
@@ -39519,59 +39632,7 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
        }
  
      case MULT:
-      if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
-       {
-         *total = inner_mode == DFmode ? cost->mulsd : cost->mulss;
-         return false;
-       }
-      else if (X87_FLOAT_MODE_P (mode))
-       {
-         *total = cost->fmul;
-         return false;
-       }
-      else if (FLOAT_MODE_P (mode))
-       {
-         *total = ix86_vec_cost (mode,
-                                 inner_mode == DFmode
-                                 ? cost->mulsd : cost->mulss, true);
-         return false;
-       }
-      else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
-       {
-         /* V*QImode is emulated with 7-13 insns.  */
-         if (mode == V16QImode || mode == V32QImode)
-           {
-             int extra = 11;
-             if (TARGET_XOP && mode == V16QImode)
-               extra = 5;
-             else if (TARGET_SSSE3)
-               extra = 6;
-             *total = ix86_vec_cost (mode,
-                                     cost->mulss * 2 + cost->sse_op * extra,
-                                     true);
-           }
-         /* V*DImode is emulated with 5-8 insns.  */
-         else if (mode == V2DImode || mode == V4DImode)
-           {
-             if (TARGET_XOP && mode == V2DImode)
-               *total = ix86_vec_cost (mode,
-                                       cost->mulss * 2 + cost->sse_op * 3,
-                                       true);
-             else
-               *total = ix86_vec_cost (mode,
-                                       cost->mulss * 3 + cost->sse_op * 5,
-                                       true);
-           }
-         /* Without sse4.1, we don't have PMULLD; it's emulated with 7
-            insns, including two PMULUDQ.  */
-         else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
-           *total = ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5,
-                                   true);
-         else
-           *total = ix86_vec_cost (mode, cost->mulss, true);
-         return false;
-       }
-      else
+      if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode))
         {
           rtx op0 = XEXP (x, 0);
           rtx op1 = XEXP (x, 1);
@@ -39616,21 +39677,14 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
  
            return true;
         }
+      *total = ix86_multiplication_cost (cost, mode);
+      return false;
  
      case DIV:
      case UDIV:
      case MOD:
      case UMOD:
-      if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
-       *total = inner_mode == DFmode ? cost->divsd : cost->divss;
-      else if (X87_FLOAT_MODE_P (mode))
-       *total = cost->fdiv;
-      else if (FLOAT_MODE_P (mode))
-       *total = ix86_vec_cost (mode,
-                               inner_mode == DFmode ? cost->divsd : cost->divss,
-                               true);
-      else
-       *total = cost->divide[MODE_INDEX (mode)];
+      *total = ix86_division_cost (cost, mode);
        return false;
  
      case PLUS:
@@ -48827,7 +48881,117 @@ ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
    unsigned retval = 0;
  
    tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
-  int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+  int stmt_cost = - 1;
+
+  if ((kind == vector_stmt || kind == scalar_stmt)
+      && stmt_info
+      && stmt_info->stmt && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN)
+    {
+      tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
+      bool fp = false;
+      machine_mode mode = TImode;
+
+      if (vectype != NULL)
+       {
+         fp = FLOAT_TYPE_P (vectype);
+         mode = TYPE_MODE (vectype);
+       }
+      /*machine_mode inner_mode = mode;
+      if (VECTOR_MODE_P (mode))
+       inner_mode = GET_MODE_INNER (mode);*/
+
+      switch (subcode)
+       {
+       case PLUS_EXPR:
+       case POINTER_PLUS_EXPR:
+       case MINUS_EXPR:
+         if (kind == scalar_stmt)
+           {
+             if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+               stmt_cost = ix86_cost->addss;
+             else if (X87_FLOAT_MODE_P (mode))
+               stmt_cost = ix86_cost->fadd;
+             else
+               stmt_cost = ix86_cost->add;
+           }
+         else
+           stmt_cost = ix86_vec_cost (mode,
+                                      fp ? ix86_cost->addss
+                                      : ix86_cost->sse_op,
+                                      true);
+         break;
+
+       case MULT_EXPR:
+       case WIDEN_MULT_EXPR:
+       case MULT_HIGHPART_EXPR:
+         stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
+         break;
+       case FMA_EXPR:
+          stmt_cost = ix86_vec_cost (mode,
+                                    mode == SFmode ? ix86_cost->fmass
+                                    : ix86_cost->fmasd,
+                                    true);
+         break;
+       case NEGATE_EXPR:
+         if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+           stmt_cost = ix86_cost->sse_op;
+         else if (X87_FLOAT_MODE_P (mode))
+           stmt_cost = ix86_cost->fchs;
+         else if (VECTOR_MODE_P (mode))
+           stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
+         else
+           stmt_cost = ix86_cost->add;
+         break;
+       case TRUNC_DIV_EXPR:
+       case CEIL_DIV_EXPR:
+       case FLOOR_DIV_EXPR:
+       case ROUND_DIV_EXPR:
+       case TRUNC_MOD_EXPR:
+       case CEIL_MOD_EXPR:
+       case FLOOR_MOD_EXPR:
+       case RDIV_EXPR:
+       case ROUND_MOD_EXPR:
+       case EXACT_DIV_EXPR:
+         stmt_cost = ix86_division_cost (ix86_cost, mode);
+         break;
+
+       case RSHIFT_EXPR:
+       case LSHIFT_EXPR:
+       case LROTATE_EXPR:
+       case RROTATE_EXPR:
+         {
+           tree op2 = gimple_assign_rhs2 (stmt_info->stmt);
+           stmt_cost = ix86_shift_rotate_cost
+                          (ix86_cost, mode,
+                           TREE_CODE (op2) == INTEGER_CST,
+                           cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1,
+                           true, false, false, NULL, NULL);
+         }
+         break;
+       case NOP_EXPR:
+         stmt_cost = 0;
+         break;
+
+       case BIT_IOR_EXPR:
+       case ABS_EXPR:
+       case MIN_EXPR:
+       case MAX_EXPR:
+       case BIT_XOR_EXPR:
+       case BIT_AND_EXPR:
+       case BIT_NOT_EXPR:
+         if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
+           stmt_cost = ix86_cost->sse_op;
+         else if (VECTOR_MODE_P (mode))
+           stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
+         else
+           stmt_cost = ix86_cost->add;
+         break;
+       default:
+         break;
+       }
+    }
+  if (stmt_cost == -1)
+    stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
  
    /* Penalize DFmode vector operations for Bonnell.  */
    if (TARGET_BONNELL && kind == vector_stmt
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog

index 149ed11a502c2ae2d2d16ebe5b6740d5b22c6a5d..c75df82e97dc8f070ff7437acdcc61e7375e09c8 100644 (file)
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2017-11-17  Jan Hubicka  <hubicka@ucw.cz>
+
+       * gcc.target/i386/xop-imul64-vector.c: Disable vectorizer costmodel.
+
  2017-11-17  Jan Hubicka  <hubicka@ucw.cz>
  
         * gcc.dg/tree-ssa/dump-2.c: Fixup template for profile precision
diff --git a/gcc/testsuite/gcc.target/i386/xop-imul64-vector.c b/gcc/testsuite/gcc.target/i386/xop-imul64-vector.c

index fc8c8806f78c88c0a5f92f0150ab3874fd5ed87a..43a27a5cb24a0d84bf640c80fa0d39b2d89f2511 100644 (file)
--- a/gcc/testsuite/gcc.target/i386/xop-imul64-vector.c
+++ b/gcc/testsuite/gcc.target/i386/xop-imul64-vector.c
@@ -3,7 +3,7 @@
  
  /* { dg-do compile } */
  /* { dg-require-effective-target lp64 } */
-/* { dg-options "-O2 -mxop -mno-avx2 -ftree-vectorize" } */
+/* { dg-options "-O2 -mxop -mno-avx2 -ftree-vectorize -fno-vect-cost-model" } */
  
  extern void exit (int);
author	Jan Hubicka <hubicka@ucw.cz>
	Fri, 17 Nov 2017 17:53:49 +0000 (18:53 +0100)
committer	Jan Hubicka <hubicka@gcc.gnu.org>
	Fri, 17 Nov 2017 17:53:49 +0000 (17:53 +0000)
gcc/ChangeLog		patch \| blob \| history
gcc/config/i386/i386.c		patch \| blob \| history
gcc/testsuite/ChangeLog		patch \| blob \| history
gcc/testsuite/gcc.target/i386/xop-imul64-vector.c		patch \| blob \| history