Model cache auto-prefetcher in scheduler

author Maxim Kuvyrkov <maxim.kuvyrkov@linaro.org>

Sat, 17 Jan 2015 01:06:43 +0000 (01:06 +0000)

committer Maxim Kuvyrkov <mkuvyrkov@gcc.gnu.org>

Sat, 17 Jan 2015 01:06:43 +0000 (01:06 +0000)
author Maxim Kuvyrkov <maxim.kuvyrkov@linaro.org>
Sat, 17 Jan 2015 01:06:43 +0000 (01:06 +0000)
committer Maxim Kuvyrkov <mkuvyrkov@gcc.gnu.org>
Sat, 17 Jan 2015 01:06:43 +0000 (01:06 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index 8cc1dc7bf99fd677fd7464dd7dac06419b5819d2..4f0414a6b1651f10b9891b26c2f2c03996c87805 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,36 @@
+2015-01-17  Maxim Kuvyrkov  <maxim.kuvyrkov@linaro.org>
+
+       * config/arm/arm-protos.h (struct tune_params): New field
+       sched_autopref_queue_depth.
+       * config/arm/arm.c (sched-int.h): Include header.
+       (arm_first_cycle_multipass_dfa_lookahead_guard,)
+       (TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD): Define hook.
+       (arm_slowmul_tune, arm_fastmul_tune, arm_strongarm_tune,)
+       (arm_xscale_tune, arm_9e_tune, arm_v6t2_tune, arm_cortex_tune,)
+       (arm_cortex_a8_tune, arm_cortex_a7_tune, arm_cortex_a15_tune,)
+       (arm_cortex_a53_tune, arm_cortex_a57_tune, arm_xgene1_tune,)
+       (arm_cortex_a5_tune, arm_cortex_a9_tune, arm_cortex_a12_tune,)
+       (arm_v7m_tune, arm_cortex_m7_tune, arm_v6m_tune, arm_fa726te_tune):
+       Specify sched_autopref_queue_depth value.  Enabled for A15 and A57.
+       * config/arm/t-arm (arm.o): Update.
+       * haifa-sched.c (update_insn_after_change): Update.
+       (rank_for_schedule): Use auto-prefetcher model, if requested.
+       (autopref_multipass_init): New static function.
+       (autopref_rank_for_schedule): New rank_for_schedule heuristic.
+       (autopref_multipass_dfa_lookahead_guard_started_dump_p): New static
+       variable for debug dumps.
+       (autopref_multipass_dfa_lookahead_guard_1): New static helper function.
+       (autopref_multipass_dfa_lookahead_guard): New global function that
+       implements TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD hook.
+       (init_h_i_d): Update.
+       * params.def (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH): New tuning knob.
+       * sched-int.h (enum autopref_multipass_data_status): New const enum.
+       (autopref_multipass_data_): Structure for auto-prefetcher data.
+       (autopref_multipass_data_def, autopref_multipass_data_t): New typedefs.
+       (struct _haifa_insn_data:autopref_multipass_data): New field.
+       (INSN_AUTOPREF_MULTIPASS_DATA): New access macro.
+       (autopref_multipass_dfa_lookahead_guard): Declare.
+
  2015-01-17  Maxim Kuvyrkov  <maxim.kuvyrkov@linaro.org>
  
         * rtlanal.c (get_base_term): Handle SCRATCH.
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h

index 320215bcaf6c6dd3ac97fe584cf1b5f2a122b3d6..3db7e1695f8faa4ffdf944a8d65f136d86e40335 100644 (file)
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -291,6 +291,8 @@ struct tune_params
    int max_insns_inline_memset;
    /* Bitfield encoding the fuseable pairs of instructions.  */
    unsigned int fuseable_ops;
+  /* Depth of scheduling queue to check for L2 autoprefetcher.  */
+  int sched_autopref_queue_depth;
  };
  
  extern const struct tune_params *current_tune;
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c

index 337a69b43e00179fd8944e19920781b1dd706039..fddd770897235db8a407816599b9c4e8620493af 100644 (file)
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -97,6 +97,7 @@
  #include "builtins.h"
  #include "tm-constrs.h"
  #include "rtl-iter.h"
+#include "sched-int.h"
  
  /* Forward definitions of types.  */
  typedef struct minipool_node    Mnode;
@@ -269,6 +270,7 @@ static bool arm_macro_fusion_p (void);
  static bool arm_cannot_copy_insn_p (rtx_insn *);
  static int arm_issue_rate (void);
  static int arm_first_cycle_multipass_dfa_lookahead (void);
+static int arm_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *, int);
  static void arm_output_dwarf_dtprel (FILE *, int, rtx) ATTRIBUTE_UNUSED;
  static bool arm_output_addr_const_extra (FILE *, rtx);
  static bool arm_allocate_stack_slots_for_args (void);
@@ -629,6 +631,10 @@ static const struct attribute_spec arm_attribute_table[] =
  #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
    arm_first_cycle_multipass_dfa_lookahead
  
+#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
+#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
+  arm_first_cycle_multipass_dfa_lookahead_guard
+
  #undef TARGET_MANGLE_TYPE
  #define TARGET_MANGLE_TYPE arm_mangle_type
  
@@ -1690,7 +1696,8 @@ const struct tune_params arm_slowmul_tune =
    false, false,                                 /* Prefer 32-bit encodings.  */
    false,                                       /* Prefer Neon for stringops.  */
    8,                                           /* Maximum insns to inline memset.  */
-  ARM_FUSE_NOTHING                             /* Fuseable pairs of instructions.  */
+  ARM_FUSE_NOTHING,                            /* Fuseable pairs of instructions.  */
+  -1                                           /* Sched L2 autopref depth.  */
  };
  
  const struct tune_params arm_fastmul_tune =
@@ -1710,7 +1717,8 @@ const struct tune_params arm_fastmul_tune =
    false, false,                                 /* Prefer 32-bit encodings.  */
    false,                                       /* Prefer Neon for stringops.  */
    8,                                           /* Maximum insns to inline memset.  */
-  ARM_FUSE_NOTHING                             /* Fuseable pairs of instructions.  */
+  ARM_FUSE_NOTHING,                            /* Fuseable pairs of instructions.  */
+  -1                                           /* Sched L2 autopref depth.  */
  };
  
  /* StrongARM has early execution of branches, so a sequence that is worth
@@ -1733,7 +1741,8 @@ const struct tune_params arm_strongarm_tune =
    false, false,                                 /* Prefer 32-bit encodings.  */
    false,                                       /* Prefer Neon for stringops.  */
    8,                                           /* Maximum insns to inline memset.  */
-  ARM_FUSE_NOTHING                             /* Fuseable pairs of instructions.  */
+  ARM_FUSE_NOTHING,                            /* Fuseable pairs of instructions.  */
+  -1                                           /* Sched L2 autopref depth.  */
  };
  
  const struct tune_params arm_xscale_tune =
@@ -1753,7 +1762,8 @@ const struct tune_params arm_xscale_tune =
    false, false,                                 /* Prefer 32-bit encodings.  */
    false,                                       /* Prefer Neon for stringops.  */
    8,                                           /* Maximum insns to inline memset.  */
-  ARM_FUSE_NOTHING                             /* Fuseable pairs of instructions.  */
+  ARM_FUSE_NOTHING,                            /* Fuseable pairs of instructions.  */
+  -1                                           /* Sched L2 autopref depth.  */
  };
  
  const struct tune_params arm_9e_tune =
@@ -1773,7 +1783,8 @@ const struct tune_params arm_9e_tune =
    false, false,                                 /* Prefer 32-bit encodings.  */
    false,                                       /* Prefer Neon for stringops.  */
    8,                                           /* Maximum insns to inline memset.  */
-  ARM_FUSE_NOTHING                             /* Fuseable pairs of instructions.  */
+  ARM_FUSE_NOTHING,                            /* Fuseable pairs of instructions.  */
+  -1                                           /* Sched L2 autopref depth.  */
  };
  
  const struct tune_params arm_v6t2_tune =
@@ -1793,7 +1804,8 @@ const struct tune_params arm_v6t2_tune =
    false, false,                                 /* Prefer 32-bit encodings.  */
    false,                                       /* Prefer Neon for stringops.  */
    8,                                           /* Maximum insns to inline memset.  */
-  ARM_FUSE_NOTHING                             /* Fuseable pairs of instructions.  */
+  ARM_FUSE_NOTHING,                            /* Fuseable pairs of instructions.  */
+  -1                                           /* Sched L2 autopref depth.  */
  };
  
  /* Generic Cortex tuning.  Use more specific tunings if appropriate.  */
@@ -1814,7 +1826,8 @@ const struct tune_params arm_cortex_tune =
    false, false,                                 /* Prefer 32-bit encodings.  */
    false,                                       /* Prefer Neon for stringops.  */
    8,                                           /* Maximum insns to inline memset.  */
-  ARM_FUSE_NOTHING                             /* Fuseable pairs of instructions.  */
+  ARM_FUSE_NOTHING,                            /* Fuseable pairs of instructions.  */
+  -1                                           /* Sched L2 autopref depth.  */
  };
  
  const struct tune_params arm_cortex_a8_tune =
@@ -1834,7 +1847,8 @@ const struct tune_params arm_cortex_a8_tune =
    false, false,                                 /* Prefer 32-bit encodings.  */
    true,                                                /* Prefer Neon for stringops.  */
    8,                                           /* Maximum insns to inline memset.  */
-  ARM_FUSE_NOTHING                             /* Fuseable pairs of instructions.  */
+  ARM_FUSE_NOTHING,                            /* Fuseable pairs of instructions.  */
+  -1                                           /* Sched L2 autopref depth.  */
  };
  
  const struct tune_params arm_cortex_a7_tune =
@@ -1854,7 +1868,8 @@ const struct tune_params arm_cortex_a7_tune =
    false, false,                                 /* Prefer 32-bit encodings.  */
    true,                                                /* Prefer Neon for stringops.  */
    8,                                           /* Maximum insns to inline memset.  */
-  ARM_FUSE_NOTHING                             /* Fuseable pairs of instructions.  */
+  ARM_FUSE_NOTHING,                            /* Fuseable pairs of instructions.  */
+  -1                                           /* Sched L2 autopref depth.  */
  };
  
  const struct tune_params arm_cortex_a15_tune =
@@ -1874,7 +1889,8 @@ const struct tune_params arm_cortex_a15_tune =
    true, true,                                   /* Prefer 32-bit encodings.  */
    true,                                                /* Prefer Neon for stringops.  */
    8,                                           /* Maximum insns to inline memset.  */
-  ARM_FUSE_NOTHING                             /* Fuseable pairs of instructions.  */
+  ARM_FUSE_NOTHING,                            /* Fuseable pairs of instructions.  */
+  max_insn_queue_index + 1                     /* Sched L2 autopref depth.  */
  };
  
  const struct tune_params arm_cortex_a53_tune =
@@ -1894,7 +1910,8 @@ const struct tune_params arm_cortex_a53_tune =
    false, false,                                 /* Prefer 32-bit encodings.  */
    false,                                       /* Prefer Neon for stringops.  */
    8,                                           /* Maximum insns to inline memset.  */
-  ARM_FUSE_MOVW_MOVT                           /* Fuseable pairs of instructions.  */
+  ARM_FUSE_MOVW_MOVT,                          /* Fuseable pairs of instructions.  */
+  -1                                           /* Sched L2 autopref depth.  */
  };
  
  const struct tune_params arm_cortex_a57_tune =
@@ -1914,7 +1931,8 @@ const struct tune_params arm_cortex_a57_tune =
    true, true,                                  /* Prefer 32-bit encodings.  */
    false,                                       /* Prefer Neon for stringops.  */
    8,                                           /* Maximum insns to inline memset.  */
-  ARM_FUSE_MOVW_MOVT                           /* Fuseable pairs of instructions.  */
+  ARM_FUSE_MOVW_MOVT,                          /* Fuseable pairs of instructions.  */
+  max_insn_queue_index + 1                     /* Sched L2 autopref depth.  */
  };
  
  const struct tune_params arm_xgene1_tune =
@@ -1934,7 +1952,8 @@ const struct tune_params arm_xgene1_tune =
    true, true,                                  /* Prefer 32-bit encodings.  */
    false,                                      /* Prefer Neon for stringops.  */
    32,                                         /* Maximum insns to inline memset.  */
-  ARM_FUSE_NOTHING                             /* Fuseable pairs of instructions.  */
+  ARM_FUSE_NOTHING,                            /* Fuseable pairs of instructions.  */
+  -1                                           /* Sched L2 autopref depth.  */
  };
  
  /* Branches can be dual-issued on Cortex-A5, so conditional execution is
@@ -1957,7 +1976,8 @@ const struct tune_params arm_cortex_a5_tune =
    false, false,                                 /* Prefer 32-bit encodings.  */
    true,                                                /* Prefer Neon for stringops.  */
    8,                                           /* Maximum insns to inline memset.  */
-  ARM_FUSE_NOTHING                             /* Fuseable pairs of instructions.  */
+  ARM_FUSE_NOTHING,                            /* Fuseable pairs of instructions.  */
+  -1                                           /* Sched L2 autopref depth.  */
  };
  
  const struct tune_params arm_cortex_a9_tune =
@@ -1977,7 +1997,8 @@ const struct tune_params arm_cortex_a9_tune =
    false, false,                                 /* Prefer 32-bit encodings.  */
    false,                                       /* Prefer Neon for stringops.  */
    8,                                           /* Maximum insns to inline memset.  */
-  ARM_FUSE_NOTHING                             /* Fuseable pairs of instructions.  */
+  ARM_FUSE_NOTHING,                            /* Fuseable pairs of instructions.  */
+  -1                                           /* Sched L2 autopref depth.  */
  };
  
  const struct tune_params arm_cortex_a12_tune =
@@ -1997,7 +2018,8 @@ const struct tune_params arm_cortex_a12_tune =
    true, true,                                   /* Prefer 32-bit encodings.  */
    true,                                                /* Prefer Neon for stringops.  */
    8,                                           /* Maximum insns to inline memset.  */
-  ARM_FUSE_MOVW_MOVT                           /* Fuseable pairs of instructions.  */
+  ARM_FUSE_MOVW_MOVT,                          /* Fuseable pairs of instructions.  */
+  -1                                           /* Sched L2 autopref depth.  */
  };
  
  /* armv7m tuning.  On Cortex-M4 cores for example, MOVW/MOVT take a single
@@ -2024,7 +2046,8 @@ const struct tune_params arm_v7m_tune =
    false, false,                                 /* Prefer 32-bit encodings.  */
    false,                                       /* Prefer Neon for stringops.  */
    8,                                           /* Maximum insns to inline memset.  */
-  ARM_FUSE_NOTHING                             /* Fuseable pairs of instructions.  */
+  ARM_FUSE_NOTHING,                            /* Fuseable pairs of instructions.  */
+  -1                                           /* Sched L2 autopref depth.  */
  };
  
  /* Cortex-M7 tuning.  */
@@ -2046,7 +2069,8 @@ const struct tune_params arm_cortex_m7_tune =
    false, false,                                 /* Prefer 32-bit encodings.  */
    false,                                       /* Prefer Neon for stringops.  */
    8,                                           /* Maximum insns to inline memset.  */
-  ARM_FUSE_NOTHING                             /* Fuseable pairs of instructions.  */
+  ARM_FUSE_NOTHING,                            /* Fuseable pairs of instructions.  */
+  -1                                           /* Sched L2 autopref depth.  */
  };
  
  /* The arm_v6m_tune is duplicated from arm_cortex_tune, rather than
@@ -2068,7 +2092,8 @@ const struct tune_params arm_v6m_tune =
    false, false,                                 /* Prefer 32-bit encodings.  */
    false,                                       /* Prefer Neon for stringops.  */
    8,                                           /* Maximum insns to inline memset.  */
-  ARM_FUSE_NOTHING                             /* Fuseable pairs of instructions.  */
+  ARM_FUSE_NOTHING,                            /* Fuseable pairs of instructions.  */
+  -1                                           /* Sched L2 autopref depth.  */
  };
  
  const struct tune_params arm_fa726te_tune =
@@ -2088,7 +2113,8 @@ const struct tune_params arm_fa726te_tune =
    false, false,                                 /* Prefer 32-bit encodings.  */
    false,                                       /* Prefer Neon for stringops.  */
    8,                                           /* Maximum insns to inline memset.  */
-  ARM_FUSE_NOTHING                             /* Fuseable pairs of instructions.  */
+  ARM_FUSE_NOTHING,                            /* Fuseable pairs of instructions.  */
+  -1                                           /* Sched L2 autopref depth.  */
  };
  
  
@@ -3144,6 +3170,13 @@ arm_option_override (void)
                           global_options.x_param_values,
                           global_options_set.x_param_values);
  
+  /* Look through ready list and all of queue for instructions
+     relevant for L2 auto-prefetcher.  */
+  maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
+                        current_tune->sched_autopref_queue_depth,
+                         global_options.x_param_values,
+                         global_options_set.x_param_values);
+
    /* Disable shrink-wrap when optimizing function for size, since it tends to
       generate additional returns.  */
    if (optimize_function_for_size_p (cfun) && TARGET_THUMB2)
@@ -27153,6 +27186,13 @@ arm_first_cycle_multipass_dfa_lookahead (void)
    return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
  }
  
+/* Enable modeling of L2 auto-prefetcher.  */
+static int
+arm_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn, int ready_index)
+{
+  return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
+}
+
  const char *
  arm_mangle_type (const_tree type)
  {
diff --git a/gcc/config/arm/t-arm b/gcc/config/arm/t-arm

index 4ef38a87f013a0bc3e4b41bc3eda8dcd573caa3b..ab5b6e7d598ec2776ac28f50fa148c31fdc3b291 100644 (file)
--- a/gcc/config/arm/t-arm
+++ b/gcc/config/arm/t-arm
@@ -91,7 +91,8 @@ arm.o: $(srcdir)/config/arm/arm.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
    $(EXPR_H) $(OPTABS_H) $(RECOG_H) $(CGRAPH_H) \
    $(GGC_H) except.h $(C_PRAGMA_H) $(TM_P_H) \
    $(TARGET_H) $(TARGET_DEF_H) debug.h langhooks.h $(DF_H) \
-  intl.h libfuncs.h $(PARAMS_H) $(OPTS_H) $(srcdir)/config/arm/arm-cores.def \
+  intl.h libfuncs.h $(PARAMS_H) $(OPTS_H) sched-int.h \
+  $(srcdir)/config/arm/arm-cores.def \
    $(srcdir)/config/arm/arm-arches.def $(srcdir)/config/arm/arm-fpus.def \
    $(srcdir)/config/arm/arm-protos.h \
    $(srcdir)/config/arm/arm_neon_builtins.def
diff --git a/gcc/haifa-sched.c b/gcc/haifa-sched.c

index 98cb9e4ba56e204655989f6ebae9c12c80eaec8f..795ff79e8985c736a9346a912e2c2359375bf329 100644 (file)
--- a/gcc/haifa-sched.c
+++ b/gcc/haifa-sched.c
@@ -841,6 +841,7 @@ add_delay_dependencies (rtx_insn *insn)
  /* Forward declarations.  */
  
  static int priority (rtx_insn *);
+static int autopref_rank_for_schedule (const rtx_insn *, const rtx_insn *);
  static int rank_for_schedule (const void *, const void *);
  static void swap_sort (rtx_insn **, int);
  static void queue_insn (rtx_insn *, int, const char *);
@@ -1184,6 +1185,12 @@ update_insn_after_change (rtx_insn *insn)
    INSN_COST (insn) = -1;
    /* Invalidate INSN_TICK, so it'll be recalculated.  */
    INSN_TICK (insn) = INVALID_TICK;
+
+  /* Invalidate autoprefetch data entry.  */
+  INSN_AUTOPREF_MULTIPASS_DATA (insn)[0].status
+    = AUTOPREF_MULTIPASS_DATA_UNINITIALIZED;
+  INSN_AUTOPREF_MULTIPASS_DATA (insn)[1].status
+    = AUTOPREF_MULTIPASS_DATA_UNINITIALIZED;
  }
  
  
@@ -2724,6 +2731,13 @@ rank_for_schedule (const void *x, const void *y)
    if (flag_sched_critical_path_heuristic && priority_val)
      return rfs_result (RFS_PRIORITY, priority_val, tmp, tmp2);
  
+  if (PARAM_VALUE (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH) >= 0)
+    {
+      int autopref = autopref_rank_for_schedule (tmp, tmp2);
+      if (autopref != 0)
+       return autopref;
+    }
+
    /* Prefer speculative insn with greater dependencies weakness.  */
    if (flag_sched_spec_insn_heuristic && spec_info)
      {
@@ -5500,6 +5514,241 @@ insn_finishes_cycle_p (rtx_insn *insn)
    return false;
  }
  
+/* Functions to model cache auto-prefetcher.
+
+   Some CPUs have a cache auto-prefetcher, which /seems/ to initiate
+   memory prefetches if it sees instructions with consecutive memory accesses
+   in the instruction stream.  Details of such hardware units are not published,
+   so we can only guess what exactly is going on there.
+   In the scheduler, we model an abstract auto-prefetcher.  If there are memory
+   insns in the ready list (or the queue) that have the same memory base, but
+   different offsets, then we delay the insns with larger offsets until insns
+   with smaller offsets get scheduled.  If PARAM_SCHED_AUTOPREF_QUEUE_DEPTH
+   is "1", then we look at the ready list; if it is N>1, then we also look
+   through N-1 queue entries.
+   If the param is N>=0, then rank_for_schedule will consider auto-prefetching
+   among its heuristics.
+   Param value of "-1" disables modelling of the auto-prefetcher.  */
+
+/* Initialize autoprefetcher model data for INSN.  */
+static void
+autopref_multipass_init (const rtx_insn *insn, int write)
+{
+  autopref_multipass_data_t data = &INSN_AUTOPREF_MULTIPASS_DATA (insn)[write];
+
+  gcc_assert (data->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED);
+  data->base = NULL_RTX;
+  data->offset = 0;
+  /* Set insn entry initialized, but not relevant for auto-prefetcher.  */
+  data->status = AUTOPREF_MULTIPASS_DATA_IRRELEVANT;
+
+  rtx set = single_set (insn);
+  if (set == NULL_RTX)
+    return;
+
+  rtx mem = write ? SET_DEST (set) : SET_SRC (set);
+  if (!MEM_P (mem))
+    return;
+
+  struct address_info info;
+  decompose_mem_address (&info, mem);
+
+  /* TODO: Currently only (base+const) addressing is supported.  */
+  if (info.base == NULL || !REG_P (*info.base)
+      || (info.disp != NULL && !CONST_INT_P (*info.disp)))
+    return;
+
+  /* This insn is relevant for auto-prefetcher.  */
+  data->base = *info.base;
+  data->offset = info.disp ? INTVAL (*info.disp) : 0;
+  data->status = AUTOPREF_MULTIPASS_DATA_NORMAL;
+}
+
+/* Helper function for rank_for_schedule sorting.  */
+static int
+autopref_rank_for_schedule (const rtx_insn *insn1, const rtx_insn *insn2)
+{
+  for (int write = 0; write < 2; ++write)
+    {
+      autopref_multipass_data_t data1
+       = &INSN_AUTOPREF_MULTIPASS_DATA (insn1)[write];
+      autopref_multipass_data_t data2
+       = &INSN_AUTOPREF_MULTIPASS_DATA (insn2)[write];
+
+      if (data1->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED)
+       autopref_multipass_init (insn1, write);
+      if (data1->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT)
+       continue;
+
+      if (data2->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED)
+       autopref_multipass_init (insn2, write);
+      if (data2->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT)
+       continue;
+
+      if (!rtx_equal_p (data1->base, data2->base))
+       continue;
+
+      return data1->offset - data2->offset;
+    }
+
+  return 0;
+}
+
+/* True if header of debug dump was printed.  */
+static bool autopref_multipass_dfa_lookahead_guard_started_dump_p;
+
+/* Helper for autopref_multipass_dfa_lookahead_guard.
+   Return "1" if INSN1 should be delayed in favor of INSN2.  */
+static int
+autopref_multipass_dfa_lookahead_guard_1 (const rtx_insn *insn1,
+                                         const rtx_insn *insn2, int write)
+{
+  autopref_multipass_data_t data1
+    = &INSN_AUTOPREF_MULTIPASS_DATA (insn1)[write];
+  autopref_multipass_data_t data2
+    = &INSN_AUTOPREF_MULTIPASS_DATA (insn2)[write];
+
+  if (data2->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED)
+    autopref_multipass_init (insn2, write);
+  if (data2->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT)
+    return 0;
+
+  if (rtx_equal_p (data1->base, data2->base)
+      && data1->offset > data2->offset)
+    {
+      if (sched_verbose >= 2)
+       {
+          if (!autopref_multipass_dfa_lookahead_guard_started_dump_p)
+           {
+             fprintf (sched_dump,
+                      ";;\t\tnot trying in max_issue due to autoprefetch "
+                      "model: ");
+             autopref_multipass_dfa_lookahead_guard_started_dump_p = true;
+           }
+
+         fprintf (sched_dump, " %d(%d)", INSN_UID (insn1), INSN_UID (insn2));
+       }
+
+      return 1;
+    }
+
+  return 0;
+}
+
+/* General note:
+
+   We could have also hooked autoprefetcher model into
+   first_cycle_multipass_backtrack / first_cycle_multipass_issue hooks
+   to enable intelligent selection of "[r1+0]=r2; [r1+4]=r3" on the same cycle
+   (e.g., once "[r1+0]=r2" is issued in max_issue(), "[r1+4]=r3" gets
+   unblocked).  We don't bother about this yet because target of interest
+   (ARM Cortex-A15) can issue only 1 memory operation per cycle.  */
+
+/* Implementation of first_cycle_multipass_dfa_lookahead_guard hook.
+   Return "1" if INSN1 should not be considered in max_issue due to
+   auto-prefetcher considerations.  */
+int
+autopref_multipass_dfa_lookahead_guard (rtx_insn *insn1, int ready_index)
+{
+  int r = 0;
+
+  if (PARAM_VALUE (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH) <= 0)
+    return 0;
+
+  if (sched_verbose >= 2 && ready_index == 0)
+    autopref_multipass_dfa_lookahead_guard_started_dump_p = false;
+
+  for (int write = 0; write < 2; ++write)
+    {
+      autopref_multipass_data_t data1
+       = &INSN_AUTOPREF_MULTIPASS_DATA (insn1)[write];
+
+      if (data1->status == AUTOPREF_MULTIPASS_DATA_UNINITIALIZED)
+       autopref_multipass_init (insn1, write);
+      if (data1->status == AUTOPREF_MULTIPASS_DATA_IRRELEVANT)
+       continue;
+
+      if (ready_index == 0
+         && data1->status == AUTOPREF_MULTIPASS_DATA_DONT_DELAY)
+       /* We allow only a single delay on privileged instructions.
+          Doing otherwise would cause an infinite loop.  */
+       {
+         if (sched_verbose >= 2)
+           {
+             if (!autopref_multipass_dfa_lookahead_guard_started_dump_p)
+               {
+                 fprintf (sched_dump,
+                          ";;\t\tnot trying in max_issue due to autoprefetch "
+                          "model: ");
+                 autopref_multipass_dfa_lookahead_guard_started_dump_p = true;
+               }
+
+             fprintf (sched_dump, " *%d*", INSN_UID (insn1));
+           }
+         continue;
+       }
+
+      for (int i2 = 0; i2 < ready.n_ready; ++i2)
+       {
+         rtx_insn *insn2 = get_ready_element (i2);
+         if (insn1 == insn2)
+           continue;
+         r = autopref_multipass_dfa_lookahead_guard_1 (insn1, insn2, write);
+         if (r)
+           {
+             if (ready_index == 0)
+               {
+                 r = -1;
+                 data1->status = AUTOPREF_MULTIPASS_DATA_DONT_DELAY;
+               }
+             goto finish;
+           }
+       }
+
+      if (PARAM_VALUE (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH) == 1)
+       continue;
+
+      /* Everything from the current queue slot should have been moved to
+        the ready list.  */
+      gcc_assert (insn_queue[NEXT_Q_AFTER (q_ptr, 0)] == NULL_RTX);
+
+      int n_stalls = PARAM_VALUE (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH) - 1;
+      if (n_stalls > max_insn_queue_index)
+       n_stalls = max_insn_queue_index;
+
+      for (int stalls = 1; stalls <= n_stalls; ++stalls)
+       {
+         for (rtx_insn_list *link = insn_queue[NEXT_Q_AFTER (q_ptr, stalls)];
+              link != NULL_RTX;
+              link = link->next ())
+           {
+             rtx_insn *insn2 = link->insn ();
+             r = autopref_multipass_dfa_lookahead_guard_1 (insn1, insn2,
+                                                           write);
+             if (r)
+               {
+                 /* Queue INSN1 until INSN2 can issue.  */
+                 r = -stalls;
+                 if (ready_index == 0)
+                   data1->status = AUTOPREF_MULTIPASS_DATA_DONT_DELAY;
+                 goto finish;
+               }
+           }
+       }
+    }
+
+    finish:
+  if (sched_verbose >= 2
+      && autopref_multipass_dfa_lookahead_guard_started_dump_p
+      && (ready_index == ready.n_ready - 1 || r < 0))
+    /* This does not /always/ trigger.  We don't output EOL if the last
+       insn is not recognized (INSN_CODE < 0) and lookahead_guard is not
+       called.  We can live with this.  */
+    fprintf (sched_dump, "\n");
+
+  return r;
+}
+
  /* Define type for target data used in multipass scheduling.  */
  #ifndef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DATA_T
  # define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DATA_T int
@@ -8710,6 +8959,10 @@ init_h_i_d (rtx_insn *insn)
        INSN_EXACT_TICK (insn) = INVALID_TICK;
        INTER_TICK (insn) = INVALID_TICK;
        TODO_SPEC (insn) = HARD_DEP;
+      INSN_AUTOPREF_MULTIPASS_DATA (insn)[0].status
+       = AUTOPREF_MULTIPASS_DATA_UNINITIALIZED;
+      INSN_AUTOPREF_MULTIPASS_DATA (insn)[1].status
+       = AUTOPREF_MULTIPASS_DATA_UNINITIALIZED;
      }
  }
  
diff --git a/gcc/params.def b/gcc/params.def

index 3f69ce0c6a492cd41d68e538dc3cdbc3cf4387db..192c1e021c2a910c6836c3fa60a0e12e8169e7b0 100644 (file)
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -668,6 +668,11 @@ DEFPARAM (PARAM_SCHED_MEM_TRUE_DEP_COST,
           "Minimal distance between possibly conflicting store and load",
           1, 0, 0)
  
+DEFPARAM (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
+         "sched-autopref-queue-depth",
+         "Hardware autoprefetcher scheduler model control flag.  Number of lookahead cycles the model looks into; at '0' only enable instruction sorting heuristic.  Disabled by default.",
+         -1, 0, 0)
+
  DEFPARAM(PARAM_MAX_LAST_VALUE_RTL,
          "max-last-value-rtl",
          "The maximum number of RTL nodes that can be recorded as combiner's last value",
diff --git a/gcc/sched-int.h b/gcc/sched-int.h

index 9392d04d5bf8e0c9c3c505eb5c74e8cfc94fa299..28e95ea97b6e7c5051e21ac915618327cbfe4f7e 100644 (file)
--- a/gcc/sched-int.h
+++ b/gcc/sched-int.h
@@ -793,6 +793,32 @@ struct reg_set_data
    struct reg_set_data *next_insn_set;
  };
  
+enum autopref_multipass_data_status {
+  /* Entry is irrelevant for auto-prefetcher.  */
+  AUTOPREF_MULTIPASS_DATA_IRRELEVANT = -2,
+  /* Entry is uninitialized.  */
+  AUTOPREF_MULTIPASS_DATA_UNINITIALIZED = -1,
+  /* Entry is relevant for auto-prefetcher and insn can be delayed
+     to allow another insn through.  */
+  AUTOPREF_MULTIPASS_DATA_NORMAL = 0,
+  /* Entry is relevant for auto-prefetcher, but insn should not be
+     delayed as that will break scheduling.  */
+  AUTOPREF_MULTIPASS_DATA_DONT_DELAY = 1
+};
+
+/* Data for modeling cache auto-prefetcher.  */
+struct autopref_multipass_data_
+{
+  /* Base part of memory address.  */
+  rtx base;
+  /* Memory offset.  */
+  int offset;
+  /* Entry status.  */
+  enum autopref_multipass_data_status status;
+};
+typedef struct autopref_multipass_data_ autopref_multipass_data_def;
+typedef autopref_multipass_data_def *autopref_multipass_data_t;
+
  struct _haifa_insn_data
  {
    /* We can't place 'struct _deps_list' into h_i_d instead of deps_list_t
@@ -893,6 +919,10 @@ struct _haifa_insn_data
  
    /* The deciding reason for INSN's place in the ready list.  */
    int last_rfs_win;
+
+  /* Two entries for cache auto-prefetcher model: one for mem reads,
+     and one for mem writes.  */
+  autopref_multipass_data_def autopref_multipass_data[2];
  };
  
  typedef struct _haifa_insn_data haifa_insn_data_def;
@@ -915,6 +945,8 @@ extern vec<haifa_insn_data_def> h_i_d;
    (HID (INSN)->reg_pressure_excess_cost_change)
  #define INSN_PRIORITY_STATUS(INSN) (HID (INSN)->priority_status)
  #define INSN_MODEL_INDEX(INSN) (HID (INSN)->model_index)
+#define INSN_AUTOPREF_MULTIPASS_DATA(INSN) \
+  (HID (INSN)->autopref_multipass_data)
  
  typedef struct _haifa_deps_insn_data haifa_deps_insn_data_def;
  typedef haifa_deps_insn_data_def *haifa_deps_insn_data_t;
@@ -1363,6 +1395,8 @@ extern int cycle_issued_insns;
  extern int issue_rate;
  extern int dfa_lookahead;
  
+extern int autopref_multipass_dfa_lookahead_guard (rtx_insn *, int);
+
  extern void ready_sort (struct ready_list *);
  extern rtx_insn *ready_element (struct ready_list *, int);
  extern rtx_insn **ready_lastpos (struct ready_list *);
author	Maxim Kuvyrkov <maxim.kuvyrkov@linaro.org>
	Sat, 17 Jan 2015 01:06:43 +0000 (01:06 +0000)
committer	Maxim Kuvyrkov <mkuvyrkov@gcc.gnu.org>
	Sat, 17 Jan 2015 01:06:43 +0000 (01:06 +0000)
gcc/ChangeLog		patch \| blob \| history
gcc/config/arm/arm-protos.h		patch \| blob \| history
gcc/config/arm/arm.c		patch \| blob \| history
gcc/config/arm/t-arm		patch \| blob \| history
gcc/haifa-sched.c		patch \| blob \| history
gcc/params.def		patch \| blob \| history
gcc/sched-int.h		patch \| blob \| history