lima/gpir: Always schedule complex2 and *_impl right after complex1

[mesa.git] / src / gallium / drivers / nouveau / nvc0 / nvc0_query_hw_metric.c
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c

index ed4ab9f74249e596b0069370a08ba04237226b08..c78b04600d0a8f7ebd85ce3a2efb2a5e8c2e4ed4 100644 (file)
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
@@ -24,22 +24,78 @@
  #include "nvc0/nvc0_query_hw_metric.h"
  #include "nvc0/nvc0_query_hw_sm.h"
  
-#define _Q(i,n,t) { NVC0_HW_METRIC_QUERY_##i, n, PIPE_DRIVER_QUERY_TYPE_##t }
-struct nvc0_hw_metric_cfg {
+#define _Q(i,n,t,d) { NVC0_HW_METRIC_QUERY_##i, n, PIPE_DRIVER_QUERY_TYPE_##t, d }
+static const struct nvc0_hw_metric_cfg {
     unsigned id;
     const char *name;
     enum pipe_driver_query_type type;
+   const char *desc;
  } nvc0_hw_metric_queries[] = {
-   _Q(ACHIEVED_OCCUPANCY,        "metric-achieved_occupancy",     UINT64      ),
-   _Q(BRANCH_EFFICIENCY,         "metric-branch_efficiency",      UINT64      ),
-   _Q(INST_ISSUED,               "metric-inst_issued",            UINT64      ),
-   _Q(INST_PER_WRAP,             "metric-inst_per_wrap",          UINT64      ),
-   _Q(INST_REPLAY_OVERHEAD,      "metric-inst_replay_overhead",   UINT64      ),
-   _Q(ISSUED_IPC,                "metric-issued_ipc",             UINT64      ),
-   _Q(ISSUE_SLOTS,               "metric-issue_slots",            UINT64      ),
-   _Q(ISSUE_SLOT_UTILIZATION,    "metric-issue_slot_utilization", UINT64      ),
-   _Q(IPC,                       "metric-ipc",                    UINT64      ),
-   _Q(SHARED_REPLAY_OVERHEAD,    "metric-shared_replay_overhead", UINT64      ),
+   _Q(ACHIEVED_OCCUPANCY,
+      "metric-achieved_occupancy",
+      PERCENTAGE,
+      "Ratio of the average active warps per active cycle to the maximum "
+      "number of warps supported on a multiprocessor"),
+
+   _Q(BRANCH_EFFICIENCY,
+      "metric-branch_efficiency",
+      PERCENTAGE,
+      "Ratio of non-divergent branches to total branches"),
+
+   _Q(INST_ISSUED,
+      "metric-inst_issued",
+      UINT64,
+      "The number of instructions issued"),
+
+   _Q(INST_PER_WRAP,
+      "metric-inst_per_wrap",
+      UINT64,
+      "Average number of instructions executed by each warp"),
+
+   _Q(INST_REPLAY_OVERHEAD,
+      "metric-inst_replay_overhead",
+      UINT64,
+      "Average number of replays for each instruction executed"),
+
+   _Q(ISSUED_IPC,
+      "metric-issued_ipc",
+      UINT64,
+      "Instructions issued per cycle"),
+
+   _Q(ISSUE_SLOTS,
+      "metric-issue_slots",
+      UINT64,
+      "The number of issue slots used"),
+
+   _Q(ISSUE_SLOT_UTILIZATION,
+      "metric-issue_slot_utilization",
+      PERCENTAGE,
+      "Percentage of issue slots that issued at least one instruction, "
+      "averaged across all cycles"),
+
+   _Q(IPC,
+      "metric-ipc",
+      UINT64,
+      "Instructions executed per cycle"),
+
+   _Q(SHARED_REPLAY_OVERHEAD,
+      "metric-shared_replay_overhead",
+      UINT64,
+      "Average number of replays due to shared memory conflicts for each "
+      "instruction executed"),
+
+   _Q(WARP_EXECUTION_EFFICIENCY,
+      "metric-warp_execution_efficiency",
+      PERCENTAGE,
+      "Ratio of the average active threads per warp to the maximum number of "
+      "threads per warp supported on a multiprocessor"),
+
+   _Q(WARP_NONPRED_EXECUTION_EFFICIENCY,
+      "metric-warp_nonpred_execution_efficiency",
+      PERCENTAGE,
+      "Ratio of the average active threads per warp executing non-predicated "
+      "instructions to the maximum number of threads per warp supported on a "
+      "multiprocessor"),
  };
  
  #undef _Q
@@ -135,9 +191,9 @@ static const struct nvc0_hw_metric_query_cfg *sm20_hw_metric_queries[] =
     &sm20_branch_efficiency,
     &sm20_inst_per_wrap,
     &sm20_inst_replay_overhead,
+   &sm20_ipc,
     &sm20_issued_ipc,
     &sm20_issue_slot_utilization,
-   &sm20_ipc,
  };
  
  /* ==== Compute capability 2.1 (GF108+ except GF110) ==== */
@@ -206,31 +262,13 @@ static const struct nvc0_hw_metric_query_cfg *sm21_hw_metric_queries[] =
     &sm21_inst_issued,
     &sm20_inst_per_wrap,
     &sm21_inst_replay_overhead,
+   &sm20_ipc,
     &sm21_issued_ipc,
     &sm21_issue_slots,
     &sm21_issue_slot_utilization,
-   &sm20_ipc,
  };
  
  /* ==== Compute capability 3.0 (GK104/GK106/GK107) ==== */
-static const struct nvc0_hw_metric_query_cfg
-sm30_achieved_occupancy =
-{
-   .type        = NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY,
-   .queries[0]  = _SM(ACTIVE_WARPS),
-   .queries[1]  = _SM(ACTIVE_CYCLES),
-   .num_queries = 2,
-};
-
-static const struct nvc0_hw_metric_query_cfg
-sm30_branch_efficiency =
-{
-   .type        = NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY,
-   .queries[0]  = _SM(BRANCH),
-   .queries[1]  = _SM(DIVERGENT_BRANCH),
-   .num_queries = 2,
-};
-
  static const struct nvc0_hw_metric_query_cfg
  sm30_inst_issued =
  {
@@ -240,15 +278,6 @@ sm30_inst_issued =
     .num_queries = 2,
  };
  
-static const struct nvc0_hw_metric_query_cfg
-sm30_inst_per_wrap =
-{
-   .type        = NVC0_HW_METRIC_QUERY_INST_PER_WRAP,
-   .queries[0]  = _SM(INST_EXECUTED),
-   .queries[1]  = _SM(WARPS_LAUNCHED),
-   .num_queries = 2,
-};
-
  static const struct nvc0_hw_metric_query_cfg
  sm30_inst_replay_overhead =
  {
@@ -288,15 +317,6 @@ sm30_issue_slot_utilization =
     .num_queries = 3,
  };
  
-static const struct nvc0_hw_metric_query_cfg
-sm30_ipc =
-{
-   .type        = NVC0_HW_METRIC_QUERY_IPC,
-   .queries[0]  = _SM(INST_EXECUTED),
-   .queries[1]  = _SM(ACTIVE_CYCLES),
-   .num_queries = 2,
-};
-
  static const struct nvc0_hw_metric_query_cfg
  sm30_shared_replay_overhead =
  {
@@ -307,32 +327,69 @@ sm30_shared_replay_overhead =
     .num_queries = 3,
  };
  
+static const struct nvc0_hw_metric_query_cfg
+sm30_warp_execution_efficiency =
+{
+   .type        = NVC0_HW_METRIC_QUERY_WARP_EXECUTION_EFFICIENCY,
+   .queries[0]  = _SM(INST_EXECUTED),
+   .queries[1]  = _SM(TH_INST_EXECUTED),
+   .num_queries = 2,
+};
+
  static const struct nvc0_hw_metric_query_cfg *sm30_hw_metric_queries[] =
  {
-   &sm30_achieved_occupancy,
-   &sm30_branch_efficiency,
+   &sm20_achieved_occupancy,
+   &sm20_branch_efficiency,
     &sm30_inst_issued,
-   &sm30_inst_per_wrap,
+   &sm20_inst_per_wrap,
     &sm30_inst_replay_overhead,
+   &sm20_ipc,
     &sm30_issued_ipc,
     &sm30_issue_slots,
     &sm30_issue_slot_utilization,
-   &sm30_ipc,
     &sm30_shared_replay_overhead,
+   &sm30_warp_execution_efficiency,
+};
+
+/* ==== Compute capability 3.5 (GK110/GK208) ==== */
+static const struct nvc0_hw_metric_query_cfg
+sm35_warp_nonpred_execution_efficiency =
+{
+   .type        = NVC0_HW_METRIC_QUERY_WARP_NONPRED_EXECUTION_EFFICIENCY,
+   .queries[0]  = _SM(INST_EXECUTED),
+   .queries[1]  = _SM(NOT_PRED_OFF_INST_EXECUTED),
+   .num_queries = 2,
  };
  
-/* ==== Compute capability 3.5 (GK110) ==== */
  static const struct nvc0_hw_metric_query_cfg *sm35_hw_metric_queries[] =
  {
-   &sm30_achieved_occupancy,
+   &sm20_achieved_occupancy,
     &sm30_inst_issued,
-   &sm30_inst_per_wrap,
+   &sm20_inst_per_wrap,
     &sm30_inst_replay_overhead,
+   &sm20_ipc,
     &sm30_issued_ipc,
-   &sm30_inst_issued,
+   &sm30_issue_slots,
     &sm30_issue_slot_utilization,
-   &sm30_ipc,
     &sm30_shared_replay_overhead,
+   &sm30_warp_execution_efficiency,
+   &sm35_warp_nonpred_execution_efficiency,
+};
+
+/* ==== Compute capability 5.0 (GM107/GM108) ==== */
+static const struct nvc0_hw_metric_query_cfg *sm50_hw_metric_queries[] =
+{
+   &sm20_achieved_occupancy,
+   &sm20_branch_efficiency,
+   &sm30_inst_issued,
+   &sm20_inst_per_wrap,
+   &sm30_inst_replay_overhead,
+   &sm20_ipc,
+   &sm30_issued_ipc,
+   &sm30_issue_slots,
+   &sm30_issue_slot_utilization,
+   &sm30_warp_execution_efficiency,
+   &sm35_warp_nonpred_execution_efficiency,
  };
  
  #undef _SM
@@ -343,6 +400,9 @@ nvc0_hw_metric_get_queries(struct nvc0_screen *screen)
     struct nouveau_device *dev = screen->base.device;
  
     switch (screen->base.class_3d) {
+   case GM200_3D_CLASS:
+   case GM107_3D_CLASS:
+      return sm50_hw_metric_queries;
     case NVF0_3D_CLASS:
        return sm35_hw_metric_queries;
     case NVE4_3D_CLASS:
@@ -362,6 +422,9 @@ nvc0_hw_metric_get_num_queries(struct nvc0_screen *screen)
     struct nouveau_device *dev = screen->base.device;
  
     switch (screen->base.class_3d) {
+   case GM200_3D_CLASS:
+   case GM107_3D_CLASS:
+      return ARRAY_SIZE(sm50_hw_metric_queries);
     case NVF0_3D_CLASS:
        return ARRAY_SIZE(sm35_hw_metric_queries);
     case NVE4_3D_CLASS:
@@ -407,11 +470,11 @@ nvc0_hw_metric_destroy_query(struct nvc0_context *nvc0,
     FREE(hmq);
  }
  
-static boolean
+static bool
  nvc0_hw_metric_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
  {
     struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq);
-   boolean ret = false;
+   bool ret = false;
     unsigned i;
  
     for (i = 0; i < hmq->num_queries; i++) {
@@ -437,9 +500,9 @@ sm20_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
  {
     switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) {
     case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY:
-      /* (active_warps / active_cycles) / max. number of warps on a MP */
+      /* ((active_warps / active_cycles) / max. number of warps on a MP) * 100 */
        if (res64[1])
-         return (res64[0] / (double)res64[1]) / 48;
+         return ((res64[0] / (double)res64[1]) / 48) * 100;
        break;
     case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY:
        /* (branch / (branch + divergent_branch)) * 100 */
@@ -530,9 +593,9 @@ sm30_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
  {
     switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) {
     case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY:
-      /* (active_warps / active_cycles) / max. number of warps on a MP */
+      /* ((active_warps / active_cycles) / max. number of warps on a MP) * 100 */
        if (res64[1])
-         return (res64[0] / (double)res64[1]) / 64;
+         return ((res64[0] / (double)res64[1]) / 64) * 100;
        break;
     case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY:
        return sm20_hw_metric_calc_result(hq, res64);
@@ -566,6 +629,12 @@ sm30_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
        if (res64[2])
           return (res64[0] + res64[1]) / (double)res64[2];
        break;
+   case NVC0_HW_METRIC_QUERY_WARP_EXECUTION_EFFICIENCY:
+      /* thread_inst_executed / (inst_executed * max. number of threads per
+       * warp) * 100 */
+      if (res64[0])
+         return (res64[1] / ((double)res64[0] * 32)) * 100;
+      break;
     default:
        debug_printf("invalid metric type: %d\n",
                     hq->base.type - NVC0_HW_METRIC_QUERY(0));
@@ -574,9 +643,25 @@ sm30_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
     return 0;
  }
  
-static boolean
+static uint64_t
+sm35_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
+{
+   switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) {
+   case NVC0_HW_METRIC_QUERY_WARP_NONPRED_EXECUTION_EFFICIENCY:
+      /* not_predicated_off_thread_inst_executed / (inst_executed * max. number
+       * of threads per warp) * 100 */
+      if (res64[0])
+         return (res64[1] / ((double)res64[0] * 32)) * 100;
+      break;
+   default:
+      return sm30_hw_metric_calc_result(hq, res64);
+   }
+   return 0;
+}
+
+static bool
  nvc0_hw_metric_get_query_result(struct nvc0_context *nvc0,
-                                struct nvc0_hw_query *hq, boolean wait,
+                                struct nvc0_hw_query *hq, bool wait,
                                  union pipe_query_result *result)
  {
     struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq);
@@ -585,7 +670,7 @@ nvc0_hw_metric_get_query_result(struct nvc0_context *nvc0,
     union pipe_query_result results[8] = {};
     uint64_t res64[8] = {};
     uint64_t value = 0;
-   boolean ret = false;
+   bool ret = false;
     unsigned i;
  
     for (i = 0; i < hmq->num_queries; i++) {
@@ -597,7 +682,11 @@ nvc0_hw_metric_get_query_result(struct nvc0_context *nvc0,
     }
  
     switch (screen->base.class_3d) {
+   case GM200_3D_CLASS:
+   case GM107_3D_CLASS:
     case NVF0_3D_CLASS:
+      value = sm35_hw_metric_calc_result(hq, res64);
+      break;
     case NVE4_3D_CLASS:
        value = sm30_hw_metric_calc_result(hq, res64);
        break;
@@ -669,7 +758,7 @@ nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
  
     if (id < count) {
        if (screen->compute) {
-         if (screen->base.class_3d <= NVF0_3D_CLASS) {
+         if (screen->base.class_3d <= GM200_3D_CLASS) {
              const struct nvc0_hw_metric_query_cfg **queries =
                 nvc0_hw_metric_get_queries(screen);
              const struct nvc0_hw_metric_cfg *cfg =