#include "nvc0/nvc0_query_hw_metric.h"
#include "nvc0/nvc0_query_hw_sm.h"
-/* === PERFORMANCE MONITORING METRICS for NVC0:NVE4 === */
-static const char *nvc0_hw_metric_names[] =
-{
- "metric-achieved_occupancy",
- "metric-branch_efficiency",
- "metric-inst_issued",
- "metric-inst_per_wrap",
- "metric-inst_replay_overhead",
- "metric-issued_ipc",
- "metric-issue_slots",
- "metric-issue_slot_utilization",
- "metric-ipc",
+#define _Q(i,n,t,d) { NVC0_HW_METRIC_QUERY_##i, n, PIPE_DRIVER_QUERY_TYPE_##t, d }
+static const struct nvc0_hw_metric_cfg { /* static description of one metric; entries are built with _Q() */
+ unsigned id; /* NVC0_HW_METRIC_QUERY_* value; the key nvc0_hw_metric_get_cfg() matches on */
+ const char *name; /* name reported through pipe_driver_query_info::name */
+ enum pipe_driver_query_type type; /* result type reported through pipe_driver_query_info::type */
+ const char *desc; /* human-readable description; not read by any code visible in this chunk */
+} nvc0_hw_metric_queries[] = {
+ _Q(ACHIEVED_OCCUPANCY,
+ "metric-achieved_occupancy",
+ PERCENTAGE,
+ "Ratio of the average active warps per active cycle to the maximum number "
+ "of warps supported on a multiprocessor"),
+
+ _Q(BRANCH_EFFICIENCY,
+ "metric-branch_efficiency",
+ PERCENTAGE,
+ "Ratio of non-divergent branches to total branches"),
+
+ _Q(INST_ISSUED,
+ "metric-inst_issued",
+ UINT64,
+ "The number of instructions issued"),
+
+ _Q(INST_PER_WRAP,
+ "metric-inst_per_wrap",
+ UINT64,
+ "Average number of instructions executed by each warp"),
+
+ _Q(INST_REPLAY_OVERHEAD,
+ "metric-inst_replay_overhead",
+ UINT64,
+ "Average number of replays for each instruction executed"),
+
+ _Q(ISSUED_IPC,
+ "metric-issued_ipc",
+ UINT64,
+ "Instructions issued per cycle"),
+
+ _Q(ISSUE_SLOTS,
+ "metric-issue_slots",
+ UINT64,
+ "The number of issue slots used"),
+
+ _Q(ISSUE_SLOT_UTILIZATION,
+ "metric-issue_slot_utilization",
+ PERCENTAGE,
+ "Percentage of issue slots that issued at least one instruction, averaged "
+ "across all cycles"),
+
+ _Q(IPC,
+ "metric-ipc",
+ UINT64,
+ "Instructions executed per cycle"),
+
+ _Q(SHARED_REPLAY_OVERHEAD,
+ "metric-shared_replay_overhead",
+ UINT64,
+ "Average number of replays due to shared memory conflicts for each "
+ "instruction executed"),
};
+#undef _Q
+
+/* Look up the static description (name, result type, description text) of a
+ * metric by its NVC0_HW_METRIC_QUERY_* id.
+ *
+ * Returns a pointer into nvc0_hw_metric_queries. An id that is not in the
+ * table trips assert(0); when asserts are compiled out NULL is returned.
+ */
+static inline const struct nvc0_hw_metric_cfg *
+nvc0_hw_metric_get_cfg(unsigned metric_id)
+{
+   const struct nvc0_hw_metric_cfg *cfg = nvc0_hw_metric_queries;
+   const struct nvc0_hw_metric_cfg *const end =
+      nvc0_hw_metric_queries + ARRAY_SIZE(nvc0_hw_metric_queries);
+
+   /* Entries are matched on their id field, not on their table position. */
+   for (; cfg != end; cfg++) {
+      if (cfg->id == metric_id)
+         return cfg;
+   }
+
+   /* Every id handed to this helper is expected to have a table entry. */
+   assert(0);
+   return NULL;
+}
+
struct nvc0_hw_metric_query_cfg {
+ unsigned type;
uint32_t queries[8];
uint32_t num_queries;
};
#define _SM(n) NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_ ##n)
-#define _M(n, c) [NVC0_HW_METRIC_QUERY_##n] = c
/* ==== Compute capability 2.0 (GF100/GF110) ==== */
static const struct nvc0_hw_metric_query_cfg
sm20_achieved_occupancy =
{
+ .type = NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY,
.queries[0] = _SM(ACTIVE_WARPS),
.queries[1] = _SM(ACTIVE_CYCLES),
.num_queries = 2,
static const struct nvc0_hw_metric_query_cfg
sm20_branch_efficiency =
{
+ .type = NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY,
.queries[0] = _SM(BRANCH),
.queries[1] = _SM(DIVERGENT_BRANCH),
.num_queries = 2,
static const struct nvc0_hw_metric_query_cfg
sm20_inst_per_wrap =
{
+ .type = NVC0_HW_METRIC_QUERY_INST_PER_WRAP,
.queries[0] = _SM(INST_EXECUTED),
.queries[1] = _SM(WARPS_LAUNCHED),
.num_queries = 2,
static const struct nvc0_hw_metric_query_cfg
sm20_inst_replay_overhead =
{
+ .type = NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD,
.queries[0] = _SM(INST_ISSUED),
.queries[1] = _SM(INST_EXECUTED),
.num_queries = 2,
static const struct nvc0_hw_metric_query_cfg
sm20_issued_ipc =
{
+ .type = NVC0_HW_METRIC_QUERY_ISSUED_IPC,
+ .queries[0] = _SM(INST_ISSUED),
+ .queries[1] = _SM(ACTIVE_CYCLES),
+ .num_queries = 2,
+};
+
+static const struct nvc0_hw_metric_query_cfg
+sm20_issue_slot_utilization =
+{
+ .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION,
.queries[0] = _SM(INST_ISSUED),
.queries[1] = _SM(ACTIVE_CYCLES),
.num_queries = 2,
static const struct nvc0_hw_metric_query_cfg
sm20_ipc =
{
+ .type = NVC0_HW_METRIC_QUERY_IPC,
.queries[0] = _SM(INST_EXECUTED),
.queries[1] = _SM(ACTIVE_CYCLES),
.num_queries = 2,
static const struct nvc0_hw_metric_query_cfg *sm20_hw_metric_queries[] =
{
- _M(ACHIEVED_OCCUPANCY, &sm20_achieved_occupancy),
- _M(BRANCH_EFFICIENCY, &sm20_branch_efficiency),
- _M(INST_ISSUED, NULL),
- _M(INST_PER_WRAP, &sm20_inst_per_wrap),
- _M(INST_REPLAY_OVERHEAD, &sm20_inst_replay_overhead),
- _M(ISSUED_IPC, &sm20_issued_ipc),
- _M(ISSUE_SLOTS, NULL),
- _M(ISSUE_SLOT_UTILIZATION, &sm20_issued_ipc),
- _M(IPC, &sm20_ipc),
+ &sm20_achieved_occupancy,
+ &sm20_branch_efficiency,
+ &sm20_inst_per_wrap,
+ &sm20_inst_replay_overhead,
+ &sm20_issued_ipc,
+ &sm20_issue_slot_utilization,
+ &sm20_ipc,
};
/* ==== Compute capability 2.1 (GF108+ except GF110) ==== */
static const struct nvc0_hw_metric_query_cfg
sm21_inst_issued =
{
+ .type = NVC0_HW_METRIC_QUERY_INST_ISSUED,
.queries[0] = _SM(INST_ISSUED1_0),
.queries[1] = _SM(INST_ISSUED1_1),
.queries[2] = _SM(INST_ISSUED2_0),
static const struct nvc0_hw_metric_query_cfg
sm21_inst_replay_overhead =
{
+ .type = NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD,
.queries[0] = _SM(INST_ISSUED1_0),
.queries[1] = _SM(INST_ISSUED1_1),
.queries[2] = _SM(INST_ISSUED2_0),
static const struct nvc0_hw_metric_query_cfg
sm21_issued_ipc =
{
+ .type = NVC0_HW_METRIC_QUERY_ISSUED_IPC,
+ .queries[0] = _SM(INST_ISSUED1_0),
+ .queries[1] = _SM(INST_ISSUED1_1),
+ .queries[2] = _SM(INST_ISSUED2_0),
+ .queries[3] = _SM(INST_ISSUED2_1),
+ .queries[4] = _SM(ACTIVE_CYCLES),
+ .num_queries = 5,
+};
+
+static const struct nvc0_hw_metric_query_cfg
+sm21_issue_slots =
+{
+ .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOTS,
+ .queries[0] = _SM(INST_ISSUED1_0),
+ .queries[1] = _SM(INST_ISSUED1_1),
+ .queries[2] = _SM(INST_ISSUED2_0),
+ .queries[3] = _SM(INST_ISSUED2_1),
+ .num_queries = 4,
+};
+
+static const struct nvc0_hw_metric_query_cfg
+sm21_issue_slot_utilization =
+{
+ .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION,
.queries[0] = _SM(INST_ISSUED1_0),
.queries[1] = _SM(INST_ISSUED1_1),
.queries[2] = _SM(INST_ISSUED2_0),
static const struct nvc0_hw_metric_query_cfg *sm21_hw_metric_queries[] =
{
- _M(ACHIEVED_OCCUPANCY, &sm20_achieved_occupancy),
- _M(BRANCH_EFFICIENCY, &sm20_branch_efficiency),
- _M(INST_ISSUED, &sm21_inst_issued),
- _M(INST_PER_WRAP, &sm20_inst_per_wrap),
- _M(INST_REPLAY_OVERHEAD, &sm21_inst_replay_overhead),
- _M(ISSUED_IPC, &sm21_issued_ipc),
- _M(ISSUE_SLOTS, &sm21_inst_issued),
- _M(ISSUE_SLOT_UTILIZATION, &sm21_issued_ipc),
- _M(IPC, &sm20_ipc),
+ &sm20_achieved_occupancy,
+ &sm20_branch_efficiency,
+ &sm21_inst_issued,
+ &sm20_inst_per_wrap,
+ &sm21_inst_replay_overhead,
+ &sm21_issued_ipc,
+ &sm21_issue_slots,
+ &sm21_issue_slot_utilization,
+ &sm20_ipc,
+};
+
+/* ==== Compute capability 3.0 (GK104/GK106/GK107) ==== */
+static const struct nvc0_hw_metric_query_cfg
+sm30_achieved_occupancy =
+{
+ .type = NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY,
+ .queries[0] = _SM(ACTIVE_WARPS),
+ .queries[1] = _SM(ACTIVE_CYCLES),
+ .num_queries = 2,
+};
+
+static const struct nvc0_hw_metric_query_cfg
+sm30_branch_efficiency =
+{
+ .type = NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY,
+ .queries[0] = _SM(BRANCH),
+ .queries[1] = _SM(DIVERGENT_BRANCH),
+ .num_queries = 2,
+};
+
+static const struct nvc0_hw_metric_query_cfg
+sm30_inst_issued =
+{
+ .type = NVC0_HW_METRIC_QUERY_INST_ISSUED,
+ .queries[0] = _SM(INST_ISSUED1),
+ .queries[1] = _SM(INST_ISSUED2),
+ .num_queries = 2,
+};
+
+static const struct nvc0_hw_metric_query_cfg
+sm30_inst_per_wrap =
+{
+ .type = NVC0_HW_METRIC_QUERY_INST_PER_WRAP,
+ .queries[0] = _SM(INST_EXECUTED),
+ .queries[1] = _SM(WARPS_LAUNCHED),
+ .num_queries = 2,
+};
+
+static const struct nvc0_hw_metric_query_cfg
+sm30_inst_replay_overhead =
+{
+ .type = NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD,
+ .queries[0] = _SM(INST_ISSUED1),
+ .queries[1] = _SM(INST_ISSUED2),
+ .queries[2] = _SM(INST_EXECUTED),
+ .num_queries = 3,
+};
+
+static const struct nvc0_hw_metric_query_cfg
+sm30_issued_ipc =
+{
+ .type = NVC0_HW_METRIC_QUERY_ISSUED_IPC,
+ .queries[0] = _SM(INST_ISSUED1),
+ .queries[1] = _SM(INST_ISSUED2),
+ .queries[2] = _SM(ACTIVE_CYCLES),
+ .num_queries = 3,
+};
+
+static const struct nvc0_hw_metric_query_cfg
+sm30_issue_slots =
+{
+ .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOTS,
+ .queries[0] = _SM(INST_ISSUED1),
+ .queries[1] = _SM(INST_ISSUED2),
+ .num_queries = 2,
+};
+
+static const struct nvc0_hw_metric_query_cfg
+sm30_issue_slot_utilization =
+{
+ .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION,
+ .queries[0] = _SM(INST_ISSUED1),
+ .queries[1] = _SM(INST_ISSUED2),
+ .queries[2] = _SM(ACTIVE_CYCLES),
+ .num_queries = 3,
+};
+
+static const struct nvc0_hw_metric_query_cfg
+sm30_ipc =
+{
+ .type = NVC0_HW_METRIC_QUERY_IPC,
+ .queries[0] = _SM(INST_EXECUTED),
+ .queries[1] = _SM(ACTIVE_CYCLES),
+ .num_queries = 2,
+};
+
+static const struct nvc0_hw_metric_query_cfg
+sm30_shared_replay_overhead =
+{
+ .type = NVC0_HW_METRIC_QUERY_SHARED_REPLAY_OVERHEAD,
+ .queries[0] = _SM(SHARED_LD_REPLAY),
+ .queries[1] = _SM(SHARED_ST_REPLAY),
+ .queries[2] = _SM(INST_EXECUTED),
+ .num_queries = 3,
+};
+
+static const struct nvc0_hw_metric_query_cfg *sm30_hw_metric_queries[] =
+{
+ &sm30_achieved_occupancy,
+ &sm30_branch_efficiency,
+ &sm30_inst_issued,
+ &sm30_inst_per_wrap,
+ &sm30_inst_replay_overhead,
+ &sm30_issued_ipc,
+ &sm30_issue_slots,
+ &sm30_issue_slot_utilization,
+ &sm30_ipc,
+ &sm30_shared_replay_overhead,
+};
+
+/* ==== Compute capability 3.5 (GK110) ==== */
+/* Metric configurations exposed on SM35; all of them reuse the SM30
+ * definitions (branch_efficiency is not listed). Lookups match on each
+ * entry's .type, so every metric must appear exactly once: the slot after
+ * issued_ipc is issue_slots, mirroring the order of sm30_hw_metric_queries.
+ */
+static const struct nvc0_hw_metric_query_cfg *sm35_hw_metric_queries[] =
+{
+   &sm30_achieved_occupancy,
+   &sm30_inst_issued,
+   &sm30_inst_per_wrap,
+   &sm30_inst_replay_overhead,
+   &sm30_issued_ipc,
+   &sm30_issue_slots,
+   &sm30_issue_slot_utilization,
+   &sm30_ipc,
+   &sm30_shared_replay_overhead,
};
#undef _SM
-#undef _M
static inline const struct nvc0_hw_metric_query_cfg **
nvc0_hw_metric_get_queries(struct nvc0_screen *screen)
{
struct nouveau_device *dev = screen->base.device;
- if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
- return sm20_hw_metric_queries;
- return sm21_hw_metric_queries;
+ switch (screen->base.class_3d) {
+ case NVF0_3D_CLASS:
+ return sm35_hw_metric_queries;
+ case NVE4_3D_CLASS:
+ return sm30_hw_metric_queries;
+ default:
+ if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
+ return sm20_hw_metric_queries;
+ return sm21_hw_metric_queries;
+ }
+ assert(0);
+ return NULL;
+}
+
+/* Return how many entries the metric table selected for this screen holds.
+ *
+ * The table is chosen from screen->base.class_3d first (NVF0 -> SM35,
+ * NVE4 -> SM30); every other class is split by chipset id, where 0xc0 and
+ * 0xc8 use the SM20 table and all remaining chipsets use the SM21 table.
+ */
+unsigned
+nvc0_hw_metric_get_num_queries(struct nvc0_screen *screen)
+{
+   const struct nouveau_device *dev = screen->base.device;
+
+   if (screen->base.class_3d == NVF0_3D_CLASS)
+      return ARRAY_SIZE(sm35_hw_metric_queries);
+
+   if (screen->base.class_3d == NVE4_3D_CLASS)
+      return ARRAY_SIZE(sm30_hw_metric_queries);
+
+   return (dev->chipset == 0xc0 || dev->chipset == 0xc8)
+      ? ARRAY_SIZE(sm20_hw_metric_queries)
+      : ARRAY_SIZE(sm21_hw_metric_queries);
}
static const struct nvc0_hw_metric_query_cfg *
-nvc0_hw_metric_query_get_cfg(struct nvc0_context *nvc0,
- struct nvc0_hw_query *hq)
+nvc0_hw_metric_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
{
const struct nvc0_hw_metric_query_cfg **queries;
struct nvc0_screen *screen = nvc0->screen;
struct nvc0_query *q = &hq->base;
+ unsigned num_queries;
+ unsigned i;
+ num_queries = nvc0_hw_metric_get_num_queries(screen);
queries = nvc0_hw_metric_get_queries(screen);
- return queries[q->type - NVC0_HW_METRIC_QUERY(0)];
+
+ for (i = 0; i < num_queries; i++) {
+ if (NVC0_HW_METRIC_QUERY(queries[i]->type) == q->type)
+ return queries[i];
+ }
+ assert(0);
+ return NULL;
}
static void
unsigned i;
for (i = 0; i < hmq->num_queries; i++)
- hmq->queries[i]->funcs->destroy_query(nvc0, hmq->queries[i]);
+ if (hmq->queries[i]->funcs->destroy_query)
+ hmq->queries[i]->funcs->destroy_query(nvc0, hmq->queries[i]);
FREE(hmq);
}
{
switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) {
case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY:
- /* (active_warps / active_cycles) / max. number of warps on a MP */
+ /* ((active_warps / active_cycles) / max. number of warps on a MP) * 100 */
if (res64[1])
- return (res64[0] / (double)res64[1]) / 48;
+ return ((res64[0] / (double)res64[1]) / 48) * 100;
break;
case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY:
/* (branch / (branch + divergent_branch)) * 100 */
return 0;
}
+/* Turn the raw counter values of a metric query into the metric's value,
+ * using the SM30 formulas.
+ *
+ * hq     metric query; hq->base.type - NVC0_HW_METRIC_QUERY(0) selects the
+ *        formula.
+ * res64  raw counter values; the indices used below line up with the
+ *        .queries[] order of the sm30_* configurations (presumably the
+ *        caller fills them in that order -- confirm against the caller).
+ *
+ * Returns the computed value converted to uint64_t (fractional parts are
+ * dropped by that conversion), or 0 when the divisor counter is zero or the
+ * metric type is not handled. Metrics whose formula does not differ from
+ * SM20 are forwarded to sm20_hw_metric_calc_result().
+ */
+static uint64_t
+sm30_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
+{
+   switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) {
+   case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY:
+      /* ((active_warps / active_cycles) / max. number of warps on a MP) * 100 */
+      if (res64[1])
+         return ((res64[0] / (double)res64[1]) / 64) * 100;
+      break;
+   case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY:
+      return sm20_hw_metric_calc_result(hq, res64);
+   case NVC0_HW_METRIC_QUERY_INST_ISSUED:
+      /* inst_issued1 + inst_issued2 * 2 */
+      return res64[0] + res64[1] * 2;
+   case NVC0_HW_METRIC_QUERY_INST_PER_WRAP:
+      return sm20_hw_metric_calc_result(hq, res64);
+   case NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD:
+      /* (metric-inst_issued - inst_executed) / inst_executed */
+      if (res64[2])
+         return (((res64[0] + res64[1] * 2) - res64[2]) / (double)res64[2]);
+      break;
+   case NVC0_HW_METRIC_QUERY_ISSUED_IPC:
+      /* metric-inst_issued / active_cycles */
+      if (res64[2])
+         return (res64[0] + res64[1] * 2) / (double)res64[2];
+      break;
+   case NVC0_HW_METRIC_QUERY_ISSUE_SLOTS:
+      /* inst_issued1 + inst_issued2 */
+      return res64[0] + res64[1];
+   case NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION:
+      /* ((metric-issue_slots / 2) / active_cycles) * 100
+       *
+       * Halve in floating point: an integer "/ 2" would drop the odd slot
+       * before the ratio is taken and under-report the utilization.
+       */
+      if (res64[2])
+         return (((res64[0] + res64[1]) / 2.0) / (double)res64[2]) * 100;
+      break;
+   case NVC0_HW_METRIC_QUERY_IPC:
+      return sm20_hw_metric_calc_result(hq, res64);
+   case NVC0_HW_METRIC_QUERY_SHARED_REPLAY_OVERHEAD:
+      /* (shared_load_replay + shared_store_replay) / inst_executed */
+      if (res64[2])
+         return (res64[0] + res64[1]) / (double)res64[2];
+      break;
+   default:
+      debug_printf("invalid metric type: %d\n",
+                   hq->base.type - NVC0_HW_METRIC_QUERY(0));
+      break;
+   }
+   /* Divisor counter was zero, or the metric type is unknown. */
+   return 0;
+}
+
static boolean
nvc0_hw_metric_get_query_result(struct nvc0_context *nvc0,
struct nvc0_hw_query *hq, boolean wait,
res64[i] = *(uint64_t *)&results[i];
}
- if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
- value = sm20_hw_metric_calc_result(hq, res64);
- else
- value = sm21_hw_metric_calc_result(hq, res64);
+ switch (screen->base.class_3d) {
+ case NVF0_3D_CLASS:
+ case NVE4_3D_CLASS:
+ value = sm30_hw_metric_calc_result(hq, res64);
+ break;
+ default:
+ if (dev->chipset == 0xc0 || dev->chipset == 0xc8)
+ value = sm20_hw_metric_calc_result(hq, res64);
+ else
+ value = sm21_hw_metric_calc_result(hq, res64);
+ break;
+ }
*(uint64_t *)result = value;
return ret;
return hq;
}
-static int
-nvc0_hw_metric_get_next_query_id(const struct nvc0_hw_metric_query_cfg **queries,
- unsigned id)
-{
- unsigned i, next = 0;
-
- for (i = 0; i < NVC0_HW_METRIC_QUERY_COUNT; i++) {
- if (!queries[i]) {
- next++;
- } else
- if (i >= id && queries[id + next]) {
- break;
- }
- }
- return id + next;
-}
-
int
nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
struct pipe_driver_query_info *info)
{
- uint16_t class_3d = screen->base.class_3d;
int count = 0;
- if (screen->base.device->drm_version >= 0x01000101) {
- if (screen->compute) {
- if (class_3d < NVE4_3D_CLASS) {
- const struct nvc0_hw_metric_query_cfg **queries =
- nvc0_hw_metric_get_queries(screen);
- unsigned i;
-
- for (i = 0; i < NVC0_HW_METRIC_QUERY_COUNT; i++) {
- if (queries[i])
- count++;
- }
- }
- }
+ if (screen->base.drm->version >= 0x01000101) {
+ if (screen->compute)
+ count = nvc0_hw_metric_get_num_queries(screen);
}
if (!info)
if (id < count) {
if (screen->compute) {
- if (class_3d < NVE4_3D_CLASS) {
- const struct nvc0_hw_metric_query_cfg **queries =
+ if (screen->base.class_3d <= NVF0_3D_CLASS) {
+ const struct nvc0_hw_metric_query_cfg **queries =
nvc0_hw_metric_get_queries(screen);
+ const struct nvc0_hw_metric_cfg *cfg =
+ nvc0_hw_metric_get_cfg(queries[id]->type);
- id = nvc0_hw_metric_get_next_query_id(queries, id);
- info->name = nvc0_hw_metric_names[id];
- info->query_type = NVC0_HW_METRIC_QUERY(id);
- info->group_id = -1;
+ info->name = cfg->name;
+ info->query_type = NVC0_HW_METRIC_QUERY(queries[id]->type);
+ info->type = cfg->type;
+ info->group_id = NVC0_HW_METRIC_QUERY_GROUP;
return 1;
}
}