From 41fb87249aba2810a8ab5313402af5927ddb0377 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Wed, 9 Mar 2016 21:13:22 +0100 Subject: [PATCH] nvc0: rework the MP counters infrastructure This mainly improves how we define the different lists of queries. Signed-off-by: Samuel Pitoiset Acked-by: Ilia Mirkin --- src/gallium/drivers/nouveau/nvc0/nvc0_query.c | 16 +- .../nouveau/nvc0/nvc0_query_hw_metric.c | 2 +- .../drivers/nouveau/nvc0/nvc0_query_hw_sm.c | 414 +++++++++--------- .../drivers/nouveau/nvc0/nvc0_query_hw_sm.h | 79 ++-- 4 files changed, 243 insertions(+), 268 deletions(-) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c index d2acce7d5be..f9f2bbe633f 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c @@ -204,10 +204,7 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen, if (screen->base.drm->version >= 0x01000101) { if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { - count += 2; - } else - if (screen->base.class_3d < NVE4_3D_CLASS) { + if (screen->base.class_3d <= NVE4_3D_CLASS) { count += 2; } } @@ -227,15 +224,8 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen, * currently only used by AMD_performance_monitor. 
*/ info->max_active_queries = 1; - - if (screen->base.class_3d == NVE4_3D_CLASS) { - info->num_queries = NVE4_HW_SM_QUERY_COUNT; - return 1; - } else - if (screen->base.class_3d < NVE4_3D_CLASS) { - info->num_queries = NVC0_HW_SM_QUERY_COUNT; - return 1; - } + info->num_queries = nvc0_hw_sm_get_num_queries(screen); + return 1; } } else if (id == NVC0_HW_METRIC_QUERY_GROUP) { diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c index 7a64b69b1c1..c1085511a85 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c @@ -172,7 +172,7 @@ static const char *nve4_hw_metric_names[] = "metric-shared_replay_overhead", }; -#define _SM(n) NVE4_HW_SM_QUERY(NVE4_HW_SM_QUERY_ ##n) +#define _SM(n) NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_ ##n) #define _M(n, c) [NVE4_HW_METRIC_QUERY_##n] = c /* ==== Compute capability 3.0 (GK104/GK106/GK107) ==== */ diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c index f5f9bb39fd9..7c4ab1c19c8 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c @@ -30,59 +30,85 @@ #include "nvc0/nve4_compute.xml.h" #include "nvc0/nvc0_compute.xml.h" -/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */ - /* NOTE: intentionally using the same names as NV */ -static const char *nve4_hw_sm_query_names[] = -{ - /* MP counters */ - "active_cycles", - "active_warps", - "atom_cas_count", - "atom_count", - "branch", - "divergent_branch", - "gld_request", - "global_ld_mem_divergence_replays", - "global_store_transaction", - "global_st_mem_divergence_replays", - "gred_count", - "gst_request", - "inst_executed", - "inst_issued1", - "inst_issued2", - "l1_global_load_hit", - "l1_global_load_miss", - "__l1_global_load_transactions", - "__l1_global_store_transactions", - "l1_local_load_hit", - 
"l1_local_load_miss", - "l1_local_store_hit", - "l1_local_store_miss", - "l1_shared_load_transactions", - "l1_shared_store_transactions", - "local_load", - "local_load_transactions", - "local_store", - "local_store_transactions", - "prof_trigger_00", - "prof_trigger_01", - "prof_trigger_02", - "prof_trigger_03", - "prof_trigger_04", - "prof_trigger_05", - "prof_trigger_06", - "prof_trigger_07", - "shared_load", - "shared_load_replay", - "shared_store", - "shared_store_replay", - "sm_cta_launched", - "threads_launched", - "uncached_global_load_transaction", - "warps_launched", +#define _Q(t, n) { NVC0_HW_SM_QUERY_##t, n } +struct { + unsigned type; + const char *name; +} nvc0_hw_sm_queries[] = { + _Q(ACTIVE_CYCLES, "active_cycles" ), + _Q(ACTIVE_WARPS, "active_warps" ), + _Q(ATOM_CAS_COUNT, "atom_cas_count" ), + _Q(ATOM_COUNT, "atom_count" ), + _Q(BRANCH, "branch" ), + _Q(DIVERGENT_BRANCH, "divergent_branch" ), + _Q(GLD_REQUEST, "gld_request" ), + _Q(GLD_MEM_DIV_REPLAY, "global_ld_mem_divergence_replays" ), + _Q(GST_TRANSACTIONS, "global_store_transaction" ), + _Q(GST_MEM_DIV_REPLAY, "global_st_mem_divergence_replays" ), + _Q(GRED_COUNT, "gred_count" ), + _Q(GST_REQUEST, "gst_request" ), + _Q(INST_EXECUTED, "inst_executed" ), + _Q(INST_ISSUED, "inst_issued" ), + _Q(INST_ISSUED1, "inst_issued1" ), + _Q(INST_ISSUED2, "inst_issued2" ), + _Q(INST_ISSUED1_0, "inst_issued1_0" ), + _Q(INST_ISSUED1_1, "inst_issued1_1" ), + _Q(INST_ISSUED2_0, "inst_issued2_0" ), + _Q(INST_ISSUED2_1, "inst_issued2_1" ), + _Q(L1_GLD_HIT, "l1_global_load_hit" ), + _Q(L1_GLD_MISS, "l1_global_load_miss" ), + _Q(L1_GLD_TRANSACTIONS, "__l1_global_load_transactions" ), + _Q(L1_GST_TRANSACTIONS, "__l1_global_store_transactions" ), + _Q(L1_LOCAL_LD_HIT, "l1_local_load_hit" ), + _Q(L1_LOCAL_LD_MISS, "l1_local_load_miss" ), + _Q(L1_LOCAL_ST_HIT, "l1_local_store_hit" ), + _Q(L1_LOCAL_ST_MISS, "l1_local_store_miss" ), + _Q(L1_SHARED_LD_TRANSACTIONS, "l1_shared_load_transactions" ), + 
_Q(L1_SHARED_ST_TRANSACTIONS, "l1_shared_store_transactions" ), + _Q(LOCAL_LD, "local_load" ), + _Q(LOCAL_LD_TRANSACTIONS, "local_load_transactions" ), + _Q(LOCAL_ST, "local_store" ), + _Q(LOCAL_ST_TRANSACTIONS, "local_store_transactions" ), + _Q(PROF_TRIGGER_0, "prof_trigger_00" ), + _Q(PROF_TRIGGER_1, "prof_trigger_01" ), + _Q(PROF_TRIGGER_2, "prof_trigger_02" ), + _Q(PROF_TRIGGER_3, "prof_trigger_03" ), + _Q(PROF_TRIGGER_4, "prof_trigger_04" ), + _Q(PROF_TRIGGER_5, "prof_trigger_05" ), + _Q(PROF_TRIGGER_6, "prof_trigger_06" ), + _Q(PROF_TRIGGER_7, "prof_trigger_07" ), + _Q(SHARED_LD, "shared_load" ), + _Q(SHARED_LD_REPLAY, "shared_load_replay" ), + _Q(SHARED_ST, "shared_store" ), + _Q(SHARED_ST_REPLAY, "shared_store_replay" ), + _Q(SM_CTA_LAUNCHED, "sm_cta_launched" ), + _Q(THREADS_LAUNCHED, "threads_launched" ), + _Q(TH_INST_EXECUTED_0, "thread_inst_executed_0" ), + _Q(TH_INST_EXECUTED_1, "thread_inst_executed_1" ), + _Q(TH_INST_EXECUTED_2, "thread_inst_executed_2" ), + _Q(TH_INST_EXECUTED_3, "thread_inst_executed_3" ), + _Q(UNCACHED_GLD_TRANSACTIONS, "uncached_global_load_transaction" ), + _Q(WARPS_LAUNCHED, "warps_launched" ), }; +#undef _Q + +static inline const char * +nvc0_hw_sm_query_get_name(unsigned query_type) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(nvc0_hw_sm_queries); i++) { + if (nvc0_hw_sm_queries[i].type == query_type) + return nvc0_hw_sm_queries[i].name; + } + assert(0); + return NULL; +} + +/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */ + /* Code to read out MP counters: They are accessible via mmio, too, but let's * just avoid mapping registers in userspace. We'd have to know which MPs are * enabled/present, too, and that information is not presently exposed. 
@@ -187,19 +213,20 @@ struct nvc0_hw_sm_counter_cfg struct nvc0_hw_sm_query_cfg { + unsigned type; struct nvc0_hw_sm_counter_cfg ctr[8]; uint8_t num_counters; uint8_t norm[2]; /* normalization num,denom */ }; -#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, { nu, dn } } -#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, { nu, dn } } +#define _Q1A(n, f, m, g, s, nu, dn) { NVC0_HW_SM_QUERY_##n, { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, { nu, dn } } +#define _Q1B(n, f, m, g, s, nu, dn) { NVC0_HW_SM_QUERY_##n, { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, { nu, dn } } /* NOTES: * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps * inst_executed etc.: we only count a single warp scheduler */ -static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] = +static const struct nvc0_hw_sm_query_cfg sm30_hw_sm_queries[] = { _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1), _Q1B(ACTIVE_WARPS, 0x003f, B6, WARP, 0x31483104, 2, 1), @@ -257,43 +284,6 @@ static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] = * because there is a context-switch problem that we need to fix. * Results might be wrong sometimes, be careful! 
*/ -static const char *nvc0_hw_sm_query_names[] = -{ - /* MP counters */ - "active_cycles", - "active_warps", - "atom_count", - "branch", - "divergent_branch", - "gld_request", - "gred_count", - "gst_request", - "inst_executed", - "inst_issued", - "inst_issued1_0", - "inst_issued1_1", - "inst_issued2_0", - "inst_issued2_1", - "local_load", - "local_store", - "prof_trigger_00", - "prof_trigger_01", - "prof_trigger_02", - "prof_trigger_03", - "prof_trigger_04", - "prof_trigger_05", - "prof_trigger_06", - "prof_trigger_07", - "shared_load", - "shared_store", - "threads_launched", - "thread_inst_executed_0", - "thread_inst_executed_1", - "thread_inst_executed_2", - "thread_inst_executed_3", - "warps_launched", -}; - static const uint64_t nvc0_read_hw_sm_counters_code[] = { /* mov b32 $r8 $tidx @@ -345,12 +335,12 @@ static const uint64_t nvc0_read_hw_sm_counters_code[] = }; #define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s } -#define _Q(n, c) [NVC0_HW_SM_QUERY_##n] = c /* ==== Compute capability 2.0 (GF100/GF110) ==== */ static const struct nvc0_hw_sm_query_cfg sm20_active_cycles = { + .type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES, .ctr[0] = _C(0xaaaa, LOGOP, 0x11, 0x000000ff, 0x00000000), .num_counters = 1, .norm = { 1, 1 }, @@ -359,6 +349,7 @@ sm20_active_cycles = static const struct nvc0_hw_sm_query_cfg sm20_active_warps = { + .type = NVC0_HW_SM_QUERY_ACTIVE_WARPS, .ctr[0] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000010), .ctr[1] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000020), .ctr[2] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000030), @@ -372,6 +363,7 @@ sm20_active_warps = static const struct nvc0_hw_sm_query_cfg sm20_atom_count = { + .type = NVC0_HW_SM_QUERY_ATOM_COUNT, .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000030), .num_counters = 1, .norm = { 1, 1 }, @@ -380,6 +372,7 @@ sm20_atom_count = static const struct nvc0_hw_sm_query_cfg sm20_branch = { + .type = NVC0_HW_SM_QUERY_BRANCH, .ctr[0] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 
0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000010), .num_counters = 2, @@ -389,6 +382,7 @@ sm20_branch = static const struct nvc0_hw_sm_query_cfg sm20_divergent_branch = { + .type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH, .ctr[0] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000020), .ctr[1] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000030), .num_counters = 2, @@ -398,6 +392,7 @@ sm20_divergent_branch = static const struct nvc0_hw_sm_query_cfg sm20_gld_request = { + .type = NVC0_HW_SM_QUERY_GLD_REQUEST, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000030), .num_counters = 1, .norm = { 1, 1 }, @@ -406,6 +401,7 @@ sm20_gld_request = static const struct nvc0_hw_sm_query_cfg sm20_gred_count = { + .type = NVC0_HW_SM_QUERY_GRED_COUNT, .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000040), .num_counters = 1, .norm = { 1, 1 }, @@ -414,6 +410,7 @@ sm20_gred_count = static const struct nvc0_hw_sm_query_cfg sm20_gst_request = { + .type = NVC0_HW_SM_QUERY_GST_REQUEST, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000060), .num_counters = 1, .norm = { 1, 1 }, @@ -422,6 +419,7 @@ sm20_gst_request = static const struct nvc0_hw_sm_query_cfg sm20_inst_executed = { + .type = NVC0_HW_SM_QUERY_INST_EXECUTED, .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001000), .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001010), .num_counters = 2, @@ -431,6 +429,7 @@ sm20_inst_executed = static const struct nvc0_hw_sm_query_cfg sm20_inst_issued = { + .type = NVC0_HW_SM_QUERY_INST_ISSUED, .ctr[0] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007060), .ctr[1] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007070), .num_counters = 2, @@ -440,6 +439,7 @@ sm20_inst_issued = static const struct nvc0_hw_sm_query_cfg sm20_local_ld = { + .type = NVC0_HW_SM_QUERY_LOCAL_LD, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000020), .num_counters = 1, .norm = { 1, 1 }, @@ -448,6 +448,7 @@ sm20_local_ld = static const struct nvc0_hw_sm_query_cfg sm20_local_st = { + .type = 
NVC0_HW_SM_QUERY_LOCAL_ST, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000050), .num_counters = 1, .norm = { 1, 1 }, @@ -456,6 +457,7 @@ sm20_local_st = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_0 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000000), .num_counters = 1, .norm = { 1, 1 }, @@ -464,6 +466,7 @@ sm20_prof_trigger_0 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_1 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000010), .num_counters = 1, .norm = { 1, 1 }, @@ -472,6 +475,7 @@ sm20_prof_trigger_1 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_2 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000020), .num_counters = 1, .norm = { 1, 1 }, @@ -480,6 +484,7 @@ sm20_prof_trigger_2 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_3 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000030), .num_counters = 1, .norm = { 1, 1 }, @@ -488,6 +493,7 @@ sm20_prof_trigger_3 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_4 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000040), .num_counters = 1, .norm = { 1, 1 }, @@ -496,6 +502,7 @@ sm20_prof_trigger_4 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_5 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000050), .num_counters = 1, .norm = { 1, 1 }, @@ -504,6 +511,7 @@ sm20_prof_trigger_5 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_6 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000060), .num_counters = 1, .norm = { 1, 1 }, @@ -512,6 +520,7 @@ sm20_prof_trigger_6 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_7 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7, .ctr[0] = 
_C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000070), .num_counters = 1, .norm = { 1, 1 }, @@ -520,6 +529,7 @@ sm20_prof_trigger_7 = static const struct nvc0_hw_sm_query_cfg sm20_shared_ld = { + .type = NVC0_HW_SM_QUERY_SHARED_LD, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000010), .num_counters = 1, .norm = { 1, 1 }, @@ -528,6 +538,7 @@ sm20_shared_ld = static const struct nvc0_hw_sm_query_cfg sm20_shared_st = { + .type = NVC0_HW_SM_QUERY_SHARED_ST, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000040), .num_counters = 1, .norm = { 1, 1 }, @@ -536,6 +547,7 @@ sm20_shared_st = static const struct nvc0_hw_sm_query_cfg sm20_threads_launched = { + .type = NVC0_HW_SM_QUERY_THREADS_LAUNCHED, .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000010), .ctr[1] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000020), .ctr[2] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000030), @@ -549,6 +561,7 @@ sm20_threads_launched = static const struct nvc0_hw_sm_query_cfg sm20_th_inst_executed_0 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0, .ctr[0] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000020), @@ -562,6 +575,7 @@ sm20_th_inst_executed_0 = static const struct nvc0_hw_sm_query_cfg sm20_th_inst_executed_1 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1, .ctr[0] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000020), @@ -575,6 +589,7 @@ sm20_th_inst_executed_1 = static const struct nvc0_hw_sm_query_cfg sm20_warps_launched = { + .type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED, .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000000), .num_counters = 1, .norm = { 1, 1 }, @@ -582,44 +597,39 @@ sm20_warps_launched = static const struct nvc0_hw_sm_query_cfg *sm20_hw_sm_queries[] = { - _Q(ACTIVE_CYCLES, &sm20_active_cycles), - _Q(ACTIVE_WARPS, &sm20_active_warps), - 
_Q(ATOM_COUNT, &sm20_atom_count), - _Q(BRANCH, &sm20_branch), - _Q(DIVERGENT_BRANCH, &sm20_divergent_branch), - _Q(GLD_REQUEST, &sm20_gld_request), - _Q(GRED_COUNT, &sm20_gred_count), - _Q(GST_REQUEST, &sm20_gst_request), - _Q(INST_EXECUTED, &sm20_inst_executed), - _Q(INST_ISSUED, &sm20_inst_issued), - _Q(INST_ISSUED1_0, NULL), - _Q(INST_ISSUED1_1, NULL), - _Q(INST_ISSUED2_0, NULL), - _Q(INST_ISSUED2_1, NULL), - _Q(LOCAL_LD, &sm20_local_ld), - _Q(LOCAL_ST, &sm20_local_st), - _Q(PROF_TRIGGER_0, &sm20_prof_trigger_0), - _Q(PROF_TRIGGER_1, &sm20_prof_trigger_1), - _Q(PROF_TRIGGER_2, &sm20_prof_trigger_2), - _Q(PROF_TRIGGER_3, &sm20_prof_trigger_3), - _Q(PROF_TRIGGER_4, &sm20_prof_trigger_4), - _Q(PROF_TRIGGER_5, &sm20_prof_trigger_5), - _Q(PROF_TRIGGER_6, &sm20_prof_trigger_6), - _Q(PROF_TRIGGER_7, &sm20_prof_trigger_7), - _Q(SHARED_LD, &sm20_shared_ld), - _Q(SHARED_ST, &sm20_shared_st), - _Q(THREADS_LAUNCHED, &sm20_threads_launched), - _Q(TH_INST_EXECUTED_0, &sm20_th_inst_executed_0), - _Q(TH_INST_EXECUTED_1, &sm20_th_inst_executed_1), - _Q(TH_INST_EXECUTED_2, NULL), - _Q(TH_INST_EXECUTED_3, NULL), - _Q(WARPS_LAUNCHED, &sm20_warps_launched), + &sm20_active_cycles, + &sm20_active_warps, + &sm20_atom_count, + &sm20_branch, + &sm20_divergent_branch, + &sm20_gld_request, + &sm20_gred_count, + &sm20_gst_request, + &sm20_inst_executed, + &sm20_inst_issued, + &sm20_local_ld, + &sm20_local_st, + &sm20_prof_trigger_0, + &sm20_prof_trigger_1, + &sm20_prof_trigger_2, + &sm20_prof_trigger_3, + &sm20_prof_trigger_4, + &sm20_prof_trigger_5, + &sm20_prof_trigger_6, + &sm20_prof_trigger_7, + &sm20_shared_ld, + &sm20_shared_st, + &sm20_threads_launched, + &sm20_th_inst_executed_0, + &sm20_th_inst_executed_1, + &sm20_warps_launched, }; /* ==== Compute capability 2.1 (GF108+ except GF110) ==== */ static const struct nvc0_hw_sm_query_cfg sm21_inst_executed = { + .type = NVC0_HW_SM_QUERY_INST_EXECUTED, .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 
0x2d, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000020), @@ -630,6 +640,7 @@ sm21_inst_executed = static const struct nvc0_hw_sm_query_cfg sm21_inst_issued1_0 = { + .type = NVC0_HW_SM_QUERY_INST_ISSUED1_0, .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000010), .num_counters = 1, .norm = { 1, 1 }, @@ -638,6 +649,7 @@ sm21_inst_issued1_0 = static const struct nvc0_hw_sm_query_cfg sm21_inst_issued1_1 = { + .type = NVC0_HW_SM_QUERY_INST_ISSUED1_1, .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000040), .num_counters = 1, .norm = { 1, 1 }, @@ -646,6 +658,7 @@ sm21_inst_issued1_1 = static const struct nvc0_hw_sm_query_cfg sm21_inst_issued2_0 = { + .type = NVC0_HW_SM_QUERY_INST_ISSUED2_0, .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000020), .num_counters = 1, .norm = { 1, 1 }, @@ -654,6 +667,7 @@ sm21_inst_issued2_0 = static const struct nvc0_hw_sm_query_cfg sm21_inst_issued2_1 = { + .type = NVC0_HW_SM_QUERY_INST_ISSUED2_1, .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000050), .num_counters = 1, .norm = { 1, 1 }, @@ -662,6 +676,7 @@ sm21_inst_issued2_1 = static const struct nvc0_hw_sm_query_cfg sm21_th_inst_executed_0 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0, .ctr[0] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000020), @@ -675,6 +690,7 @@ sm21_th_inst_executed_0 = static const struct nvc0_hw_sm_query_cfg sm21_th_inst_executed_1 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1, .ctr[0] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000020), @@ -688,6 +704,7 @@ sm21_th_inst_executed_1 = static const struct nvc0_hw_sm_query_cfg sm21_th_inst_executed_2 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2, .ctr[0] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 
0xa4, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000020), @@ -701,6 +718,7 @@ sm21_th_inst_executed_2 = static const struct nvc0_hw_sm_query_cfg sm21_th_inst_executed_3 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3, .ctr[0] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000020), @@ -713,41 +731,39 @@ sm21_th_inst_executed_3 = static const struct nvc0_hw_sm_query_cfg *sm21_hw_sm_queries[] = { - _Q(ACTIVE_CYCLES, &sm20_active_cycles), - _Q(ACTIVE_WARPS, &sm20_active_warps), - _Q(ATOM_COUNT, &sm20_atom_count), - _Q(BRANCH, &sm20_branch), - _Q(DIVERGENT_BRANCH, &sm20_divergent_branch), - _Q(GLD_REQUEST, &sm20_gld_request), - _Q(GRED_COUNT, &sm20_gred_count), - _Q(GST_REQUEST, &sm20_gst_request), - _Q(INST_EXECUTED, &sm21_inst_executed), - _Q(INST_ISSUED, NULL), - _Q(INST_ISSUED1_0, &sm21_inst_issued1_0), - _Q(INST_ISSUED1_1, &sm21_inst_issued1_1), - _Q(INST_ISSUED2_0, &sm21_inst_issued2_0), - _Q(INST_ISSUED2_1, &sm21_inst_issued2_1), - _Q(LOCAL_LD, &sm20_local_ld), - _Q(LOCAL_ST, &sm20_local_st), - _Q(PROF_TRIGGER_0, &sm20_prof_trigger_0), - _Q(PROF_TRIGGER_1, &sm20_prof_trigger_1), - _Q(PROF_TRIGGER_2, &sm20_prof_trigger_2), - _Q(PROF_TRIGGER_3, &sm20_prof_trigger_3), - _Q(PROF_TRIGGER_4, &sm20_prof_trigger_4), - _Q(PROF_TRIGGER_5, &sm20_prof_trigger_5), - _Q(PROF_TRIGGER_6, &sm20_prof_trigger_6), - _Q(PROF_TRIGGER_7, &sm20_prof_trigger_7), - _Q(SHARED_LD, &sm20_shared_ld), - _Q(SHARED_ST, &sm20_shared_st), - _Q(THREADS_LAUNCHED, &sm20_threads_launched), - _Q(TH_INST_EXECUTED_0, &sm21_th_inst_executed_0), - _Q(TH_INST_EXECUTED_1, &sm21_th_inst_executed_1), - _Q(TH_INST_EXECUTED_2, &sm21_th_inst_executed_2), - _Q(TH_INST_EXECUTED_3, &sm21_th_inst_executed_3), - _Q(WARPS_LAUNCHED, &sm20_warps_launched), + &sm20_active_cycles, + &sm20_active_warps, + &sm20_atom_count, + &sm20_branch, + &sm20_divergent_branch, + 
&sm20_gld_request, + &sm20_gred_count, + &sm20_gst_request, + &sm21_inst_executed, + &sm21_inst_issued1_0, + &sm21_inst_issued1_1, + &sm21_inst_issued2_0, + &sm21_inst_issued2_1, + &sm20_local_ld, + &sm20_local_st, + &sm20_prof_trigger_0, + &sm20_prof_trigger_1, + &sm20_prof_trigger_2, + &sm20_prof_trigger_3, + &sm20_prof_trigger_4, + &sm20_prof_trigger_5, + &sm20_prof_trigger_6, + &sm20_prof_trigger_7, + &sm20_shared_ld, + &sm20_shared_st, + &sm20_threads_launched, + &sm21_th_inst_executed_0, + &sm21_th_inst_executed_1, + &sm21_th_inst_executed_2, + &sm21_th_inst_executed_3, + &sm20_warps_launched, }; -#undef _Q #undef _C static inline const struct nvc0_hw_sm_query_cfg ** @@ -760,21 +776,47 @@ nvc0_hw_sm_get_queries(struct nvc0_screen *screen) return sm21_hw_sm_queries; } +unsigned +nvc0_hw_sm_get_num_queries(struct nvc0_screen *screen) +{ + struct nouveau_device *dev = screen->base.device; + + switch (screen->base.class_3d) { + case NVE4_3D_CLASS: + return ARRAY_SIZE(sm30_hw_sm_queries); + default: + if (dev->chipset == 0xc0 || dev->chipset == 0xc8) + return ARRAY_SIZE(sm20_hw_sm_queries); + return ARRAY_SIZE(sm21_hw_sm_queries); + } + return 0; +} + static const struct nvc0_hw_sm_query_cfg * nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) { struct nvc0_screen *screen = nvc0->screen; struct nvc0_query *q = &hq->base; + unsigned num_queries; + unsigned i; - if (screen->base.class_3d >= NVE4_3D_CLASS) - return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC]; + num_queries = nvc0_hw_sm_get_num_queries(screen); - if (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST) { + if (screen->base.class_3d >= NVE4_3D_CLASS) { + for (i = 0; i < num_queries; i++) { + if (NVC0_HW_SM_QUERY(sm30_hw_sm_queries[i].type) == q->type) + return &sm30_hw_sm_queries[i]; + } + } else { const struct nvc0_hw_sm_query_cfg **queries = - nvc0_hw_sm_get_queries(screen); - return queries[q->type - NVC0_HW_SM_QUERY(0)]; + 
nvc0_hw_sm_get_queries(screen); + + for (i = 0; i < num_queries; i++) { + if (NVC0_HW_SM_QUERY(queries[i]->type) == q->type) + return queries[i]; + } } - debug_printf("invalid query type: %d\n", q->type); + assert(0); return NULL; } @@ -1132,8 +1174,7 @@ nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type) if (nvc0->screen->base.drm->version < 0x01000101) return NULL; - if ((type < NVE4_HW_SM_QUERY(0) || type > NVE4_HW_SM_QUERY_LAST) && - (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST)) + if (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST) return NULL; hsq = CALLOC_STRUCT(nvc0_hw_sm_query); @@ -1201,23 +1242,6 @@ nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type) return hq; } -static int -nvc0_hw_sm_get_next_query_id(const struct nvc0_hw_sm_query_cfg **queries, - unsigned id) -{ - unsigned i, next = 0; - - for (i = 0; i < NVC0_HW_SM_QUERY_COUNT; i++) { - if (!queries[i]) { - next++; - } else - if (i >= id && queries[id + next]) { - break; - } - } - return id + next; -} - int nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id, struct pipe_driver_query_info *info) @@ -1225,21 +1249,8 @@ nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id, int count = 0; if (screen->base.drm->version >= 0x01000101) { - if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { - count += NVE4_HW_SM_QUERY_COUNT; - } else - if (screen->base.class_3d < NVE4_3D_CLASS) { - const struct nvc0_hw_sm_query_cfg **queries = - nvc0_hw_sm_get_queries(screen); - unsigned i; - - for (i = 0; i < NVC0_HW_SM_QUERY_COUNT; i++) { - if (queries[i]) - count++; - } - } - } + if (screen->compute) + count = nvc0_hw_sm_get_num_queries(screen); } if (!info) @@ -1248,8 +1259,10 @@ nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id, if (id < count) { if (screen->compute) { if (screen->base.class_3d == NVE4_3D_CLASS) { - info->name = nve4_hw_sm_query_names[id]; - info->query_type = 
NVE4_HW_SM_QUERY(id); + const struct nvc0_hw_sm_query_cfg *q = &sm30_hw_sm_queries[id]; + + info->name = nvc0_hw_sm_query_get_name(q->type); + info->query_type = NVC0_HW_SM_QUERY(q->type); info->group_id = NVC0_HW_SM_QUERY_GROUP; return 1; } else @@ -1257,9 +1270,8 @@ nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id, const struct nvc0_hw_sm_query_cfg **queries = nvc0_hw_sm_get_queries(screen); - id = nvc0_hw_sm_get_next_query_id(queries, id); - info->name = nvc0_hw_sm_query_names[id]; - info->query_type = NVC0_HW_SM_QUERY(id); + info->name = nvc0_hw_sm_query_get_name(queries[id]->type); + info->query_type = NVC0_HW_SM_QUERY(queries[id]->type); info->group_id = NVC0_HW_SM_QUERY_GROUP; return 1; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h index 94d55a04ff8..8c9c104e8be 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h @@ -17,78 +17,44 @@ nvc0_hw_sm_query(struct nvc0_hw_query *hq) /* * Performance counter queries: */ -#define NVE4_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + (i)) -#define NVE4_HW_SM_QUERY_LAST NVE4_HW_SM_QUERY(NVE4_HW_SM_QUERY_COUNT - 1) -enum nve4_hw_sm_queries -{ - NVE4_HW_SM_QUERY_ACTIVE_CYCLES = 0, - NVE4_HW_SM_QUERY_ACTIVE_WARPS, - NVE4_HW_SM_QUERY_ATOM_CAS_COUNT, - NVE4_HW_SM_QUERY_ATOM_COUNT, - NVE4_HW_SM_QUERY_BRANCH, - NVE4_HW_SM_QUERY_DIVERGENT_BRANCH, - NVE4_HW_SM_QUERY_GLD_REQUEST, - NVE4_HW_SM_QUERY_GLD_MEM_DIV_REPLAY, - NVE4_HW_SM_QUERY_GST_TRANSACTIONS, - NVE4_HW_SM_QUERY_GST_MEM_DIV_REPLAY, - NVE4_HW_SM_QUERY_GRED_COUNT, - NVE4_HW_SM_QUERY_GST_REQUEST, - NVE4_HW_SM_QUERY_INST_EXECUTED, - NVE4_HW_SM_QUERY_INST_ISSUED1, - NVE4_HW_SM_QUERY_INST_ISSUED2, - NVE4_HW_SM_QUERY_L1_GLD_HIT, - NVE4_HW_SM_QUERY_L1_GLD_MISS, - NVE4_HW_SM_QUERY_L1_GLD_TRANSACTIONS, - NVE4_HW_SM_QUERY_L1_GST_TRANSACTIONS, - NVE4_HW_SM_QUERY_L1_LOCAL_LD_HIT, - NVE4_HW_SM_QUERY_L1_LOCAL_LD_MISS, - 
NVE4_HW_SM_QUERY_L1_LOCAL_ST_HIT, - NVE4_HW_SM_QUERY_L1_LOCAL_ST_MISS, - NVE4_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS, - NVE4_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS, - NVE4_HW_SM_QUERY_LOCAL_LD, - NVE4_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS, - NVE4_HW_SM_QUERY_LOCAL_ST, - NVE4_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS, - NVE4_HW_SM_QUERY_PROF_TRIGGER_0, - NVE4_HW_SM_QUERY_PROF_TRIGGER_1, - NVE4_HW_SM_QUERY_PROF_TRIGGER_2, - NVE4_HW_SM_QUERY_PROF_TRIGGER_3, - NVE4_HW_SM_QUERY_PROF_TRIGGER_4, - NVE4_HW_SM_QUERY_PROF_TRIGGER_5, - NVE4_HW_SM_QUERY_PROF_TRIGGER_6, - NVE4_HW_SM_QUERY_PROF_TRIGGER_7, - NVE4_HW_SM_QUERY_SHARED_LD, - NVE4_HW_SM_QUERY_SHARED_LD_REPLAY, - NVE4_HW_SM_QUERY_SHARED_ST, - NVE4_HW_SM_QUERY_SHARED_ST_REPLAY, - NVE4_HW_SM_QUERY_SM_CTA_LAUNCHED, - NVE4_HW_SM_QUERY_THREADS_LAUNCHED, - NVE4_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS, - NVE4_HW_SM_QUERY_WARPS_LAUNCHED, - NVE4_HW_SM_QUERY_COUNT -}; - -#define NVC0_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i)) +#define NVC0_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + (i)) #define NVC0_HW_SM_QUERY_LAST NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_COUNT - 1) enum nvc0_hw_sm_queries { NVC0_HW_SM_QUERY_ACTIVE_CYCLES = 0, NVC0_HW_SM_QUERY_ACTIVE_WARPS, + NVC0_HW_SM_QUERY_ATOM_CAS_COUNT, NVC0_HW_SM_QUERY_ATOM_COUNT, NVC0_HW_SM_QUERY_BRANCH, NVC0_HW_SM_QUERY_DIVERGENT_BRANCH, NVC0_HW_SM_QUERY_GLD_REQUEST, + NVC0_HW_SM_QUERY_GLD_MEM_DIV_REPLAY, + NVC0_HW_SM_QUERY_GST_TRANSACTIONS, + NVC0_HW_SM_QUERY_GST_MEM_DIV_REPLAY, NVC0_HW_SM_QUERY_GRED_COUNT, NVC0_HW_SM_QUERY_GST_REQUEST, NVC0_HW_SM_QUERY_INST_EXECUTED, NVC0_HW_SM_QUERY_INST_ISSUED, + NVC0_HW_SM_QUERY_INST_ISSUED1, + NVC0_HW_SM_QUERY_INST_ISSUED2, NVC0_HW_SM_QUERY_INST_ISSUED1_0, NVC0_HW_SM_QUERY_INST_ISSUED1_1, NVC0_HW_SM_QUERY_INST_ISSUED2_0, NVC0_HW_SM_QUERY_INST_ISSUED2_1, + NVC0_HW_SM_QUERY_L1_GLD_HIT, + NVC0_HW_SM_QUERY_L1_GLD_MISS, + NVC0_HW_SM_QUERY_L1_GLD_TRANSACTIONS, + NVC0_HW_SM_QUERY_L1_GST_TRANSACTIONS, + NVC0_HW_SM_QUERY_L1_LOCAL_LD_HIT, + 
NVC0_HW_SM_QUERY_L1_LOCAL_LD_MISS, + NVC0_HW_SM_QUERY_L1_LOCAL_ST_HIT, + NVC0_HW_SM_QUERY_L1_LOCAL_ST_MISS, + NVC0_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS, + NVC0_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS, NVC0_HW_SM_QUERY_LOCAL_LD, + NVC0_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS, NVC0_HW_SM_QUERY_LOCAL_ST, + NVC0_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS, NVC0_HW_SM_QUERY_PROF_TRIGGER_0, NVC0_HW_SM_QUERY_PROF_TRIGGER_1, NVC0_HW_SM_QUERY_PROF_TRIGGER_2, @@ -98,12 +64,16 @@ enum nvc0_hw_sm_queries NVC0_HW_SM_QUERY_PROF_TRIGGER_6, NVC0_HW_SM_QUERY_PROF_TRIGGER_7, NVC0_HW_SM_QUERY_SHARED_LD, + NVC0_HW_SM_QUERY_SHARED_LD_REPLAY, NVC0_HW_SM_QUERY_SHARED_ST, + NVC0_HW_SM_QUERY_SHARED_ST_REPLAY, + NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED, NVC0_HW_SM_QUERY_THREADS_LAUNCHED, NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0, NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1, NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2, NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3, + NVC0_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS, NVC0_HW_SM_QUERY_WARPS_LAUNCHED, NVC0_HW_SM_QUERY_COUNT }; @@ -113,4 +83,7 @@ nvc0_hw_sm_create_query(struct nvc0_context *, unsigned); int nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *, unsigned, struct pipe_driver_query_info *); +unsigned +nvc0_hw_sm_get_num_queries(struct nvc0_screen *); + #endif -- 2.30.2