X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fnouveau%2Fnvc0%2Fnvc0_query_hw_metric.c;h=c78b04600d0a8f7ebd85ce3a2efb2a5e8c2e4ed4;hb=c2f48d8f324a7577a63f7f4ad4628564f02687b0;hp=b961cbf652ed6f9c7fa45720c9415d14a72cb4bf;hpb=0e511400de331253a3281fe848e2073c197ec232;p=mesa.git diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c index b961cbf652e..c78b04600d0 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c @@ -24,33 +24,90 @@ #include "nvc0/nvc0_query_hw_metric.h" #include "nvc0/nvc0_query_hw_sm.h" -#define _Q(t,n) { NVC0_HW_METRIC_QUERY_##t, n } -struct { - unsigned type; +#define _Q(i,n,t,d) { NVC0_HW_METRIC_QUERY_##i, n, PIPE_DRIVER_QUERY_TYPE_##t, d } +static const struct nvc0_hw_metric_cfg { + unsigned id; const char *name; + enum pipe_driver_query_type type; + const char *desc; } nvc0_hw_metric_queries[] = { - _Q(ACHIEVED_OCCUPANCY, "metric-achieved_occupancy" ), - _Q(BRANCH_EFFICIENCY, "metric-branch_efficiency" ), - _Q(INST_ISSUED, "metric-inst_issued" ), - _Q(INST_PER_WRAP, "metric-inst_per_wrap" ), - _Q(INST_REPLAY_OVERHEAD, "metric-inst_replay_overhead" ), - _Q(ISSUED_IPC, "metric-issued_ipc" ), - _Q(ISSUE_SLOTS, "metric-issue_slots" ), - _Q(ISSUE_SLOT_UTILIZATION, "metric-issue_slot_utilization" ), - _Q(IPC, "metric-ipc" ), - _Q(SHARED_REPLAY_OVERHEAD, "metric-shared_replay_overhead" ), + _Q(ACHIEVED_OCCUPANCY, + "metric-achieved_occupancy", + PERCENTAGE, + "Ratio of the average active warps per active cycle to the maximum " + "number of warps supported on a multiprocessor"), + + _Q(BRANCH_EFFICIENCY, + "metric-branch_efficiency", + PERCENTAGE, + "Ratio of non-divergent branches to total branches"), + + _Q(INST_ISSUED, + "metric-inst_issued", + UINT64, + "The number of instructions issued"), + + _Q(INST_PER_WRAP, + "metric-inst_per_wrap", + UINT64, + "Average number of instructions executed by each warp"), + + _Q(INST_REPLAY_OVERHEAD, + "metric-inst_replay_overhead", + UINT64, + "Average number of replays for each instruction executed"), + + _Q(ISSUED_IPC, + "metric-issued_ipc", + UINT64, + "Instructions issued per cycle"), + + _Q(ISSUE_SLOTS, + "metric-issue_slots", + UINT64, + "The number of issue slots used"), + + _Q(ISSUE_SLOT_UTILIZATION, + "metric-issue_slot_utilization", + PERCENTAGE, + "Percentage of issue slots that issued at least one instruction, " + "averaged across all cycles"), + + _Q(IPC, + "metric-ipc", + UINT64, + "Instructions executed per cycle"), + + _Q(SHARED_REPLAY_OVERHEAD, + "metric-shared_replay_overhead", + UINT64, + "Average number of replays due to shared memory conflicts for each " + "instruction executed"), + + _Q(WARP_EXECUTION_EFFICIENCY, + "metric-warp_execution_efficiency", + PERCENTAGE, + "Ratio of the average active threads per warp to the maximum number of " + "threads per warp supported on a multiprocessor"), + + _Q(WARP_NONPRED_EXECUTION_EFFICIENCY, + "metric-warp_nonpred_execution_efficiency", + PERCENTAGE, + "Ratio of the average active threads per warp executing non-predicated " + "instructions to the maximum number of threads per warp supported on a " + "multiprocessor"), }; #undef _Q -static inline const char * -nvc0_hw_metric_query_get_name(unsigned query_type) +static inline const struct nvc0_hw_metric_cfg * +nvc0_hw_metric_get_cfg(unsigned metric_id) { unsigned i; for (i = 0; i < ARRAY_SIZE(nvc0_hw_metric_queries); i++) { - if (nvc0_hw_metric_queries[i].type == query_type) - return nvc0_hw_metric_queries[i].name; + if (nvc0_hw_metric_queries[i].id == metric_id) + return &nvc0_hw_metric_queries[i]; } assert(0); return NULL; @@ -134,9 +191,9 @@ static const struct nvc0_hw_metric_query_cfg *sm20_hw_metric_queries[] = &sm20_branch_efficiency, &sm20_inst_per_wrap, &sm20_inst_replay_overhead, + &sm20_ipc, &sm20_issued_ipc, &sm20_issue_slot_utilization, - &sm20_ipc, }; /* ==== Compute capability 2.1 (GF108+ except GF110) ==== */ @@ -175,6 +232,17 @@ sm21_issued_ipc = .num_queries = 5, }; +static const struct nvc0_hw_metric_query_cfg +sm21_issue_slots = +{ + .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOTS, + .queries[0] = _SM(INST_ISSUED1_0), + .queries[1] = _SM(INST_ISSUED1_1), + .queries[2] = _SM(INST_ISSUED2_0), + .queries[3] = _SM(INST_ISSUED2_1), + .num_queries = 4, +}; + static const struct nvc0_hw_metric_query_cfg sm21_issue_slot_utilization = { @@ -194,31 +262,13 @@ static const struct nvc0_hw_metric_query_cfg *sm21_hw_metric_queries[] = &sm21_inst_issued, &sm20_inst_per_wrap, &sm21_inst_replay_overhead, + &sm20_ipc, &sm21_issued_ipc, - &sm21_inst_issued, + &sm21_issue_slots, &sm21_issue_slot_utilization, - &sm20_ipc, }; /* ==== Compute capability 3.0 (GK104/GK106/GK107) ==== */ -static const struct nvc0_hw_metric_query_cfg -sm30_achieved_occupancy = -{ - .type = NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY, - .queries[0] = _SM(ACTIVE_WARPS), - .queries[1] = _SM(ACTIVE_CYCLES), - .num_queries = 2, -}; - -static const struct nvc0_hw_metric_query_cfg -sm30_branch_efficiency = -{ - .type = NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY, - .queries[0] = _SM(BRANCH), - .queries[1] = _SM(DIVERGENT_BRANCH), - .num_queries = 2, -}; - static const struct nvc0_hw_metric_query_cfg sm30_inst_issued = { @@ -228,15 +278,6 @@ sm30_inst_issued = .num_queries = 2, }; -static const struct nvc0_hw_metric_query_cfg -sm30_inst_per_wrap = -{ - .type = NVC0_HW_METRIC_QUERY_INST_PER_WRAP, - .queries[0] = _SM(INST_EXECUTED), - .queries[1] = _SM(WARPS_LAUNCHED), - .num_queries = 2, -}; - static const struct nvc0_hw_metric_query_cfg sm30_inst_replay_overhead = { @@ -258,22 +299,22 @@ sm30_issued_ipc = }; static const struct nvc0_hw_metric_query_cfg -sm30_issue_slot_utilization = +sm30_issue_slots = { - .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION, + .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOTS, .queries[0] = _SM(INST_ISSUED1), .queries[1] = _SM(INST_ISSUED2), - .queries[2] = _SM(ACTIVE_CYCLES), - .num_queries = 3, + .num_queries = 2, }; static const struct nvc0_hw_metric_query_cfg -sm30_ipc = +sm30_issue_slot_utilization = { - .type = NVC0_HW_METRIC_QUERY_IPC, - .queries[0] = _SM(INST_EXECUTED), - .queries[1] = _SM(ACTIVE_CYCLES), - .num_queries = 2, + .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION, + .queries[0] = _SM(INST_ISSUED1), + .queries[1] = _SM(INST_ISSUED2), + .queries[2] = _SM(ACTIVE_CYCLES), + .num_queries = 3, }; static const struct nvc0_hw_metric_query_cfg @@ -286,32 +327,69 @@ sm30_shared_replay_overhead = .num_queries = 3, }; +static const struct nvc0_hw_metric_query_cfg +sm30_warp_execution_efficiency = +{ + .type = NVC0_HW_METRIC_QUERY_WARP_EXECUTION_EFFICIENCY, + .queries[0] = _SM(INST_EXECUTED), + .queries[1] = _SM(TH_INST_EXECUTED), + .num_queries = 2, +}; + static const struct nvc0_hw_metric_query_cfg *sm30_hw_metric_queries[] = { - &sm30_achieved_occupancy, - &sm30_branch_efficiency, + &sm20_achieved_occupancy, + &sm20_branch_efficiency, &sm30_inst_issued, - &sm30_inst_per_wrap, + &sm20_inst_per_wrap, &sm30_inst_replay_overhead, + &sm20_ipc, &sm30_issued_ipc, - &sm30_inst_issued, + &sm30_issue_slots, &sm30_issue_slot_utilization, - &sm30_ipc, &sm30_shared_replay_overhead, + &sm30_warp_execution_efficiency, +}; + +/* ==== Compute capability 3.5 (GK110/GK208) ==== */ +static const struct nvc0_hw_metric_query_cfg +sm35_warp_nonpred_execution_efficiency = +{ + .type = NVC0_HW_METRIC_QUERY_WARP_NONPRED_EXECUTION_EFFICIENCY, + .queries[0] = _SM(INST_EXECUTED), + .queries[1] = _SM(NOT_PRED_OFF_INST_EXECUTED), + .num_queries = 2, }; -/* ==== Compute capability 3.5 (GK110) ==== */ static const struct nvc0_hw_metric_query_cfg *sm35_hw_metric_queries[] = { - &sm30_achieved_occupancy, + &sm20_achieved_occupancy, &sm30_inst_issued, - &sm30_inst_per_wrap, + &sm20_inst_per_wrap, &sm30_inst_replay_overhead, + &sm20_ipc, &sm30_issued_ipc, - &sm30_inst_issued, + &sm30_issue_slots, &sm30_issue_slot_utilization, - &sm30_ipc, &sm30_shared_replay_overhead, + &sm30_warp_execution_efficiency, + &sm35_warp_nonpred_execution_efficiency, +}; + +/* ==== Compute capability 5.0 (GM107/GM108) ==== */ +static const struct nvc0_hw_metric_query_cfg *sm50_hw_metric_queries[] = +{ + &sm20_achieved_occupancy, + &sm20_branch_efficiency, + &sm30_inst_issued, + &sm20_inst_per_wrap, + &sm30_inst_replay_overhead, + &sm20_ipc, + &sm30_issued_ipc, + &sm30_issue_slots, + &sm30_issue_slot_utilization, + &sm30_warp_execution_efficiency, + &sm35_warp_nonpred_execution_efficiency, }; #undef _SM @@ -322,6 +400,9 @@ nvc0_hw_metric_get_queries(struct nvc0_screen *screen) struct nouveau_device *dev = screen->base.device; switch (screen->base.class_3d) { + case GM200_3D_CLASS: + case GM107_3D_CLASS: + return sm50_hw_metric_queries; case NVF0_3D_CLASS: return sm35_hw_metric_queries; case NVE4_3D_CLASS: @@ -341,6 +422,9 @@ nvc0_hw_metric_get_num_queries(struct nvc0_screen *screen) struct nouveau_device *dev = screen->base.device; switch (screen->base.class_3d) { + case GM200_3D_CLASS: + case GM107_3D_CLASS: + return ARRAY_SIZE(sm50_hw_metric_queries); case NVF0_3D_CLASS: return ARRAY_SIZE(sm35_hw_metric_queries); case NVE4_3D_CLASS: @@ -386,11 +470,11 @@ nvc0_hw_metric_destroy_query(struct nvc0_context *nvc0, FREE(hmq); } -static boolean +static bool nvc0_hw_metric_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) { struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq); - boolean ret = false; + bool ret = false; unsigned i; for (i = 0; i < hmq->num_queries; i++) { @@ -416,9 +500,9 @@ sm20_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8]) { switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) { case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY: - /* (active_warps / active_cycles) / max. number of warps on a MP */ + /* ((active_warps / active_cycles) / max. number of warps on a MP) * 100 */ if (res64[1]) - return (res64[0] / (double)res64[1]) / 48; + return ((res64[0] / (double)res64[1]) / 48) * 100; break; case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY: /* (branch / (branch + divergent_branch)) * 100 */ @@ -509,9 +593,9 @@ sm30_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8]) { switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) { case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY: - /* (active_warps / active_cycles) / max. number of warps on a MP */ + /* ((active_warps / active_cycles) / max. number of warps on a MP) * 100 */ if (res64[1]) - return (res64[0] / (double)res64[1]) / 64; + return ((res64[0] / (double)res64[1]) / 64) * 100; break; case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY: return sm20_hw_metric_calc_result(hq, res64); @@ -545,6 +629,12 @@ sm30_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8]) if (res64[2]) return (res64[0] + res64[1]) / (double)res64[2]; break; + case NVC0_HW_METRIC_QUERY_WARP_EXECUTION_EFFICIENCY: + /* thread_inst_executed / (inst_executed * max. number of threads per + * wrap) * 100 */ + if (res64[0]) + return (res64[1] / ((double)res64[0] * 32)) * 100; + break; default: debug_printf("invalid metric type: %d\n", hq->base.type - NVC0_HW_METRIC_QUERY(0)); @@ -553,9 +643,25 @@ sm30_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8]) return 0; } -static boolean +static uint64_t +sm35_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8]) +{ + switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) { + case NVC0_HW_METRIC_QUERY_WARP_NONPRED_EXECUTION_EFFICIENCY: + /* not_predicated_off_thread_inst_executed / (inst_executed * max. number + * of threads per wrap) * 100 */ + if (res64[0]) + return (res64[1] / ((double)res64[0] * 32)) * 100; + break; + default: + return sm30_hw_metric_calc_result(hq, res64); + } + return 0; +} + +static bool nvc0_hw_metric_get_query_result(struct nvc0_context *nvc0, - struct nvc0_hw_query *hq, boolean wait, + struct nvc0_hw_query *hq, bool wait, union pipe_query_result *result) { struct nvc0_hw_metric_query *hmq = nvc0_hw_metric_query(hq); @@ -564,7 +670,7 @@ nvc0_hw_metric_get_query_result(struct nvc0_context *nvc0, union pipe_query_result results[8] = {}; uint64_t res64[8] = {}; uint64_t value = 0; - boolean ret = false; + bool ret = false; unsigned i; for (i = 0; i < hmq->num_queries; i++) { @@ -576,7 +682,11 @@ nvc0_hw_metric_get_query_result(struct nvc0_context *nvc0, } switch (screen->base.class_3d) { + case GM200_3D_CLASS: + case GM107_3D_CLASS: case NVF0_3D_CLASS: + value = sm35_hw_metric_calc_result(hq, res64); + break; case NVE4_3D_CLASS: value = sm30_hw_metric_calc_result(hq, res64); break; @@ -648,12 +758,15 @@ nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id, if (id < count) { if (screen->compute) { - if (screen->base.class_3d <= NVF0_3D_CLASS) { + if (screen->base.class_3d <= GM200_3D_CLASS) { const struct nvc0_hw_metric_query_cfg **queries = nvc0_hw_metric_get_queries(screen); + const struct nvc0_hw_metric_cfg *cfg = + nvc0_hw_metric_get_cfg(queries[id]->type); - info->name = nvc0_hw_metric_query_get_name(queries[id]->type); + info->name = cfg->name; info->query_type = NVC0_HW_METRIC_QUERY(queries[id]->type); + info->type = cfg->type; info->group_id = NVC0_HW_METRIC_QUERY_GROUP; return 1; }