From 9ea90ffb98fbb5eac4cec5660a1ca2253c3a3e25 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Thu, 11 Jan 2018 10:22:04 +0100 Subject: [PATCH] broadcom/vc4: Add support for HW perfmon The V3D engine provides several perf counters. Implement ->get_driver_query_[group_]info() so that these counters are exposed through the GL_AMD_performance_monitor extension. Signed-off-by: Boris Brezillon Signed-off-by: Eric Anholt --- src/gallium/drivers/vc4/vc4_context.h | 18 ++ src/gallium/drivers/vc4/vc4_job.c | 7 + src/gallium/drivers/vc4/vc4_query.c | 228 ++++++++++++++++++++++++-- src/gallium/drivers/vc4/vc4_screen.c | 7 + src/gallium/drivers/vc4/vc4_screen.h | 1 + 5 files changed, 249 insertions(+), 12 deletions(-) diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h index cd404d42c3b..e0317a0e0c0 100644 --- a/src/gallium/drivers/vc4/vc4_context.h +++ b/src/gallium/drivers/vc4/vc4_context.h @@ -219,6 +219,13 @@ struct vc4_job_key { struct pipe_surface *zsbuf; }; +struct vc4_hwperfmon { + uint32_t id; + uint64_t last_seqno; + uint8_t events[DRM_VC4_MAX_PERF_COUNTERS]; + uint64_t counters[DRM_VC4_MAX_PERF_COUNTERS]; +}; + /** * A complete bin/render job. * @@ -309,6 +316,9 @@ struct vc4_job { /** Any flags to be passed in drm_vc4_submit_cl.flags. */ uint32_t flags; + /* Performance monitor attached to this job. */ + struct vc4_hwperfmon *perfmon; + struct vc4_job_key key; }; @@ -390,6 +400,8 @@ struct vc4_context { struct pipe_viewport_state viewport; struct vc4_constbuf_stateobj constbuf[PIPE_SHADER_TYPES]; struct vc4_vertexbuf_stateobj vertexbuf; + + struct vc4_hwperfmon *perfmon; /** @} */ }; @@ -447,6 +459,12 @@ vc4_sampler_state(struct pipe_sampler_state *psampler) return (struct vc4_sampler_state *)psampler; } +int vc4_get_driver_query_group_info(struct pipe_screen *pscreen, + unsigned index, + struct pipe_driver_query_group_info *info); +int vc4_get_driver_query_info(struct pipe_screen *pscreen, unsigned index, + struct pipe_driver_query_info *info); + struct pipe_context *vc4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags); void vc4_draw_init(struct pipe_context *pctx); diff --git a/src/gallium/drivers/vc4/vc4_job.c b/src/gallium/drivers/vc4/vc4_job.c index 5d18cb9bb82..41c274ca1b3 100644 --- a/src/gallium/drivers/vc4/vc4_job.c +++ b/src/gallium/drivers/vc4/vc4_job.c @@ -92,6 +92,9 @@ vc4_job_create(struct vc4_context *vc4) job->last_gem_handle_hindex = ~0; + if (vc4->perfmon) + job->perfmon = vc4->perfmon; + return job; } @@ -455,6 +458,8 @@ vc4_job_submit(struct vc4_context *vc4, struct vc4_job *job) submit.shader_rec_count = job->shader_rec_count; submit.uniforms = (uintptr_t)job->uniforms.base; submit.uniforms_size = cl_offset(&job->uniforms); + if (job->perfmon) + submit.perfmonid = job->perfmon->id; assert(job->draw_min_x != ~0 && job->draw_min_y != ~0); submit.min_x_tile = job->draw_min_x / job->tile_width; @@ -487,6 +492,8 @@ vc4_job_submit(struct vc4_context *vc4, struct vc4_job *job) warned = true; } else if (!ret) { vc4->last_emit_seqno = submit.seqno; + if (job->perfmon) + job->perfmon->last_seqno = submit.seqno; } } diff --git a/src/gallium/drivers/vc4/vc4_query.c b/src/gallium/drivers/vc4/vc4_query.c index ddf8f8fb0c2..6e4681e93cc 100644 --- a/src/gallium/drivers/vc4/vc4_query.c +++ b/src/gallium/drivers/vc4/vc4_query.c @@ -22,8 +22,9 @@ */ /** - * Stub support for occlusion queries. + * Expose V3D HW perf counters. * + * We also have code to fake support for occlusion queries. * Since we expose support for GL 2.0, we have to expose occlusion queries, * but the spec allows you to expose 0 query counter bits, so we just return 0 * as the result of all our queries. @@ -32,49 +33,252 @@ struct vc4_query { - uint8_t pad; + unsigned num_queries; + struct vc4_hwperfmon *hwperfmon; }; +static const char *v3d_counter_names[] = { + "FEP-valid-primitives-no-rendered-pixels", + "FEP-valid-primitives-rendered-pixels", + "FEP-clipped-quads", + "FEP-valid-quads", + "TLB-quads-not-passing-stencil-test", + "TLB-quads-not-passing-z-and-stencil-test", + "TLB-quads-passing-z-and-stencil-test", + "TLB-quads-with-zero-coverage", + "TLB-quads-with-non-zero-coverage", + "TLB-quads-written-to-color-buffer", + "PTB-primitives-discarded-outside-viewport", + "PTB-primitives-need-clipping", + "PTB-primitives-discared-reversed", + "QPU-total-idle-clk-cycles", + "QPU-total-clk-cycles-vertex-coord-shading", + "QPU-total-clk-cycles-fragment-shading", + "QPU-total-clk-cycles-executing-valid-instr", + "QPU-total-clk-cycles-waiting-TMU", + "QPU-total-clk-cycles-waiting-scoreboard", + "QPU-total-clk-cycles-waiting-varyings", + "QPU-total-instr-cache-hit", + "QPU-total-instr-cache-miss", + "QPU-total-uniform-cache-hit", + "QPU-total-uniform-cache-miss", + "TMU-total-text-quads-processed", + "TMU-total-text-cache-miss", + "VPM-total-clk-cycles-VDW-stalled", + "VPM-total-clk-cycles-VCD-stalled", + "L2C-total-cache-hit", + "L2C-total-cache-miss", +}; + +int vc4_get_driver_query_group_info(struct pipe_screen *pscreen, + unsigned index, + struct pipe_driver_query_group_info *info) +{ + struct vc4_screen *screen = vc4_screen(pscreen); + + if (!screen->has_perfmon_ioctl) + return 0; + + if (!info) + return 1; + + if (index > 0) + return 0; + + info->name = "V3D counters"; + info->max_active_queries = DRM_VC4_MAX_PERF_COUNTERS; + info->num_queries = ARRAY_SIZE(v3d_counter_names); + return 1; +} + +int vc4_get_driver_query_info(struct pipe_screen *pscreen, unsigned index, + struct pipe_driver_query_info *info) +{ + struct vc4_screen *screen = vc4_screen(pscreen); + + if (!screen->has_perfmon_ioctl) + return 0; + + if (!info) + return ARRAY_SIZE(v3d_counter_names); + + if (index >= ARRAY_SIZE(v3d_counter_names)) + return 0; + + info->group_id = 0; + info->name = v3d_counter_names[index]; + info->query_type = PIPE_QUERY_DRIVER_SPECIFIC + index; + info->result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_CUMULATIVE; + info->type = PIPE_DRIVER_QUERY_TYPE_UINT64; + info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH; + return 1; +} + static struct pipe_query * -vc4_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index) +vc4_create_batch_query(struct pipe_context *pctx, unsigned num_queries, + unsigned *query_types) { struct vc4_query *query = calloc(1, sizeof(*query)); + struct vc4_hwperfmon *hwperfmon; + unsigned i, nhwqueries = 0; + + if (!query) + return NULL; + + for (i = 0; i < num_queries; i++) { + if (query_types[i] >= PIPE_QUERY_DRIVER_SPECIFIC) + nhwqueries++; + } + + /* We can't mix HW and non-HW queries. */ + if (nhwqueries && nhwqueries != num_queries) + return NULL; + + if (!nhwqueries) + return (struct pipe_query *)query; + + hwperfmon = calloc(1, sizeof(*hwperfmon)); + if (!hwperfmon) + goto err_free_query; + + for (i = 0; i < num_queries; i++) + hwperfmon->events[i] = query_types[i] - + PIPE_QUERY_DRIVER_SPECIFIC; + + query->hwperfmon = hwperfmon; + query->num_queries = num_queries; /* Note that struct pipe_query isn't actually defined anywhere. */ return (struct pipe_query *)query; + +err_free_query: + free(query); + + return NULL; +} + +static struct pipe_query * +vc4_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index) +{ + return vc4_create_batch_query(ctx, 1, &query_type); } static void -vc4_destroy_query(struct pipe_context *ctx, struct pipe_query *query) +vc4_destroy_query(struct pipe_context *pctx, struct pipe_query *pquery) { + struct vc4_context *ctx = vc4_context(pctx); + struct vc4_query *query = (struct vc4_query *)pquery; + + if (query->hwperfmon && query->hwperfmon->id) { + if (query->hwperfmon->id) { + struct drm_vc4_perfmon_destroy req = { }; + + req.id = query->hwperfmon->id; + vc4_ioctl(ctx->fd, DRM_IOCTL_VC4_PERFMON_DESTROY, + &req); + } + + free(query->hwperfmon); + } + free(query); } static boolean -vc4_begin_query(struct pipe_context *ctx, struct pipe_query *query) +vc4_begin_query(struct pipe_context *pctx, struct pipe_query *pquery) { + struct vc4_query *query = (struct vc4_query *)pquery; + struct vc4_context *ctx = vc4_context(pctx); + struct drm_vc4_perfmon_create req = { }; + unsigned i; + int ret; + + if (!query->hwperfmon) + return true; + + /* Only one perfmon can be activated per context. */ + if (ctx->perfmon) + return false; + + /* Reset the counters by destroying the previously allocated perfmon */ + if (query->hwperfmon->id) { + struct drm_vc4_perfmon_destroy destroyreq = { }; + + destroyreq.id = query->hwperfmon->id; + vc4_ioctl(ctx->fd, DRM_IOCTL_VC4_PERFMON_DESTROY, &destroyreq); + } + + for (i = 0; i < query->num_queries; i++) + req.events[i] = query->hwperfmon->events[i]; + + req.ncounters = query->num_queries; + ret = vc4_ioctl(ctx->fd, DRM_IOCTL_VC4_PERFMON_CREATE, &req); + if (ret) + return false; + + query->hwperfmon->id = req.id; + + /* Make sure all pendings jobs are flushed before activating the + * perfmon. + */ + vc4_flush(pctx); + ctx->perfmon = query->hwperfmon; return true; } static bool -vc4_end_query(struct pipe_context *ctx, struct pipe_query *query) +vc4_end_query(struct pipe_context *pctx, struct pipe_query *pquery) { + struct vc4_query *query = (struct vc4_query *)pquery; + struct vc4_context *ctx = vc4_context(pctx); + + if (!query->hwperfmon) + return true; + + if (ctx->perfmon != query->hwperfmon) + return false; + + /* Make sure all pendings jobs are flushed before deactivating the + * perfmon. + */ + vc4_flush(pctx); + ctx->perfmon = NULL; return true; } static boolean -vc4_get_query_result(struct pipe_context *ctx, struct pipe_query *query, +vc4_get_query_result(struct pipe_context *pctx, struct pipe_query *pquery, boolean wait, union pipe_query_result *vresult) { - uint64_t *result = &vresult->u64; + struct vc4_context *ctx = vc4_context(pctx); + struct vc4_query *query = (struct vc4_query *)pquery; + struct drm_vc4_perfmon_get_values req; + unsigned i; + int ret; + + if (!query->hwperfmon) { + vresult->u64 = 0; + return true; + } - *result = 0; + if (!vc4_wait_seqno(ctx->screen, query->hwperfmon->last_seqno, + wait ? PIPE_TIMEOUT_INFINITE : 0, "perfmon")) + return false; + + req.id = query->hwperfmon->id; + req.values_ptr = (uintptr_t)query->hwperfmon->counters; + ret = vc4_ioctl(ctx->fd, DRM_IOCTL_VC4_PERFMON_GET_VALUES, &req); + if (ret) + return false; + + for (i = 0; i < query->num_queries; i++) + vresult->batch[i].u64 = query->hwperfmon->counters[i]; return true; } static void -vc4_set_active_query_state(struct pipe_context *pipe, boolean enable) +vc4_set_active_query_state(struct pipe_context *pctx, boolean enable) { } @@ -82,10 +286,10 @@ void vc4_query_init(struct pipe_context *pctx) { pctx->create_query = vc4_create_query; + pctx->create_batch_query = vc4_create_batch_query; pctx->destroy_query = vc4_destroy_query; pctx->begin_query = vc4_begin_query; pctx->end_query = vc4_end_query; pctx->get_query_result = vc4_get_query_result; - pctx->set_active_query_state = vc4_set_active_query_state; + pctx->set_active_query_state = vc4_set_active_query_state; } - diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c index 5b944bf030b..397be545d9a 100644 --- a/src/gallium/drivers/vc4/vc4_screen.c +++ b/src/gallium/drivers/vc4/vc4_screen.c @@ -680,6 +680,8 @@ vc4_screen_create(int fd, struct renderonly *ro) vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_THREADED_FS); screen->has_madvise = vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_MADVISE); + screen->has_perfmon_ioctl = + vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_PERFMON); if (!vc4_get_chip_info(screen)) goto fail; @@ -706,6 +708,11 @@ vc4_screen_create(int fd, struct renderonly *ro) pscreen->get_compiler_options = vc4_screen_get_compiler_options; pscreen->query_dmabuf_modifiers = vc4_screen_query_dmabuf_modifiers; + if (screen->has_perfmon_ioctl) { + pscreen->get_driver_query_group_info = vc4_get_driver_query_group_info; + pscreen->get_driver_query_info = vc4_get_driver_query_info; + } + return pscreen; fail: diff --git a/src/gallium/drivers/vc4/vc4_screen.h b/src/gallium/drivers/vc4/vc4_screen.h index 09d1c342ed1..0b884423ba5 100644 --- a/src/gallium/drivers/vc4/vc4_screen.h +++ b/src/gallium/drivers/vc4/vc4_screen.h @@ -97,6 +97,7 @@ struct vc4_screen { bool has_threaded_fs; bool has_madvise; bool has_tiling_ioctl; + bool has_perfmon_ioctl; struct vc4_simulator_file *sim_file; }; -- 2.30.2