From e008d4f011b62c38be3e1401315dabc0df7e5b29 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Sun, 14 Apr 2019 22:42:44 +0000 Subject: [PATCH] panfrost: Track BO lifetime with jobs and reference counts This (fairly large) patch continues work surrounding the panfrost_job abstraction to improve job lifetime management. In particular, we add infrastructure to track which BOs are used by a particular job (currently limited to the vertex buffer BOs), to reference count these BOs, and to automatically manage the BOs memory based on the reference count. This set of changes serves as a code cleanup, as a way of future proofing for allowing flushing BOs, and immediately as a bugfix to workaround the missing reference counting for vertex buffer BOs. Meanwhile, there are a few cleanups to vertex buffer handling code itself, so in the short-term, this allows us to remove the costly VBO staging workaround, since this patch addresses the underlying causes. v2: Use pipe_reference for BO reference counting, rather than managing it ourselves. Don't duplicate hash-table key removal. Fix vertex buffer counting. Signed-off-by: Alyssa Rosenzweig --- src/gallium/drivers/panfrost/pan_context.c | 54 +++++++-------- src/gallium/drivers/panfrost/pan_context.h | 7 +- src/gallium/drivers/panfrost/pan_drm.c | 6 ++ src/gallium/drivers/panfrost/pan_job.c | 75 +++++++++++++++++++++ src/gallium/drivers/panfrost/pan_job.h | 16 +++++ src/gallium/drivers/panfrost/pan_resource.c | 67 ++++++++++++++++-- src/gallium/drivers/panfrost/pan_resource.h | 12 ++++ src/gallium/drivers/panfrost/pan_screen.c | 2 +- src/gallium/drivers/panfrost/pan_screen.h | 5 +- 9 files changed, 207 insertions(+), 37 deletions(-) diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c index 0c97c0a2b43..1319b7944ab 100644 --- a/src/gallium/drivers/panfrost/pan_context.c +++ b/src/gallium/drivers/panfrost/pan_context.c @@ -36,6 +36,8 @@ #include "util/u_memory.h" #include "util/u_vbuf.h" #include "util/half_float.h" +#include "util/u_helpers.h" +#include "util/u_format.h" #include "indices/u_primconvert.h" #include "tgsi/tgsi_parse.h" @@ -737,14 +739,18 @@ panfrost_emit_varying_descriptor( * excepting some obscure circumstances */ static void -panfrost_emit_vertex_data(struct panfrost_context *ctx) +panfrost_emit_vertex_data(struct panfrost_context *ctx, struct panfrost_job *job) { - /* TODO: Only update the dirtied buffers */ + /* Staged mali_attr, and index into them. i =/= k, depending on the + * vertex buffer mask */ union mali_attr attrs[PIPE_MAX_ATTRIBS]; + unsigned k = 0; unsigned invocation_count = MALI_NEGATIVE(ctx->payload_tiler.prefix.invocation_count); - for (int i = 0; i < ctx->vertex_buffer_count; ++i) { + for (int i = 0; i < ARRAY_SIZE(ctx->vertex_buffers); ++i) { + if (!(ctx->vb_mask & (1 << i))) continue; + struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[i]; struct panfrost_resource *rsrc = (struct panfrost_resource *) (buf->buffer.resource); @@ -767,8 +773,8 @@ panfrost_emit_vertex_data(struct panfrost_context *ctx) } /* Offset vertex count by draw_start to make sure we upload enough */ - attrs[i].stride = buf->stride; - attrs[i].size = buf->stride * (ctx->payload_vertex.draw_start + invocation_count) + max_src_offset; + attrs[k].stride = buf->stride; + attrs[k].size = buf->stride * (ctx->payload_vertex.draw_start + invocation_count) + max_src_offset; /* Vertex elements are -already- GPU-visible, at * rsrc->gpu. However, attribute buffers must be 64 aligned. If @@ -776,16 +782,19 @@ panfrost_emit_vertex_data(struct panfrost_context *ctx) mali_ptr effective_address = rsrc ? (rsrc->bo->gpu + buf->buffer_offset) : 0; - if (effective_address) { - attrs[i].elements = panfrost_upload_transient(ctx, rsrc->bo->cpu + buf->buffer_offset, attrs[i].size) | MALI_ATTR_LINEAR; + if (effective_address & 63) { + attrs[k].elements = panfrost_upload_transient(ctx, rsrc->bo->cpu + buf->buffer_offset, attrs[i].size) | MALI_ATTR_LINEAR; } else if (effective_address) { - attrs[i].elements = effective_address | MALI_ATTR_LINEAR; + panfrost_job_add_bo(job, rsrc->bo); + attrs[k].elements = effective_address | MALI_ATTR_LINEAR; } else { /* Leave unset? */ } + + ++k; } - ctx->payload_vertex.postfix.attributes = panfrost_upload_transient(ctx, attrs, ctx->vertex_buffer_count * sizeof(union mali_attr)); + ctx->payload_vertex.postfix.attributes = panfrost_upload_transient(ctx, attrs, k * sizeof(union mali_attr)); panfrost_emit_varying_descriptor(ctx, invocation_count); } @@ -807,7 +816,7 @@ panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data) struct panfrost_job *job = panfrost_get_job_for_fbo(ctx); if (with_vertex_data) { - panfrost_emit_vertex_data(ctx); + panfrost_emit_vertex_data(ctx, job); } bool msaa = ctx->rasterizer->base.multisample; @@ -1252,7 +1261,8 @@ panfrost_link_jobs(struct panfrost_context *ctx) static void panfrost_submit_frame(struct panfrost_context *ctx, bool flush_immediate, - struct pipe_fence_handle **fence) + struct pipe_fence_handle **fence, + struct panfrost_job *job) { struct pipe_context *gallium = (struct pipe_context *) ctx; struct panfrost_screen *screen = pan_screen(gallium->screen); @@ -1274,15 +1284,15 @@ panfrost_submit_frame(struct panfrost_context *ctx, bool flush_immediate, #ifndef DRY_RUN bool is_scanout = panfrost_is_scanout(ctx); - int fragment_id = screen->driver->submit_vs_fs_job(ctx, has_draws, is_scanout); + screen->driver->submit_vs_fs_job(ctx, has_draws, is_scanout); /* If visual, we can stall a frame */ if (!flush_immediate) screen->driver->force_flush_fragment(ctx, fence); - screen->last_fragment_id = fragment_id; screen->last_fragment_flushed = false; + screen->last_job = job; /* If readback, flush now (hurts the pipelined performance) */ if (flush_immediate) @@ -1317,7 +1327,7 @@ panfrost_flush( bool flush_immediate = flags & PIPE_FLUSH_END_OF_FRAME; /* Submit the frame itself */ - panfrost_submit_frame(ctx, flush_immediate, fence); + panfrost_submit_frame(ctx, flush_immediate, fence, job); /* Prepare for the next frame */ panfrost_invalidate_frame(ctx); @@ -1793,22 +1803,8 @@ panfrost_set_vertex_buffers( const struct pipe_vertex_buffer *buffers) { struct panfrost_context *ctx = pan_context(pctx); - assert(num_buffers <= PIPE_MAX_ATTRIBS); - - /* XXX: Dirty tracking? etc */ - if (buffers) { - size_t sz = sizeof(buffers[0]) * num_buffers; - ctx->vertex_buffers = malloc(sz); - ctx->vertex_buffer_count = num_buffers; - memcpy(ctx->vertex_buffers, buffers, sz); - } else { - if (ctx->vertex_buffers) { - free(ctx->vertex_buffers); - ctx->vertex_buffers = NULL; - } - ctx->vertex_buffer_count = 0; - } + util_set_vertex_buffers_mask(ctx->vertex_buffers, &ctx->vb_mask, buffers, start_slot, num_buffers); } static void diff --git a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h index d071da1c62f..a6a54277632 100644 --- a/src/gallium/drivers/panfrost/pan_context.h +++ b/src/gallium/drivers/panfrost/pan_context.h @@ -114,6 +114,9 @@ struct panfrost_context { struct panfrost_job *job; struct hash_table *jobs; + /* panfrost_resource -> panfrost_job */ + struct hash_table *write_jobs; + /* Bit mask for supported PIPE_DRAW for this hardware */ unsigned draw_modes; @@ -187,8 +190,8 @@ struct panfrost_context { struct panfrost_vertex_state *vertex; - struct pipe_vertex_buffer *vertex_buffers; - unsigned vertex_buffer_count; + struct pipe_vertex_buffer vertex_buffers[PIPE_MAX_ATTRIBS]; + uint32_t vb_mask; struct panfrost_sampler_state *samplers[PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS]; unsigned sampler_count[PIPE_SHADER_TYPES]; diff --git a/src/gallium/drivers/panfrost/pan_drm.c b/src/gallium/drivers/panfrost/pan_drm.c index 70d8d7498d4..981b1c6ce13 100644 --- a/src/gallium/drivers/panfrost/pan_drm.c +++ b/src/gallium/drivers/panfrost/pan_drm.c @@ -1,5 +1,6 @@ /* * © Copyright 2019 Collabora, Ltd. + * Copyright 2019 Alyssa Rosenzweig * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -138,6 +139,7 @@ panfrost_drm_import_bo(struct panfrost_screen *screen, struct winsys_handle *wha bo->gem_handle = gem_handle; bo->gpu = (mali_ptr) get_bo_offset.offset; + pipe_reference_init(&bo->reference, 1); // TODO map and unmap on demand? mmap_bo.handle = gem_handle; @@ -227,6 +229,7 @@ panfrost_drm_submit_job(struct panfrost_context *ctx, u64 job_desc, int reqs, st } /* TODO: Add here the transient pools */ + /* TODO: Add here the BOs listed in the panfrost_job */ bo_handles[submit.bo_handle_count++] = ctx->shaders.gem_handle; bo_handles[submit.bo_handle_count++] = ctx->scratchpad.gem_handle; bo_handles[submit.bo_handle_count++] = ctx->tiler_heap.gem_handle; @@ -303,6 +306,9 @@ panfrost_drm_force_flush_fragment(struct panfrost_context *ctx, if (!screen->last_fragment_flushed) { drmSyncobjWait(drm->fd, &ctx->out_sync, 1, INT64_MAX, 0, NULL); screen->last_fragment_flushed = true; + + /* The job finished up, so we're safe to clean it up now */ + panfrost_free_job(ctx, screen->last_job); } if (fence) { diff --git a/src/gallium/drivers/panfrost/pan_job.c b/src/gallium/drivers/panfrost/pan_job.c index 66a8b0d4b07..3b03cd650c8 100644 --- a/src/gallium/drivers/panfrost/pan_job.c +++ b/src/gallium/drivers/panfrost/pan_job.c @@ -35,9 +35,32 @@ panfrost_create_job(struct panfrost_context *ctx) job->ctx = ctx; + job->bos = _mesa_set_create(job, + _mesa_hash_pointer, + _mesa_key_pointer_equal); + return job; } +void +panfrost_free_job(struct panfrost_context *ctx, struct panfrost_job *job) +{ + if (!job) + return; + + set_foreach(job->bos, entry) { + struct panfrost_bo *bo = (struct panfrost_bo *)entry->key; + panfrost_bo_unreference(ctx->base.screen, bo); + } + + _mesa_hash_table_remove_key(ctx->jobs, &job->key); + + if (ctx->job == job) + ctx->job = NULL; + + ralloc_free(job); +} + struct panfrost_job * panfrost_get_job(struct panfrost_context *ctx, struct pipe_surface **cbufs, struct pipe_surface *zsbuf) @@ -90,6 +113,53 @@ panfrost_get_job_for_fbo(struct panfrost_context *ctx) return job; } +void +panfrost_job_add_bo(struct panfrost_job *job, struct panfrost_bo *bo) +{ + if (!bo) + return; + + if (_mesa_set_search(job->bos, bo)) + return; + + panfrost_bo_reference(bo); + _mesa_set_add(job->bos, bo); +} + +void +panfrost_flush_jobs_writing_resource(struct panfrost_context *panfrost, + struct pipe_resource *prsc) +{ +#if 0 + struct hash_entry *entry = _mesa_hash_table_search(panfrost->write_jobs, + prsc); + if (entry) { + struct panfrost_job *job = entry->data; + panfrost_job_submit(panfrost, job); + } +#endif + /* TODO stub */ +} + +void +panfrost_flush_jobs_reading_resource(struct panfrost_context *panfrost, + struct pipe_resource *prsc) +{ + struct panfrost_resource *rsc = pan_resource(prsc); + + panfrost_flush_jobs_writing_resource(panfrost, prsc); + + hash_table_foreach(panfrost->jobs, entry) { + struct panfrost_job *job = entry->data; + + if (_mesa_set_search(job->bos, rsc->bo)) { + printf("TODO: submit job for flush\n"); + //panfrost_job_submit(panfrost, job); + continue; + } + } +} + static bool panfrost_job_compare(const void *a, const void *b) { @@ -109,4 +179,9 @@ panfrost_job_init(struct panfrost_context *ctx) ctx->jobs = _mesa_hash_table_create(NULL, panfrost_job_hash, panfrost_job_compare); + + ctx->write_jobs = _mesa_hash_table_create(NULL, + _mesa_hash_pointer, + _mesa_key_pointer_equal); + } diff --git a/src/gallium/drivers/panfrost/pan_job.h b/src/gallium/drivers/panfrost/pan_job.h index 30f1cf4bd5c..1b28084c599 100644 --- a/src/gallium/drivers/panfrost/pan_job.h +++ b/src/gallium/drivers/panfrost/pan_job.h @@ -55,6 +55,8 @@ struct panfrost_job { * bitmask) */ unsigned requirements; + /* BOs referenced -- will be used for flushing logic */ + struct set *bos; }; /* Functions for managing the above */ @@ -62,6 +64,9 @@ struct panfrost_job { struct panfrost_job * panfrost_create_job(struct panfrost_context *ctx); +void +panfrost_free_job(struct panfrost_context *ctx, struct panfrost_job *job); + struct panfrost_job * panfrost_get_job(struct panfrost_context *ctx, struct pipe_surface **cbufs, struct pipe_surface *zsbuf); @@ -72,4 +77,15 @@ panfrost_get_job_for_fbo(struct panfrost_context *ctx); void panfrost_job_init(struct panfrost_context *ctx); +void +panfrost_job_add_bo(struct panfrost_job *job, struct panfrost_bo *bo); + +void +panfrost_flush_jobs_writing_resource(struct panfrost_context *panfrost, + struct pipe_resource *prsc); + +void +panfrost_flush_jobs_reading_resource(struct panfrost_context *panfrost, + struct pipe_resource *prsc); + #endif diff --git a/src/gallium/drivers/panfrost/pan_resource.c b/src/gallium/drivers/panfrost/pan_resource.c index 15d522f1963..73f987c45bc 100644 --- a/src/gallium/drivers/panfrost/pan_resource.c +++ b/src/gallium/drivers/panfrost/pan_resource.c @@ -228,6 +228,7 @@ static struct panfrost_bo * panfrost_create_bo(struct panfrost_screen *screen, const struct pipe_resource *template) { struct panfrost_bo *bo = CALLOC_STRUCT(panfrost_bo); + pipe_reference_init(&bo->reference, 1); /* Based on the usage, figure out what storing will be used. There are * various tradeoffs: @@ -297,6 +298,8 @@ panfrost_resource_create(struct pipe_screen *screen, assert(0); } + util_range_init(&so->valid_buffer_range); + if (template->bind & PIPE_BIND_DISPLAY_TARGET || template->bind & PIPE_BIND_SCANOUT || template->bind & PIPE_BIND_SHARED) { @@ -359,6 +362,22 @@ panfrost_destroy_bo(struct panfrost_screen *screen, struct panfrost_bo *pbo) } } +void +panfrost_bo_reference(struct panfrost_bo *bo) +{ + pipe_reference(NULL, &bo->reference); +} + +void +panfrost_bo_unreference(struct pipe_screen *screen, struct panfrost_bo *bo) +{ + /* When the reference count goes to zero, we need to cleanup */ + + if (pipe_reference(&bo->reference, NULL)) { + panfrost_destroy_bo(pan_screen(screen), bo); + } +} + static void panfrost_resource_destroy(struct pipe_screen *screen, struct pipe_resource *pt) @@ -370,8 +389,9 @@ panfrost_resource_destroy(struct pipe_screen *screen, renderonly_scanout_destroy(rsrc->scanout, pscreen->ro); if (rsrc->bo) - panfrost_destroy_bo(pscreen, rsrc->bo); + panfrost_bo_unreference(screen, rsrc->bo); + util_range_destroy(&rsrc->valid_buffer_range); FREE(rsrc); } @@ -384,7 +404,8 @@ panfrost_transfer_map(struct pipe_context *pctx, struct pipe_transfer **out_transfer) { int bytes_per_pixel = util_format_get_blocksize(resource->format); - struct panfrost_bo *bo = pan_resource(resource)->bo; + struct panfrost_resource *rsrc = pan_resource(resource); + struct panfrost_bo *bo = rsrc->bo; struct panfrost_gtransfer *transfer = CALLOC_STRUCT(panfrost_gtransfer); transfer->base.level = level; @@ -405,6 +426,25 @@ panfrost_transfer_map(struct pipe_context *pctx, panfrost_flush(pctx, NULL, PIPE_FLUSH_END_OF_FRAME); } + if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) { + /* TODO: reallocate */ + //printf("debug: Missed reallocate\n"); + } else if ((usage & PIPE_TRANSFER_WRITE) + && resource->target == PIPE_BUFFER + && !util_ranges_intersect(&rsrc->valid_buffer_range, box->x, box->x + box->width)) { + /* No flush for writes to uninitialized */ + } else if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { + if (usage & PIPE_TRANSFER_WRITE) { + /* STUB: flush reading */ + //printf("debug: missed reading flush %d\n", resource->target); + } else if (usage & PIPE_TRANSFER_READ) { + /* STUB: flush writing */ + //printf("debug: missed writing flush %d (%d-%d)\n", resource->target, box->x, box->x + box->width); + } else { + /* Why are you even mapping?! */ + } + } + if (bo->layout != PAN_LINEAR) { /* Non-linear resources need to be indirectly mapped */ @@ -460,9 +500,9 @@ panfrost_transfer_unmap(struct pipe_context *pctx, /* Gallium expects writeback here, so we tile */ struct panfrost_gtransfer *trans = pan_transfer(transfer); + struct panfrost_resource *prsrc = (struct panfrost_resource *) transfer->resource; if (trans->map) { - struct panfrost_resource *prsrc = (struct panfrost_resource *) transfer->resource; struct panfrost_bo *bo = prsrc->bo; if (transfer->usage & PIPE_TRANSFER_WRITE) { @@ -480,6 +520,11 @@ panfrost_transfer_unmap(struct pipe_context *pctx, free(trans->map); } + + util_range_add(&prsrc->valid_buffer_range, + transfer->box.x, + transfer->box.x + transfer->box.width); + /* Derefence the resource */ pipe_resource_reference(&transfer->resource, NULL); @@ -487,6 +532,20 @@ panfrost_transfer_unmap(struct pipe_context *pctx, free(transfer); } +static void +panfrost_transfer_flush_region(struct pipe_context *pctx, + struct pipe_transfer *transfer, + const struct pipe_box *box) +{ + struct panfrost_resource *rsc = pan_resource(transfer->resource); + + if (transfer->resource->target == PIPE_BUFFER) { + util_range_add(&rsc->valid_buffer_range, + transfer->box.x + box->x, + transfer->box.x + box->x + box->width); + } +} + static struct pb_slab * panfrost_slab_alloc(void *priv, unsigned heap, unsigned entry_size, unsigned group_index) { @@ -564,7 +623,7 @@ static const struct u_transfer_vtbl transfer_vtbl = { .resource_destroy = panfrost_resource_destroy, .transfer_map = panfrost_transfer_map, .transfer_unmap = panfrost_transfer_unmap, - .transfer_flush_region = u_default_transfer_flush_region, + .transfer_flush_region = panfrost_transfer_flush_region, .get_internal_format = panfrost_resource_get_internal_format, .set_stencil = panfrost_resource_set_stencil, .get_stencil = panfrost_resource_get_stencil, diff --git a/src/gallium/drivers/panfrost/pan_resource.h b/src/gallium/drivers/panfrost/pan_resource.h index a1315ab1b43..269268e9d22 100644 --- a/src/gallium/drivers/panfrost/pan_resource.h +++ b/src/gallium/drivers/panfrost/pan_resource.h @@ -30,6 +30,7 @@ #include "pan_screen.h" #include "pan_allocate.h" #include "drm-uapi/drm.h" +#include "util/u_range.h" /* Describes the memory layout of a BO */ @@ -45,6 +46,9 @@ struct panfrost_slice { }; struct panfrost_bo { + struct pipe_reference reference; + + /* Description of the mip levels */ struct panfrost_slice slices[MAX_MIP_LEVELS]; /* Mapping for the entire object (all levels) */ @@ -83,6 +87,12 @@ struct panfrost_bo { int gem_handle; }; +void +panfrost_bo_reference(struct panfrost_bo *bo); + +void +panfrost_bo_unreference(struct pipe_screen *screen, struct panfrost_bo *bo); + struct panfrost_resource { struct pipe_resource base; @@ -90,6 +100,8 @@ struct panfrost_resource { struct renderonly_scanout *scanout; struct panfrost_resource *separate_stencil; + + struct util_range valid_buffer_range; }; static inline struct panfrost_resource * diff --git a/src/gallium/drivers/panfrost/pan_screen.c b/src/gallium/drivers/panfrost/pan_screen.c index 71c6175d069..6d3aca594f1 100644 --- a/src/gallium/drivers/panfrost/pan_screen.c +++ b/src/gallium/drivers/panfrost/pan_screen.c @@ -597,8 +597,8 @@ panfrost_create_screen(int fd, struct renderonly *ro) screen->base.fence_reference = panfrost_fence_reference; screen->base.fence_finish = panfrost_fence_finish; - screen->last_fragment_id = -1; screen->last_fragment_flushed = true; + screen->last_job = NULL; panfrost_resource_screen_init(screen); diff --git a/src/gallium/drivers/panfrost/pan_screen.h b/src/gallium/drivers/panfrost/pan_screen.h index cbadf813675..4e5efa91f22 100644 --- a/src/gallium/drivers/panfrost/pan_screen.h +++ b/src/gallium/drivers/panfrost/pan_screen.h @@ -92,8 +92,11 @@ struct panfrost_screen { /* TODO: Where? */ struct panfrost_resource *display_target; - int last_fragment_id; + /* While we're busy building up the job for frame N, the GPU is + * still busy executing frame N-1. So hold a reference to + * yesterjob */ int last_fragment_flushed; + struct panfrost_job *last_job; }; #endif /* PAN_SCREEN_H */ -- 2.30.2