From 064f395a890158ea5502f685ef6c22e504e3f0df Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Fri, 8 May 2020 12:28:08 -0700 Subject: [PATCH] freedreno: Tell the kernel that all BOs are for writing. Using non-write flags is pretty dubious -- it means the kernel tracking an array of read-only consumers of the BO and having exclusive consumers wait on each reader's fence. It allows multiple readers through dma-bufs to do work in parallel, but at the cost of kernel CPU time and memory management of the shared array. Other drivers have dropped this distinction since dma-buf sharing is usually producer-consumer, not producer-two-consumers, and the userspace and kernel space tracking is expensive. For us, this lets us drop the flags passed in for relocs and tracked in the ringbuffer reloc lists. The end result of the flags reduction work is drawoverhead uniforms test throughput 2.37195% +/- 0.365579% (n=15) Part-of: --- src/freedreno/drm/freedreno_ringbuffer.h | 33 +++++-------- src/freedreno/drm/msm_ringbuffer.c | 36 ++++---------- src/freedreno/drm/msm_ringbuffer_sp.c | 49 +++++++------------ src/gallium/drivers/freedreno/a6xx/fd6_pack.h | 1 - 4 files changed, 39 insertions(+), 80 deletions(-) diff --git a/src/freedreno/drm/freedreno_ringbuffer.h b/src/freedreno/drm/freedreno_ringbuffer.h index 6cfbf1a69ef..a5dec1462c2 100644 --- a/src/freedreno/drm/freedreno_ringbuffer.h +++ b/src/freedreno/drm/freedreno_ringbuffer.h @@ -165,14 +165,21 @@ struct fd_reloc { #define FD_RELOC_READ 0x0001 #define FD_RELOC_WRITE 0x0002 #define FD_RELOC_DUMP 0x0004 - uint32_t flags; uint32_t offset; uint32_t or; int32_t shift; uint32_t orhi; /* used for a5xx+ */ }; -#define FD_RELOC_FLAGS_INIT FD_RELOC_READ +/* We always mark BOs for write, instead of tracking it across reloc + * sources in userspace. On the kernel side, this means we track a single + * excl fence in the BO instead of a set of read fences, which is cheaper. + * The downside is that a dmabuf-shared device won't be able to read in + * parallel with a read-only access by freedreno, but most other drivers + * have decided that that usecase isn't important enough to do this + * tracking, as well. + */ +#define FD_RELOC_FLAGS_INIT (FD_RELOC_READ | FD_RELOC_WRITE) /* NOTE: relocs are 2 dwords on a5xx+ */ @@ -228,12 +235,11 @@ OUT_RING(struct fd_ringbuffer *ring, uint32_t data) } /* - * NOTE: OUT_RELOC*() is 2 dwords (64b) on a5xx+ + * NOTE: OUT_RELOC() is 2 dwords (64b) on a5xx+ */ - static inline void -__out_reloc(struct fd_ringbuffer *ring, struct fd_bo *bo, - uint32_t offset, uint64_t or, int32_t shift, uint32_t flags) +OUT_RELOC(struct fd_ringbuffer *ring, struct fd_bo *bo, + uint32_t offset, uint64_t or, int32_t shift) { if (LOG_DWORDS) { fprintf(stderr, "ring[%p]: OUT_RELOC %04x: %p+%u << %d", ring, @@ -242,7 +248,6 @@ __out_reloc(struct fd_ringbuffer *ring, struct fd_bo *bo, debug_assert(offset < fd_bo_size(bo)); fd_ringbuffer_reloc(ring, &(struct fd_reloc){ .bo = bo, - .flags = flags, .offset = offset, .or = or, .shift = shift, @@ -250,19 +255,7 @@ __out_reloc(struct fd_ringbuffer *ring, struct fd_bo *bo, }); } -static inline void -OUT_RELOC(struct fd_ringbuffer *ring, struct fd_bo *bo, - uint32_t offset, uint64_t or, int32_t shift) -{ - __out_reloc(ring, bo, offset, or, shift, 0); -} - -static inline void -OUT_RELOCW(struct fd_ringbuffer *ring, struct fd_bo *bo, - uint32_t offset, uint64_t or, int32_t shift) -{ - __out_reloc(ring, bo, offset, or, shift, FD_RELOC_WRITE); -} +#define OUT_RELOCW OUT_RELOC static inline void OUT_RB(struct fd_ringbuffer *ring, struct fd_ringbuffer *target) diff --git a/src/freedreno/drm/msm_ringbuffer.c b/src/freedreno/drm/msm_ringbuffer.c index 8a15c559bb6..2ff4a3dfc56 100644 --- a/src/freedreno/drm/msm_ringbuffer.c +++ b/src/freedreno/drm/msm_ringbuffer.c @@ -97,15 +97,6 @@ cmd_free(struct msm_cmd *cmd) free(cmd); } -/* for _FD_RINGBUFFER_OBJECT rb's we need to track the bo's and flags to - * later copy into the submit when the stateobj rb is later referenced by - * a regular rb: - */ -struct msm_reloc_bo { - struct fd_bo *bo; - unsigned flags; -}; - struct msm_ringbuffer { struct fd_ringbuffer base; @@ -116,7 +107,7 @@ struct msm_ringbuffer { /* for _FD_RINGBUFFER_OBJECT case: */ struct { struct fd_pipe *pipe; - DECLARE_ARRAY(struct msm_reloc_bo, reloc_bos); + DECLARE_ARRAY(struct fd_bo *, reloc_bos); struct set *ring_set; }; /* for other cases: */ @@ -138,7 +129,7 @@ static struct fd_ringbuffer * msm_ringbuffer_init( /* add (if needed) bo to submit and return index: */ static uint32_t -append_bo(struct msm_submit *submit, struct fd_bo *bo, uint32_t flags) +append_bo(struct msm_submit *submit, struct fd_bo *bo) { struct msm_bo *msm_bo = to_msm_bo(bo); uint32_t idx; @@ -174,9 +165,6 @@ append_bo(struct msm_submit *submit, struct fd_bo *bo, uint32_t flags) msm_bo->idx = idx; } - if (flags & FD_RELOC_WRITE) - submit->submit_bos[idx].flags |= MSM_SUBMIT_BO_WRITE; - return idx; } @@ -278,14 +266,10 @@ handle_stateobj_relocs(struct msm_submit *submit, struct msm_ringbuffer *ring) for (unsigned i = 0; i < cmd->nr_relocs; i++) { unsigned idx = cmd->relocs[i].reloc_idx; - struct fd_bo *bo = ring->u.reloc_bos[idx].bo; - unsigned flags = 0; - - if (ring->u.reloc_bos[idx].flags & MSM_SUBMIT_BO_WRITE) - flags |= FD_RELOC_WRITE; + struct fd_bo *bo = ring->u.reloc_bos[idx]; relocs[i] = cmd->relocs[i]; - relocs[i].reloc_idx = append_bo(submit, bo, flags); + relocs[i].reloc_idx = append_bo(submit, bo); } return relocs; @@ -343,7 +327,7 @@ msm_submit_flush(struct fd_submit *submit, int in_fence_fd, cmds[i].type = MSM_SUBMIT_CMD_IB_TARGET_BUF; cmds[i].submit_idx = - append_bo(msm_submit, msm_ring->ring_bo, 0); + append_bo(msm_submit, msm_ring->ring_bo); cmds[i].submit_offset = msm_ring->offset; cmds[i].size = offset_bytes(ring->cur, ring->start); cmds[i].pad = 0; @@ -359,7 +343,7 @@ msm_submit_flush(struct fd_submit *submit, int in_fence_fd, cmds[i].type = MSM_SUBMIT_CMD_IB_TARGET_BUF; } cmds[i].submit_idx = append_bo(msm_submit, - msm_ring->u.cmds[j]->ring_bo, 0); + msm_ring->u.cmds[j]->ring_bo); cmds[i].submit_offset = msm_ring->offset; cmds[i].size = msm_ring->u.cmds[j]->size; cmds[i].pad = 0; @@ -518,8 +502,7 @@ msm_ringbuffer_emit_reloc(struct fd_ringbuffer *ring, if (ring->flags & _FD_RINGBUFFER_OBJECT) { unsigned idx = APPEND(&msm_ring->u, reloc_bos); - msm_ring->u.reloc_bos[idx].bo = fd_bo_ref(reloc->bo); - msm_ring->u.reloc_bos[idx].flags = reloc->flags; + msm_ring->u.reloc_bos[idx] = fd_bo_ref(reloc->bo); /* this gets fixed up at submit->flush() time, since this state- * object rb can be used with many different submits @@ -531,7 +514,7 @@ msm_ringbuffer_emit_reloc(struct fd_ringbuffer *ring, struct msm_submit *msm_submit = to_msm_submit(msm_ring->u.submit); - reloc_idx = append_bo(msm_submit, reloc->bo, reloc->flags); + reloc_idx = append_bo(msm_submit, reloc->bo); pipe = msm_ring->u.submit->pipe; } @@ -603,7 +586,6 @@ msm_ringbuffer_emit_reloc_ring(struct fd_ringbuffer *ring, msm_ringbuffer_emit_reloc(ring, &(struct fd_reloc){ .bo = bo, - .flags = 0, .offset = msm_target->offset, }); @@ -646,7 +628,7 @@ msm_ringbuffer_destroy(struct fd_ringbuffer *ring) if (ring->flags & _FD_RINGBUFFER_OBJECT) { for (unsigned i = 0; i < msm_ring->u.nr_reloc_bos; i++) { - fd_bo_del(msm_ring->u.reloc_bos[i].bo); + fd_bo_del(msm_ring->u.reloc_bos[i]); } _mesa_set_destroy(msm_ring->u.ring_set, unref_rings); diff --git a/src/freedreno/drm/msm_ringbuffer_sp.c b/src/freedreno/drm/msm_ringbuffer_sp.c index 65bfb2bde80..6bab90b190a 100644 --- a/src/freedreno/drm/msm_ringbuffer_sp.c +++ b/src/freedreno/drm/msm_ringbuffer_sp.c @@ -74,29 +74,19 @@ struct msm_cmd_sp { unsigned size; }; -/* for _FD_RINGBUFFER_OBJECT rb's we need to track the bo's and flags to - * later copy into the submit when the stateobj rb is later referenced by - * a regular rb: - */ -struct msm_reloc_bo_sp { - struct fd_bo *bo; - unsigned flags; -}; - struct msm_ringbuffer_sp { struct fd_ringbuffer base; /* for FD_RINGBUFFER_STREAMING rb's which are sub-allocated */ unsigned offset; -// TODO check disasm.. hopefully compilers CSE can realize that -// reloc_bos and cmds are at the same offsets and optimize some -// divergent cases into single case union { - /* for _FD_RINGBUFFER_OBJECT case: */ + /* for _FD_RINGBUFFER_OBJECT case, the array of BOs referenced from + * this one + */ struct { struct fd_pipe *pipe; - DECLARE_ARRAY(struct msm_reloc_bo_sp, reloc_bos); + DECLARE_ARRAY(struct fd_bo *, reloc_bos); }; /* for other cases: */ struct { @@ -116,7 +106,7 @@ static struct fd_ringbuffer * msm_ringbuffer_sp_init( /* add (if needed) bo to submit and return index: */ static uint32_t -msm_submit_append_bo(struct msm_submit_sp *submit, struct fd_bo *bo, uint32_t flags) +msm_submit_append_bo(struct msm_submit_sp *submit, struct fd_bo *bo) { struct msm_bo *msm_bo = to_msm_bo(bo); uint32_t idx; @@ -152,12 +142,6 @@ msm_submit_append_bo(struct msm_submit_sp *submit, struct fd_bo *bo, uint32_t fl msm_bo->idx = idx; } - STATIC_ASSERT(FD_RELOC_READ == MSM_SUBMIT_BO_READ); - STATIC_ASSERT(FD_RELOC_WRITE == MSM_SUBMIT_BO_WRITE); - STATIC_ASSERT(FD_RELOC_DUMP == MSM_SUBMIT_BO_DUMP); - submit->submit_bos[idx].flags |= - flags & (MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE | MSM_SUBMIT_BO_DUMP); - return idx; } @@ -259,7 +243,7 @@ msm_submit_sp_flush(struct fd_submit *submit, int in_fence_fd, for (unsigned i = 0; i < primary->u.nr_cmds; i++) { cmds[i].type = MSM_SUBMIT_CMD_BUF; cmds[i].submit_idx = msm_submit_append_bo(msm_submit, - primary->u.cmds[i].ring_bo, 0); + primary->u.cmds[i].ring_bo); cmds[i].submit_offset = primary->offset; cmds[i].size = primary->u.cmds[i].size; cmds[i].pad = 0; @@ -403,15 +387,14 @@ msm_ringbuffer_sp_emit_reloc(struct fd_ringbuffer *ring, if (ring->flags & _FD_RINGBUFFER_OBJECT) { unsigned idx = APPEND(&msm_ring->u, reloc_bos); - msm_ring->u.reloc_bos[idx].bo = fd_bo_ref(reloc->bo); - msm_ring->u.reloc_bos[idx].flags = reloc->flags; + msm_ring->u.reloc_bos[idx] = fd_bo_ref(reloc->bo); pipe = msm_ring->u.pipe; } else { struct msm_submit_sp *msm_submit = to_msm_submit_sp(msm_ring->u.submit); - msm_submit_append_bo(msm_submit, reloc->bo, reloc->flags); + msm_submit_append_bo(msm_submit, reloc->bo); pipe = msm_ring->u.submit->pipe; } @@ -465,10 +448,8 @@ msm_ringbuffer_sp_emit_reloc_ring(struct fd_ringbuffer *ring, for (unsigned i = 0; i < msm_target->u.nr_reloc_bos; i++) { unsigned idx = APPEND(&msm_ring->u, reloc_bos); - msm_ring->u.reloc_bos[idx].bo = - fd_bo_ref(msm_target->u.reloc_bos[i].bo); - msm_ring->u.reloc_bos[idx].flags = - msm_target->u.reloc_bos[i].flags; + msm_ring->u.reloc_bos[idx] = + fd_bo_ref(msm_target->u.reloc_bos[i]); } } else { // TODO it would be nice to know whether we have already @@ -477,8 +458,7 @@ msm_ringbuffer_sp_emit_reloc_ring(struct fd_ringbuffer *ring, struct msm_submit_sp *msm_submit = to_msm_submit_sp(msm_ring->u.submit); for (unsigned i = 0; i < msm_target->u.nr_reloc_bos; i++) { - msm_submit_append_bo(msm_submit, msm_target->u.reloc_bos[i].bo, - msm_target->u.reloc_bos[i].flags); + msm_submit_append_bo(msm_submit, msm_target->u.reloc_bos[i]); } } @@ -502,7 +482,7 @@ msm_ringbuffer_sp_destroy(struct fd_ringbuffer *ring) if (ring->flags & _FD_RINGBUFFER_OBJECT) { for (unsigned i = 0; i < msm_ring->u.nr_reloc_bos; i++) { - fd_bo_del(msm_ring->u.reloc_bos[i].bo); + fd_bo_del(msm_ring->u.reloc_bos[i]); } free(msm_ring->u.reloc_bos); @@ -533,6 +513,11 @@ msm_ringbuffer_sp_init(struct msm_ringbuffer_sp *msm_ring, uint32_t size, { struct fd_ringbuffer *ring = &msm_ring->base; + /* We don't do any translation from internal FD_RELOC flags to MSM flags. */ + STATIC_ASSERT(FD_RELOC_READ == MSM_SUBMIT_BO_READ); + STATIC_ASSERT(FD_RELOC_WRITE == MSM_SUBMIT_BO_WRITE); + STATIC_ASSERT(FD_RELOC_DUMP == MSM_SUBMIT_BO_DUMP); + debug_assert(msm_ring->ring_bo); uint8_t *base = fd_bo_map(msm_ring->ring_bo); diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_pack.h b/src/gallium/drivers/freedreno/a6xx/fd6_pack.h index ed1c0003849..6b71c10f4e0 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_pack.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_pack.h @@ -58,7 +58,6 @@ struct fd_reg_pair { if (regs[i].bo) { \ struct fd_reloc reloc = { \ .bo = regs[i].bo, \ - .flags = (regs[i].bo_write ? FD_RELOC_WRITE : 0), \ .offset = regs[i].bo_offset, \ .or = regs[i].value, \ .shift = regs[i].bo_shift, \ -- 2.30.2