From 9e2113c6dc132707db19461b77c8001b5475156a Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Thu, 18 Jun 2020 01:06:12 -0400 Subject: [PATCH] radeonsi: set up IBs for preemption - Execute cs_preamble_state as a separate IB with different flags. - Set the PREEMPT flag for the main IB. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeon/radeon_winsys.h | 10 +++ .../drivers/radeonsi/si_cp_reg_shadowing.c | 13 ++-- src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 78 ++++++++++++++++++- src/gallium/winsys/amdgpu/drm/amdgpu_cs.h | 2 + 4 files changed, 95 insertions(+), 8 deletions(-) diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h index 9aba2a9d95f..3ac425aa094 100644 --- a/src/gallium/drivers/radeon/radeon_winsys.h +++ b/src/gallium/drivers/radeon/radeon_winsys.h @@ -498,6 +498,16 @@ struct radeon_winsys { struct radeon_cmdbuf *(*cs_add_parallel_compute_ib)(struct radeon_cmdbuf *cs, bool uses_gds_ordered_append); + /** + * Set up and enable mid command buffer preemption for the command stream. + * + * \param cs Command stream + * \param preamble_ib Non-preemptible preamble IB for the context. + * \param preamble_num_dw Number of dwords in the preamble IB. + */ + bool (*cs_setup_preemption)(struct radeon_cmdbuf *cs, const uint32_t *preamble_ib, + unsigned preamble_num_dw); + /** * Destroy a command stream. * diff --git a/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c b/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c index bf8742d8686..84e4ced02e5 100644 --- a/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c +++ b/src/gallium/drivers/radeonsi/si_cp_reg_shadowing.c @@ -179,12 +179,15 @@ void si_init_cp_reg_shadowing(struct si_context *sctx) /* The register values are shadowed, so we won't need to set them again. 
*/ si_pm4_free_state(sctx, sctx->cs_preamble_state, ~0); - - /* Execute the shadowing preamble as cs_preamble, which will - * load register values from memory. - */ - sctx->cs_preamble_state = shadowing_preamble; + sctx->cs_preamble_state = NULL; si_set_tracked_regs_to_clear_state(sctx); + + /* Set up preemption. The shadowing preamble will be executed as a preamble IB, + * which will load register values from memory on a context switch. NOTE(review): the bool result of cs_setup_preemption is not checked here. + */ + sctx->ws->cs_setup_preemption(sctx->gfx_cs, shadowing_preamble->pm4, + shadowing_preamble->ndw); + si_pm4_free_state(sctx, shadowing_preamble, ~0); /* the amdgpu implementation in this patch copies the dwords into its own buffer and does not keep the pointer */ } } diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c index 05e77b03325..7f097c50cfc 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c @@ -1034,6 +1034,60 @@ amdgpu_cs_add_parallel_compute_ib(struct radeon_cmdbuf *ib, return &cs->compute_ib.base; } +static bool +amdgpu_cs_setup_preemption(struct radeon_cmdbuf *rcs, const uint32_t *preamble_ib, + unsigned preamble_num_dw) +{ + struct amdgpu_ib *ib = amdgpu_ib(rcs); + struct amdgpu_cs *cs = amdgpu_cs_from_ib(ib); + struct amdgpu_winsys *ws = cs->ctx->ws; + struct amdgpu_cs_context *csc[2] = {&cs->csc1, &cs->csc2}; + unsigned size = align(preamble_num_dw * 4, ws->info.ib_alignment); + struct pb_buffer *preamble_bo; + uint32_t *map; + + /* Create the preamble IB buffer. Its size is the byte length (preamble_num_dw * 4) passed through align() with ws->info.ib_alignment. Returns false if the buffer cannot be created. */ + preamble_bo = amdgpu_bo_create(ws, size, ws->info.ib_alignment, + RADEON_DOMAIN_VRAM, + RADEON_FLAG_NO_INTERPROCESS_SHARING | + RADEON_FLAG_GTT_WC | + RADEON_FLAG_READ_ONLY); + if (!preamble_bo) + return false; + + map = (uint32_t*)amdgpu_bo_map(preamble_bo, NULL, + PIPE_TRANSFER_WRITE | RADEON_TRANSFER_TEMPORARY); + if (!map) { /* mapping failed: release the local buffer reference and report failure */ + pb_reference(&preamble_bo, NULL); + return false; + } + + /* Upload the preamble IB: copy the preamble_num_dw caller-provided dwords into the mapped buffer. */ + memcpy(map, preamble_ib, preamble_num_dw * 4); + + /* Pad the IB: append PKT3_NOP_PAD dwords while the dword count still has a bit of ib_pad_dw_mask set. NOTE(review): the padded count is assumed to still fit in size; confirm that ib_alignment covers the pad granularity.
*/ + uint32_t ib_pad_dw_mask = ws->info.ib_pad_dw_mask[cs->ring_type]; + while (preamble_num_dw & ib_pad_dw_mask) + map[preamble_num_dw++] = PKT3_NOP_PAD; + amdgpu_bo_unmap(preamble_bo); + + for (unsigned i = 0; i < 2; i++) { /* apply the same setup to both command-stream contexts, csc1 and csc2 */ + csc[i]->ib[IB_PREAMBLE] = csc[i]->ib[IB_MAIN]; /* start from a copy of the main IB descriptor, then override flags, address and size */ + csc[i]->ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAG_PREAMBLE; + csc[i]->ib[IB_PREAMBLE].va_start = amdgpu_winsys_bo(preamble_bo)->va; + csc[i]->ib[IB_PREAMBLE].ib_bytes = preamble_num_dw * 4; /* stored in bytes and includes the padding; IB_MAIN is converted from dwords at submit time instead */ + + csc[i]->ib[IB_MAIN].flags |= AMDGPU_IB_FLAG_PREEMPT; + } + + assert(!cs->preamble_ib_bo); /* this setup is expected to run at most once per command stream */ + cs->preamble_ib_bo = preamble_bo; + + amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo, RADEON_USAGE_READ, 0, + RADEON_PRIO_IB1); /* amdgpu_cs_flush adds the buffer again after each flush */ + return true; +} + static bool amdgpu_cs_validate(struct radeon_cmdbuf *rcs) { return true; @@ -1446,7 +1500,7 @@ void amdgpu_cs_submit_ib(void *job, int thread_index) if (acs->stop_exec_on_failure && acs->ctx->num_rejected_cs) { r = -ECANCELED; } else { - struct drm_amdgpu_cs_chunk chunks[6]; + struct drm_amdgpu_cs_chunk chunks[7]; /* one more slot than before; this patch adds an optional preamble IB chunk */ unsigned num_chunks = 0; /* BO list */ @@ -1589,6 +1643,14 @@ void amdgpu_cs_submit_ib(void *job, int thread_index) num_chunks++; } + /* Preamble IB: submitted only when its ib_bytes is non-zero, and placed ahead of the main IB chunk. */ + if (cs->ib[IB_PREAMBLE].ib_bytes) { + chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB; + chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4; + chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_PREAMBLE]; + num_chunks++; + } + /* IB */ cs->ib[IB_MAIN].ib_bytes *= 4; /* Convert from dwords to bytes. 
*/ chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB; @@ -1596,10 +1658,13 @@ void amdgpu_cs_submit_ib(void *job, int thread_index) chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_MAIN]; num_chunks++; - if (ws->secure && cs->secure) + if (ws->secure && cs->secure) { + cs->ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAGS_SECURE; cs->ib[IB_MAIN].flags |= AMDGPU_IB_FLAGS_SECURE; - else + } else { + cs->ib[IB_PREAMBLE].flags &= ~AMDGPU_IB_FLAGS_SECURE; cs->ib[IB_MAIN].flags &= ~AMDGPU_IB_FLAGS_SECURE; + } assert(num_chunks <= ARRAY_SIZE(chunks)); @@ -1784,6 +1849,11 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs, if (cs->compute_ib.ib_mapped) amdgpu_get_new_ib(ws, cs, IB_PARALLEL_COMPUTE); + if (cs->preamble_ib_bo) { + amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo, RADEON_USAGE_READ, 0, + RADEON_PRIO_IB1); + } + cs->main.base.used_gart = 0; cs->main.base.used_vram = 0; @@ -1802,6 +1872,7 @@ static void amdgpu_cs_destroy(struct radeon_cmdbuf *rcs) amdgpu_cs_sync_flush(rcs); util_queue_fence_destroy(&cs->flush_completed); p_atomic_dec(&cs->ctx->ws->num_cs); + pb_reference(&cs->preamble_ib_bo, NULL); pb_reference(&cs->main.big_ib_buffer, NULL); FREE(cs->main.base.prev); pb_reference(&cs->compute_ib.big_ib_buffer, NULL); @@ -1829,6 +1900,7 @@ void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *ws) ws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status; ws->base.cs_create = amdgpu_cs_create; ws->base.cs_add_parallel_compute_ib = amdgpu_cs_add_parallel_compute_ib; + ws->base.cs_setup_preemption = amdgpu_cs_setup_preemption; ws->base.cs_destroy = amdgpu_cs_destroy; ws->base.cs_add_buffer = amdgpu_cs_add_buffer; ws->base.cs_validate = amdgpu_cs_validate; diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h index a50257b5b7b..138e0b0adcc 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h @@ -56,6 +56,7 @@ struct amdgpu_cs_buffer { }; enum ib_type { + 
IB_PREAMBLE, IB_MAIN, IB_PARALLEL_COMPUTE, IB_NUM, @@ -151,6 +152,7 @@ struct amdgpu_cs { struct util_queue_fence flush_completed; struct pipe_fence_handle *next_fence; + struct pb_buffer *preamble_ib_bo; }; struct amdgpu_fence { -- 2.30.2