From 6944f991761367fc1cc3c1d490f284623955d791 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Tue, 4 Jun 2019 22:08:41 -0400
Subject: [PATCH] radeonsi/gfx10: allocate GDS BOs for streamout

Acked-by: Bas Nieuwenhuizen
---
Reviewer notes (below the three-dash line, so not part of the commit message):

* si_allocate_gds() returns early when sctx->gds is already set; otherwise it
  creates a 256-byte RADEON_DOMAIN_GDS buffer and a 4-byte RADEON_DOMAIN_OA
  buffer, then adds them to sctx->gfx_cs via si_add_gds_to_buffer_list().
* si_begin_new_gfx_cs() now calls si_add_gds_to_buffer_list() instead of
  open-coding the two cs_add_buffer() calls.
* si_set_streamout_targets() calls si_allocate_gds() when num_targets is
  non-zero and chip_class >= GFX10.
* NOTE(review): the buffer_create() results are checked only by assert().
  With asserts compiled out, a NULL sctx->gds makes
  si_add_gds_to_buffer_list() add nothing, and a NULL sctx->gds_oa alone is
  never retried because later calls return early on sctx->gds. Consider
  explicit failure handling.
* NOTE(review): this copy was rebuilt from a whitespace-collapsed paste. The
  line structure matches every hunk header and the diffstat, but intra-line
  indentation (tabs/alignment) is reconstructed - verify against the upstream
  commit before applying, or apply with whitespace checks relaxed.

 .../drivers/radeonsi/gfx10_shader_ngg.c       |  2 +
 src/gallium/drivers/radeonsi/si_gfx_cs.c      | 39 +++++++++++++++----
 src/gallium/drivers/radeonsi/si_pipe.h        |  1 +
 .../drivers/radeonsi/si_state_streamout.c     |  8 +++-
 4 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
index 2e7b42e056d..777873fedd0 100644
--- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@@ -231,6 +231,8 @@ static void build_streamout(struct si_shader_context *ctx,
 	unsigned scratch_offset_base = isgs ? 8 : 4;
 	LLVMValueRef scratch_offset_basev = isgs ? i32_8 : i32_4;
 
+	ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", 256);
+
 	/* Determine the mapping of streamout buffers to vertex streams. */
 	for (unsigned i = 0; i < so->num_outputs; ++i) {
 		unsigned buf = so->output[i].output_buffer;
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 13ef470af3c..3c323fbafdf 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -284,20 +284,43 @@ static void si_begin_gfx_cs_debug(struct si_context *ctx)
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
 }
 
-void si_begin_new_gfx_cs(struct si_context *ctx)
+static void si_add_gds_to_buffer_list(struct si_context *sctx)
 {
-	if (ctx->is_debug)
-		si_begin_gfx_cs_debug(ctx);
-
-	if (ctx->gds) {
-		ctx->ws->cs_add_buffer(ctx->gfx_cs, ctx->gds,
+	if (sctx->gds) {
+		sctx->ws->cs_add_buffer(sctx->gfx_cs, sctx->gds,
 				       RADEON_USAGE_READWRITE, 0, 0);
-		if (ctx->gds_oa) {
-			ctx->ws->cs_add_buffer(ctx->gfx_cs, ctx->gds_oa,
+		if (sctx->gds_oa) {
+			sctx->ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa,
 					       RADEON_USAGE_READWRITE, 0, 0);
 		}
 	}
+}
+
+void si_allocate_gds(struct si_context *sctx)
+{
+	struct radeon_winsys *ws = sctx->ws;
+
+	if (sctx->gds)
+		return;
+
+	assert(sctx->chip_class >= GFX10); /* for gfx10 streamout */
+
+	/* 4 streamout GDS counters.
+	 * We need 256B (64 dw) of GDS, otherwise streamout hangs.
+	 */
+	sctx->gds = ws->buffer_create(ws, 256, 4, RADEON_DOMAIN_GDS, 0);
+	sctx->gds_oa = ws->buffer_create(ws, 4, 1, RADEON_DOMAIN_OA, 0);
+
+	assert(sctx->gds && sctx->gds_oa);
+	si_add_gds_to_buffer_list(sctx);
+}
+
+void si_begin_new_gfx_cs(struct si_context *ctx)
+{
+	if (ctx->is_debug)
+		si_begin_gfx_cs_debug(ctx);
 
+	si_add_gds_to_buffer_list(ctx);
 
 	/* Always invalidate caches at the beginning of IBs, because external
 	 * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 874b1bf4cd0..e3c9151e87e 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1379,6 +1379,7 @@ void si_init_screen_get_functions(struct si_screen *sscreen);
 /* si_gfx_cs.c */
 void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
 		     struct pipe_fence_handle **fence);
+void si_allocate_gds(struct si_context *ctx);
 void si_begin_new_gfx_cs(struct si_context *ctx);
 void si_need_gfx_cs_space(struct si_context *ctx);
 void si_unref_sdma_uploads(struct si_context *sctx);
diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c
index 1eb06b7430b..da8c5465488 100644
--- a/src/gallium/drivers/radeonsi/si_state_streamout.c
+++ b/src/gallium/drivers/radeonsi/si_state_streamout.c
@@ -145,9 +145,13 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 
 	/* All readers of the streamout targets need to be finished before we can
 	 * start writing to the targets. */
-	if (num_targets)
+	if (num_targets) {
+		if (sctx->chip_class >= GFX10)
+			si_allocate_gds(sctx);
+
 		sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-		               SI_CONTEXT_CS_PARTIAL_FLUSH;
+			       SI_CONTEXT_CS_PARTIAL_FLUSH;
+	}
 
 	/* Streamout buffers must be bound in 2 places:
 	 * 1) in VGT by setting the VGT_STRMOUT registers
-- 
2.30.2