From 91227a1e177a579adf0fd2d53b356618de374e9a Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 29 Jul 2019 17:43:55 -0400 Subject: [PATCH] radeonsi/gfx10: add global use_ngg and use_ngg_streamout flags Reviewed-by: Samuel Pitoiset Acked-by: Pierre-Eric Pelloux-Prayer --- src/gallium/drivers/radeonsi/si_get.c | 2 +- src/gallium/drivers/radeonsi/si_gfx_cs.c | 12 ++++++------ src/gallium/drivers/radeonsi/si_pipe.c | 4 +++- src/gallium/drivers/radeonsi/si_pipe.h | 2 ++ src/gallium/drivers/radeonsi/si_query.c | 4 ++-- src/gallium/drivers/radeonsi/si_shader.c | 10 +++++----- src/gallium/drivers/radeonsi/si_state_draw.c | 9 +++++---- .../drivers/radeonsi/si_state_shaders.c | 12 ++++++++---- .../drivers/radeonsi/si_state_streamout.c | 18 +++++++++--------- 9 files changed, 41 insertions(+), 32 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c index 15777392555..f0eed6df30b 100644 --- a/src/gallium/drivers/radeonsi/si_get.c +++ b/src/gallium/drivers/radeonsi/si_get.c @@ -161,7 +161,7 @@ static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return 1; case PIPE_CAP_QUERY_SO_OVERFLOW: - return sscreen->info.chip_class <= GFX9; + return !sscreen->use_ngg_streamout; case PIPE_CAP_POST_DEPTH_COVERAGE: return sscreen->info.chip_class >= GFX10; diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index 1560e3a2df3..b30839d25b5 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -162,11 +162,11 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, si_emit_streamout_end(ctx); ctx->streamout.suspended = true; - /* Since streamout uses GDS on gfx10, we need to make - * GDS idle when we leave the IB, otherwise another - * process might overwrite it while our shaders are busy. + /* Since NGG streamout uses GDS, we need to make GDS + * idle when we leave the IB, otherwise another process + * might overwrite it while our shaders are busy. */ - if (ctx->chip_class >= GFX10) + if (ctx->screen->use_ngg_streamout) wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; } } @@ -303,7 +303,7 @@ void si_allocate_gds(struct si_context *sctx) if (sctx->gds) return; - assert(sctx->chip_class >= GFX10); /* for gfx10 streamout */ + assert(sctx->screen->use_ngg_streamout); /* 4 streamout GDS counters. * We need 256B (64 dw) of GDS, otherwise streamout hangs. @@ -405,7 +405,7 @@ void si_begin_new_gfx_cs(struct si_context *ctx) si_mark_atom_dirty(ctx, &ctx->atoms.s.dpbb_state); si_mark_atom_dirty(ctx, &ctx->atoms.s.stencil_ref); si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_map); - if (ctx->chip_class < GFX10) + if (!ctx->screen->use_ngg_streamout) si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable); si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond); /* CLEAR_STATE disables all window rectangles. */ diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 28f23b26be6..1de2b3dd624 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -501,7 +501,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, if (!sctx->border_color_map) goto fail; - sctx->ngg = sctx->chip_class >= GFX10; + sctx->ngg = sscreen->use_ngg; /* Initialize context functions used by graphics and compute. */ if (sctx->chip_class >= GFX10) @@ -1154,6 +1154,8 @@ radeonsi_screen_create_impl(struct radeon_winsys *ws, sscreen->info.family == CHIP_RAVEN; sscreen->has_dcc_constant_encode = sscreen->info.family == CHIP_RAVEN2 || sscreen->info.chip_class >= GFX10; + sscreen->use_ngg = sscreen->info.chip_class >= GFX10; + sscreen->use_ngg_streamout = sscreen->info.chip_class >= GFX10; /* Only enable primitive binning on APUs by default. */ if (sscreen->info.chip_class >= GFX10) { diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 5b09bb2b90f..ddc1ce0c6de 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -505,6 +505,8 @@ struct si_screen { bool dpbb_allowed; bool dfsm_allowed; bool llvm_has_working_vgpr_indexing; + bool use_ngg; + bool use_ngg_streamout; struct { #define OPT_BOOL(name, dflt, description) bool name:1; diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c index 920febadba2..53cedb5b83c 100644 --- a/src/gallium/drivers/radeonsi/si_query.c +++ b/src/gallium/drivers/radeonsi/si_query.c @@ -1013,7 +1013,7 @@ static void si_emit_query_predication(struct si_context *ctx) if (!query) return; - if (ctx->chip_class == GFX10 && + if (ctx->screen->use_ngg_streamout && (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE || query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)) { assert(!"not implemented"); @@ -1100,7 +1100,7 @@ static struct pipe_query *si_create_query(struct pipe_context *ctx, unsigned que query_type != SI_QUERY_TIME_ELAPSED_SDMA)) return si_query_sw_create(query_type); - if (sscreen->info.chip_class >= GFX10 && + if (sscreen->use_ngg_streamout && (query_type == PIPE_QUERY_PRIMITIVES_EMITTED || query_type == PIPE_QUERY_PRIMITIVES_GENERATED || query_type == PIPE_QUERY_SO_STATISTICS || diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 8d3763c15bf..8dd608b5378 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -3460,7 +3460,7 @@ static void si_set_es_return_value_for_gs(struct si_shader_context *ctx) ret = si_insert_input_ptr(ctx, ret, ctx->param_bindless_samplers_and_images, 8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES); - if (ctx->screen->info.chip_class >= GFX10) { + if (ctx->screen->use_ngg) { ret = si_insert_input_ptr(ctx, ret, ctx->param_vs_state_bits, 8 + SI_SGPR_VS_STATE_BITS); } @@ -3666,7 +3666,7 @@ static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, } } - if (ctx->ac.chip_class <= GFX9 && + if (!ctx->screen->use_ngg_streamout && ctx->shader->selector->so.num_outputs) si_llvm_emit_streamout(ctx, outputs, i, 0); @@ -4462,7 +4462,7 @@ static void declare_streamout_params(struct si_shader_context *ctx, struct pipe_stream_output_info *so, struct si_function_info *fninfo) { - if (ctx->ac.chip_class >= GFX10) + if (ctx->screen->use_ngg_streamout) return; /* Streamout SGPRs. */ @@ -5738,7 +5738,7 @@ si_generate_gs_copy_shader(struct si_screen *sscreen, /* Fetch the vertex stream ID.*/ LLVMValueRef stream_id; - if (ctx.ac.chip_class <= GFX9 && gs_selector->so.num_outputs) + if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) stream_id = si_unpack_param(&ctx, ctx.param_streamout_config, 24, 2); else stream_id = ctx.i32_0; @@ -5798,7 +5798,7 @@ si_generate_gs_copy_shader(struct si_screen *sscreen, } /* Streamout and exports. */ - if (ctx.ac.chip_class <= GFX9 && gs_selector->so.num_outputs) { + if (!sscreen->use_ngg_streamout && gs_selector->so.num_outputs) { si_llvm_emit_streamout(&ctx, outputs, gsinfo->num_outputs, stream); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 8254d7ba2a3..118d87e4734 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -586,10 +586,11 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx) struct radeon_cmdbuf *cs = sctx->gfx_cs; enum pipe_prim_type rast_prim = sctx->current_rast_prim; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + bool use_ngg = sctx->screen->use_ngg; if (likely(rast_prim == sctx->last_rast_prim && rs->pa_sc_line_stipple == sctx->last_sc_line_stipple && - (sctx->chip_class <= GFX9 || + (!use_ngg || rs->flatshade_first == sctx->last_flatshade_first))) return; @@ -610,13 +611,13 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx) radeon_set_context_reg(cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out); sctx->context_roll = true; - if (sctx->chip_class >= GFX10) { + if (use_ngg) { sctx->current_vs_state &= C_VS_STATE_OUTPRIM; sctx->current_vs_state |= S_VS_STATE_OUTPRIM(gs_out); } } - if (sctx->chip_class >= GFX10) { + if (use_ngg) { unsigned vtx_index = rs->flatshade_first ? 0 : gs_out; sctx->current_vs_state &= C_VS_STATE_PROVOKING_VTX_INDEX; sctx->current_vs_state |= S_VS_STATE_PROVOKING_VTX_INDEX(vtx_index); @@ -662,7 +663,7 @@ static void si_emit_vs_state(struct si_context *sctx, } /* For NGG: */ - if (sctx->chip_class >= GFX10 && + if (sctx->screen->use_ngg && sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] != R_00B230_SPI_SHADER_USER_DATA_GS_0) { radeon_set_sh_reg(cs, diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index d3c3677d82b..18cdc989cf8 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1413,8 +1413,10 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, S_00B12C_OC_LDS_EN(oc_lds_en) | S_00B12C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0); - if (sscreen->info.chip_class <= GFX9) { + if (sscreen->info.chip_class <= GFX9) rsrc1 |= S_00B128_SGPRS((shader->config.num_sgprs - 1) / 8); + + if (!sscreen->use_ngg_streamout) { rsrc2 |= S_00B12C_SO_BASE0_EN(!!shader->selector->so.stride[0]) | S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) | S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) | @@ -2453,7 +2455,9 @@ static void si_init_shader_selector_async(void *job, int thread_index) si_parse_next_shader_property(&sel->info, sel->so.num_outputs != 0, &shader->key); - if (sscreen->info.chip_class >= GFX10 && + + if (sscreen->use_ngg && + (!sel->so.num_outputs || sscreen->use_ngg_streamout) && ((sel->type == PIPE_SHADER_VERTEX && !shader->key.as_ls && !shader->key.as_es) || sel->type == PIPE_SHADER_TESS_EVAL || @@ -2537,7 +2541,7 @@ static void si_init_shader_selector_async(void *job, int thread_index) /* The GS copy shader is always pre-compiled. */ if (sel->type == PIPE_SHADER_GEOMETRY && - (sscreen->info.chip_class <= GFX9 || sel->tess_turns_off_ngg)) { + (!sscreen->use_ngg || sel->tess_turns_off_ngg)) { sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug); if (!sel->gs_copy_shader) { fprintf(stderr, "radeonsi: can't create GS copy shader\n"); @@ -2993,7 +2997,7 @@ static void si_update_tess_uses_prim_id(struct si_context *sctx) static bool si_update_ngg(struct si_context *sctx) { - if (sctx->chip_class <= GFX9) + if (!sctx->screen->use_ngg) return false; bool new_ngg = true; diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c index da8c5465488..ae91c55e0c2 100644 --- a/src/gallium/drivers/radeonsi/si_state_streamout.c +++ b/src/gallium/drivers/radeonsi/si_state_streamout.c @@ -50,7 +50,7 @@ si_create_so_target(struct pipe_context *ctx, return NULL; } - unsigned buf_filled_size_size = sctx->chip_class >= GFX10 ? 8 : 4; + unsigned buf_filled_size_size = sctx->screen->use_ngg_streamout ? 8 : 4; u_suballocator_alloc(sctx->allocator_zeroed_memory, buf_filled_size_size, 4, &t->buf_filled_size_offset, (struct pipe_resource**)&t->buf_filled_size); @@ -127,7 +127,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, SI_CONTEXT_INV_VCACHE; /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */ - if (sctx->chip_class >= GFX10) { + if (sctx->screen->use_ngg_streamout) { sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; /* Wait now. This is needed to make sure that GDS is not @@ -146,7 +146,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, * start writing to the targets. */ if (num_targets) { - if (sctx->chip_class >= GFX10) + if (sctx->screen->use_ngg_streamout) si_allocate_gds(sctx); sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | @@ -197,7 +197,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, struct pipe_shader_buffer sbuf; sbuf.buffer = targets[i]->buffer; - if (sctx->chip_class >= GFX10) { + if (sctx->screen->use_ngg_streamout) { sbuf.buffer_offset = targets[i]->buffer_offset; sbuf.buffer_size = targets[i]->buffer_size; } else { @@ -370,7 +370,7 @@ static void si_emit_streamout_begin(struct si_context *sctx) void si_emit_streamout_end(struct si_context *sctx) { - if (sctx->chip_class >= GFX10) { + if (sctx->screen->use_ngg_streamout) { gfx10_emit_streamout_end(sctx); return; } @@ -423,7 +423,7 @@ void si_emit_streamout_end(struct si_context *sctx) static void si_emit_streamout_enable(struct si_context *sctx) { - assert(sctx->chip_class < GFX10); + assert(!sctx->screen->use_ngg_streamout); radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2); radeon_emit(sctx->gfx_cs, @@ -449,7 +449,7 @@ static void si_set_streamout_enable(struct si_context *sctx, bool enable) (sctx->streamout.enabled_mask << 8) | (sctx->streamout.enabled_mask << 12); - if (sctx->chip_class < GFX10 && + if (!sctx->screen->use_ngg_streamout && ((old_strmout_en != si_get_strmout_en(sctx)) || (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask))) si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable); @@ -458,7 +458,7 @@ static void si_set_streamout_enable(struct si_context *sctx, bool enable) void si_update_prims_generated_query_state(struct si_context *sctx, unsigned type, int diff) { - if (sctx->chip_class < GFX10 && + if (!sctx->screen->use_ngg_streamout && type == PIPE_QUERY_PRIMITIVES_GENERATED) { bool old_strmout_en = si_get_strmout_en(sctx); @@ -479,7 +479,7 @@ void si_init_streamout_functions(struct si_context *sctx) sctx->b.stream_output_target_destroy = si_so_target_destroy; sctx->b.set_stream_output_targets = si_set_streamout_targets; - if (sctx->chip_class >= GFX10) { + if (sctx->screen->use_ngg_streamout) { sctx->atoms.s.streamout_begin.emit = gfx10_emit_streamout_begin; } else { sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin; -- 2.30.2