From: Marek Olšák Date: Sat, 15 Aug 2020 03:41:13 +0000 (-0400) Subject: radeonsi: don't execute LDS stores for TCS outputs that are never read X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=562b8c1a4793e5e448f9df8f91babcc164051dbd;p=mesa.git radeonsi: don't execute LDS stores for TCS outputs that are never read This is a per-component version of the previous mechanism. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index c60b47fe1c3..ba4db3dc534 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -328,6 +328,7 @@ struct si_shader_info { ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */ ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS]; ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS]; + ubyte output_readmask[PIPE_MAX_SHADER_OUTPUTS]; ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS]; ubyte color_interpolate[2]; @@ -342,13 +343,6 @@ struct si_shader_info { uint num_memory_instructions; /**< sampler, buffer, and image instructions */ - /** - * If a tessellation control shader reads outputs, this describes which ones. - */ - bool reads_pervertex_outputs; - bool reads_perpatch_outputs; - bool reads_tessfactor_outputs; - ubyte colors_read; /**< which color components are read by the FS */ ubyte colors_written; bool reads_samplemask; /**< does fragment shader read sample mask? */ diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c index 3baac9dd2aa..f27623ad514 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c @@ -518,7 +518,6 @@ static void si_nir_store_output_tcs(struct ac_shader_abi *abi, const struct nir_ LLVMValueRef dw_addr, stride; LLVMValueRef buffer, base, addr; LLVMValueRef values[8]; - bool skip_lds_store; bool is_tess_factor = false, is_tess_inner = false; driver_location = driver_location / 4; @@ -541,23 +540,16 @@ static void si_nir_store_output_tcs(struct ac_shader_abi *abi, const struct nir_ dw_addr = get_tcs_out_current_patch_offset(ctx); dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, vertex_index, param_index, name, index); - - skip_lds_store = !info->reads_pervertex_outputs; } else { dw_addr = get_tcs_out_current_patch_data_offset(ctx); dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr, vertex_index, param_index, name, index); - skip_lds_store = !info->reads_perpatch_outputs; - if (is_const && const_index == 0) { int name = info->output_semantic_name[driver_location]; /* Always write tess factors into LDS for the TCS epilog. */ if (name == TGSI_SEMANTIC_TESSINNER || name == TGSI_SEMANTIC_TESSOUTER) { - /* The epilog doesn't read LDS if invocation 0 defines tess factors. */ - skip_lds_store = !info->reads_tessfactor_outputs && - ctx->shader->selector->info.tessfactors_are_def_in_all_invocs; is_tess_factor = true; is_tess_inner = name == TGSI_SEMANTIC_TESSINNER; } @@ -585,7 +577,10 @@ static void si_nir_store_output_tcs(struct ac_shader_abi *abi, const struct nir_ } /* Skip LDS stores if there is no LDS read of this output. */ - if (!skip_lds_store) + if (info->output_readmask[driver_location + chan / 4] & (1 << (chan % 4)) || + /* The epilog reads LDS if invocation 0 doesn't define tess factors. */ + (is_tess_factor && + !ctx->shader->selector->info.tessfactors_are_def_in_all_invocs)) lshs_lds_store(ctx, chan, dw_addr, value); value = ac_to_integer(&ctx->ac, value); diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index d9e3ac41868..d9b96f21dcb 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -64,16 +64,18 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr } unsigned mask, bit_size; - bool dual_slot; + bool dual_slot, is_output_load; if (nir_intrinsic_infos[intr->intrinsic].index_map[NIR_INTRINSIC_WRMASK] > 0) { mask = nir_intrinsic_write_mask(intr); /* store */ bit_size = nir_src_bit_size(intr->src[0]); dual_slot = bit_size == 64 && nir_src_num_components(intr->src[0]) >= 3; + is_output_load = false; } else { mask = nir_ssa_def_components_read(&intr->dest.ssa); /* load */ bit_size = intr->dest.ssa.bit_size; dual_slot = bit_size == 64 && intr->dest.ssa.num_components >= 3; + is_output_load = !is_input; } /* Convert the 64-bit component mask to a 32-bit component mask. */ @@ -152,7 +154,15 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr info->output_semantic_name[loc] = name; info->output_semantic_index[loc] = index + i; - if (slot_mask) { + if (is_output_load) { + /* Output loads have only a few things that we need to track. */ + info->output_readmask[loc] |= slot_mask; + + if (info->processor == PIPE_SHADER_FRAGMENT && + nir_intrinsic_io_semantics(intr).fb_fetch_output) + info->uses_fbfetch = true; + } else if (slot_mask) { + /* Output stores. */ if (info->processor == PIPE_SHADER_GEOMETRY) { unsigned gs_streams = (uint32_t)nir_intrinsic_io_semantics(intr).gs_streams << (nir_intrinsic_component(intr) * 2); @@ -418,28 +428,12 @@ static void scan_instruction(const struct nir_shader *nir, struct si_shader_info case nir_intrinsic_load_interpolated_input: scan_io_usage(info, intr, true); break; + case nir_intrinsic_load_output: + case nir_intrinsic_load_per_vertex_output: case nir_intrinsic_store_output: case nir_intrinsic_store_per_vertex_output: scan_io_usage(info, intr, false); break; - case nir_intrinsic_load_output: { - unsigned location = nir_intrinsic_io_semantics(intr).location; - - if (nir->info.stage == MESA_SHADER_TESS_CTRL) { - if (location == VARYING_SLOT_TESS_LEVEL_INNER || - location == VARYING_SLOT_TESS_LEVEL_OUTER) - info->reads_tessfactor_outputs = true; - else - info->reads_perpatch_outputs = true; - } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { - if (nir_intrinsic_io_semantics(intr).fb_fetch_output) - info->uses_fbfetch = true; - } - break; - } - case nir_intrinsic_load_per_vertex_output: - info->reads_pervertex_outputs = true; - break; case nir_intrinsic_load_deref: case nir_intrinsic_store_deref: case nir_intrinsic_interp_deref_at_centroid: @@ -576,6 +570,10 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf } } } + + /* Trim output read masks based on write masks. */ + for (unsigned i = 0; i < info->num_outputs; i++) + info->output_readmask[i] &= info->output_usagemask[i]; } static void si_nir_opts(struct nir_shader *nir, bool first)