From 1de9ef9c96c1933b20ba1877cad799794e10359d Mon Sep 17 00:00:00 2001 From: =?utf8?q?Timur=20Krist=C3=B3f?= Date: Thu, 26 Sep 2019 17:53:17 +0200 Subject: [PATCH] aco: Set GFX10 DLC bit properly. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The DLC bit is now set to 1 for all loads when GLC is also set, but cleared to 0 for all stores (setting it on stores can cause graphical glitches), and also cleared to 0 for atomics. Signed-off-by: Timur Kristóf Reviewed-by: Daniel Schürmann --- src/amd/compiler/README | 4 ++++ src/amd/compiler/aco_instruction_selection.cpp | 17 +++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/src/amd/compiler/README b/src/amd/compiler/README index 01b8339c547..990eb62baec 100644 --- a/src/amd/compiler/README +++ b/src/amd/compiler/README @@ -105,6 +105,10 @@ The recommendation from AMD devs is to always set these two bits at the same tim as it doesn't make too much sense to set them independently, aside from some circumstances (eg. we needn't set DLC when only one shader array is used). +Stores and atomics always bypass the L1 cache, so they don't support the DLC bit, +and it shouldn't be set in these cases. Setting the DLC bit in these cases can result +in graphical glitches. 
+ # Hardware Bugs ## SMEM corrupts VCCZ on SI/CI diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index d1849d7b92b..6b5111c6f47 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -2983,6 +2983,7 @@ void load_buffer(isel_context *ctx, unsigned num_components, Temp dst, Temp rsrc Builder bld(ctx->program, ctx->block); unsigned num_bytes = dst.size() * 4; + bool dlc = glc && ctx->options->chip_class >= GFX10; aco_opcode op; if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) { @@ -3005,6 +3006,7 @@ void load_buffer(isel_context *ctx, unsigned num_components, Temp dst, Temp rsrc mubuf->operands[2] = soffset; mubuf->offen = (offset.type() == RegType::vgpr); mubuf->glc = glc; + mubuf->dlc = dlc; mubuf->barrier = barrier_buffer; bld.insert(std::move(mubuf)); emit_split_vector(ctx, lower, 2); @@ -3034,6 +3036,7 @@ void load_buffer(isel_context *ctx, unsigned num_components, Temp dst, Temp rsrc mubuf->operands[2] = soffset; mubuf->offen = (offset.type() == RegType::vgpr); mubuf->glc = glc; + mubuf->dlc = dlc; mubuf->barrier = barrier_buffer; mubuf->offset = const_offset; aco_ptr instr = std::move(mubuf); @@ -3087,6 +3090,7 @@ void load_buffer(isel_context *ctx, unsigned num_components, Temp dst, Temp rsrc assert(load->operands[1].getTemp().type() == RegType::sgpr); load->definitions[0] = Definition(dst); load->glc = glc; + load->dlc = dlc; load->barrier = barrier_buffer; assert(ctx->options->chip_class >= GFX8 || !glc); @@ -3623,6 +3627,7 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, Temp coo load->operands[1] = Operand(fmask_desc_ptr); load->definitions[0] = Definition(fmask); load->glc = false; + load->dlc = false; load->dmask = 0x1; load->unrm = true; load->da = da; @@ -3832,6 +3837,7 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr) store->operands[3] = Operand(data); 
store->idxen = true; store->glc = glc; + store->dlc = false; store->disable_wqm = true; store->barrier = barrier_image; ctx->program->needs_exact = true; @@ -3849,6 +3855,7 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr) store->operands[2] = Operand(s4); store->operands[3] = Operand(data); store->glc = glc; + store->dlc = false; store->dmask = (1 << data.size()) - 1; store->unrm = true; store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type)); @@ -3945,6 +3952,7 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) mubuf->offset = 0; mubuf->idxen = true; mubuf->glc = return_previous; + mubuf->dlc = false; /* Not needed for atomics */ mubuf->disable_wqm = true; mubuf->barrier = barrier_image; ctx->program->needs_exact = true; @@ -3962,6 +3970,7 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) if (return_previous) mimg->definitions[0] = Definition(dst); mimg->glc = return_previous; + mimg->dlc = false; /* Not needed for atomics */ mimg->dmask = (1 << data.size()) - 1; mimg->unrm = true; mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type)); @@ -4178,6 +4187,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) store->operands[1].setFixed(m0); store->operands[2] = Operand(write_data); store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); + store->dlc = false; store->disable_wqm = true; store->barrier = barrier_buffer; ctx->block->instructions.emplace_back(std::move(store)); @@ -4195,6 +4205,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) store->offset = start * elem_size_bytes; store->offen = (offset.type() == RegType::vgpr); store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); + store->dlc = false; store->disable_wqm = true; store->barrier = barrier_buffer; ctx->program->needs_exact = true; @@ -4290,6 +4301,7 @@ void 
visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) mubuf->offset = 0; mubuf->offen = (offset.type() == RegType::vgpr); mubuf->glc = return_previous; + mubuf->dlc = false; /* Not needed for atomics */ mubuf->disable_wqm = true; mubuf->barrier = barrier_buffer; ctx->program->needs_exact = true; @@ -4314,6 +4326,7 @@ void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr) Temp addr = get_ssa_temp(ctx, instr->src[0].ssa); bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT); + bool dlc = glc && ctx->options->chip_class >= GFX10; aco_opcode op; if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) { bool global = ctx->options->chip_class >= GFX9; @@ -4338,6 +4351,7 @@ void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr) flat->operands[0] = Operand(addr); flat->operands[1] = Operand(s1); flat->glc = glc; + flat->dlc = dlc; if (dst.type() == RegType::sgpr) { Temp vec = bld.tmp(RegType::vgpr, dst.size()); @@ -4369,6 +4383,7 @@ void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr) load->operands[1] = Operand(0u); load->definitions[0] = Definition(dst); load->glc = glc; + load->dlc = dlc; load->barrier = barrier_buffer; assert(ctx->options->chip_class >= GFX8 || !glc); @@ -4455,6 +4470,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) flat->operands[1] = Operand(s1); flat->operands[2] = Operand(data); flat->glc = glc; + flat->dlc = false; flat->offset = offset; ctx->block->instructions.emplace_back(std::move(flat)); } @@ -7436,6 +7452,7 @@ static void emit_stream_output(isel_context *ctx, } store->offen = true; store->glc = true; + store->dlc = false; store->slc = true; store->can_reorder = true; ctx->block->instructions.emplace_back(std::move(store)); -- 2.30.2