From: Marek Olšák
Date: Fri, 24 May 2019 22:48:39 +0000 (-0400)
Subject: radeonsi/gfx10: set DLC for loads when GLC is set
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=4bdf44724fa8bd88d8f5ed2d8627a4b5ba02cbc0;p=mesa.git

radeonsi/gfx10: set DLC for loads when GLC is set

This fixes L1 shader array cache coherency.

Acked-by: Bas Nieuwenhuizen
---

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index ecb72395867..5089463e2db 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -1107,6 +1107,15 @@ LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
 	return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
 }
 
+static LLVMValueRef get_cache_policy(struct ac_llvm_context *ctx,
+				     bool load, bool glc, bool slc)
+{
+	return LLVMConstInt(ctx->i32,
+			    (glc ? ac_glc : 0) +
+			    (slc ? ac_slc : 0) +
+			    (ctx->chip_class >= GFX10 && glc && load ? ac_dlc : 0), 0);
+}
+
 static void
 ac_build_llvm7_buffer_store_common(struct ac_llvm_context *ctx,
 				   LLVMValueRef rsrc,
@@ -1165,7 +1174,7 @@ ac_build_llvm8_buffer_store_common(struct ac_llvm_context *ctx,
 	args[idx++] = vindex ? vindex : ctx->i32_0;
 	args[idx++] = voffset ? voffset : ctx->i32_0;
 	args[idx++] = soffset ? soffset : ctx->i32_0;
-	args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
+	args[idx++] = get_cache_policy(ctx, false, glc, slc);
 	unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
 	const char *indexing_kind = structurized ? "struct" : "raw";
 	char name[256], type_name[8];
@@ -1350,7 +1359,7 @@ ac_build_llvm8_buffer_load_common(struct ac_llvm_context *ctx,
 	args[idx++] = vindex ? vindex : ctx->i32_0;
 	args[idx++] = voffset ? voffset : ctx->i32_0;
 	args[idx++] = soffset ? soffset : ctx->i32_0;
-	args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
+	args[idx++] = get_cache_policy(ctx, true, glc, slc);
 	unsigned func = !ac_has_vec3_support(ctx->chip_class, use_format) && num_channels == 3 ? 4 : num_channels;
 	const char *indexing_kind = structurized ? "struct" : "raw";
 	char name[256], type_name[8];
@@ -1404,6 +1413,8 @@ ac_build_buffer_load(struct ac_llvm_context *ctx,
 				HAVE_LLVM >= 0x0800 ? "llvm.amdgcn.s.buffer.load.f32" :
 						      "llvm.SI.load.const.v4i32";
 			unsigned num_args = HAVE_LLVM >= 0x0800 ? 3 : 2;
+			/* TODO: set glc+dlc on GFX10 (LLVM support is missing) */
+			assert(!glc || ctx->chip_class < GFX10);
 			LLVMValueRef args[3] = {
 				rsrc,
 				offset,
@@ -1551,7 +1562,7 @@ ac_build_llvm8_tbuffer_load(struct ac_llvm_context *ctx,
 	args[idx++] = voffset ? voffset : ctx->i32_0;
 	args[idx++] = soffset ? soffset : ctx->i32_0;
 	args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx, dfmt, nfmt), 0);
-	args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
+	args[idx++] = get_cache_policy(ctx, true, glc, slc);
 	unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
 	const char *indexing_kind = structurized ? "struct" : "raw";
 	char name[256], type_name[8];
@@ -2049,7 +2060,7 @@ ac_build_llvm8_tbuffer_store(struct ac_llvm_context *ctx,
 	args[idx++] = voffset ? voffset : ctx->i32_0;
 	args[idx++] = soffset ? soffset : ctx->i32_0;
 	args[idx++] = LLVMConstInt(ctx->i32, ac_get_tbuffer_format(ctx, dfmt, nfmt), 0);
-	args[idx++] = LLVMConstInt(ctx->i32, (glc ? 1 : 0) + (slc ? 2 : 0), 0);
+	args[idx++] = get_cache_policy(ctx, false, glc, slc);
 	unsigned func = !ac_has_vec3_support(ctx->chip_class, true) && num_channels == 3 ? 4 : num_channels;
 	const char *indexing_kind = structurized ? "struct" : "raw";
 	char name[256], type_name[8];
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index 17e701b21f8..4917315cc50 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -527,8 +527,9 @@ enum ac_image_dim {
 
 /* These cache policy bits match the definitions used by the LLVM intrinsics. */
 enum ac_image_cache_policy {
-	ac_glc = 1 << 0,
-	ac_slc = 1 << 1,
+	ac_glc = 1 << 0, /* per-CU cache control */
+	ac_slc = 1 << 1, /* global L2 cache control */
+	ac_dlc = 1 << 2, /* per-shader-array cache control */
 };
 
 struct ac_image_args {
@@ -536,7 +537,7 @@ struct ac_image_args {
 	enum ac_atomic_op atomic : 4; /* for the ac_image_atomic opcode */
 	enum ac_image_dim dim : 3;
 	unsigned dmask : 4;
-	unsigned cache_policy : 2;
+	unsigned cache_policy : 3;
 	bool unorm : 1;
 	bool level_zero : 1;
 	unsigned attributes; /* additional call-site specific AC_FUNC_ATTRs */
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 636fd4035c8..73941ba6f45 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -1519,6 +1519,7 @@ static LLVMValueRef extract_vector_range(struct ac_llvm_context *ctx, LLVMValueR
 
 static unsigned get_cache_policy(struct ac_nir_context *ctx,
 				 enum gl_access_qualifier access,
+				 bool load,
 				 bool may_store_unaligned,
 				 bool writeonly_memory)
 {
@@ -1535,7 +1536,8 @@ static unsigned get_cache_policy(struct ac_nir_context *ctx,
 	      */
 	     writeonly_memory ||
 	     access & (ACCESS_COHERENT | ACCESS_VOLATILE))) {
-		cache_policy |= ac_glc;
+		cache_policy |= ac_glc |
+				(ctx->ac.chip_class >= GFX10 && load ? ac_dlc : 0);
 	}
 
 	return cache_policy;
@@ -1549,7 +1551,7 @@ static void visit_store_ssbo(struct ac_nir_context *ctx,
 	unsigned writemask = nir_intrinsic_write_mask(instr);
 	enum gl_access_qualifier access = nir_intrinsic_access(instr);
 	bool writeonly_memory = access & ACCESS_NON_READABLE;
-	unsigned cache_policy = get_cache_policy(ctx, access, false, writeonly_memory);
+	unsigned cache_policy = get_cache_policy(ctx, access, false, false, writeonly_memory);
 
 	LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi,
 					get_src(ctx, instr->src[1]), true);
@@ -1713,7 +1715,7 @@ static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx,
 	int elem_size_bytes = instr->dest.ssa.bit_size / 8;
 	int num_components = instr->num_components;
 	enum gl_access_qualifier access = nir_intrinsic_access(instr);
-	unsigned cache_policy = get_cache_policy(ctx, access, false, false);
+	unsigned cache_policy = get_cache_policy(ctx, access, true, false, false);
 
 	LLVMValueRef offset = get_src(ctx, instr->src[1]);
 	LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi,
@@ -2452,7 +2454,7 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx,
 
 	struct ac_image_args args = {};
 
-	args.cache_policy = get_cache_policy(ctx, access, false, false);
+	args.cache_policy = get_cache_policy(ctx, access, true, false, false);
 
 	if (dim == GLSL_SAMPLER_DIM_BUF) {
 		unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
@@ -2510,7 +2512,7 @@ static void visit_image_store(struct ac_nir_context *ctx,
 	bool writeonly_memory = access & ACCESS_NON_READABLE;
 	struct ac_image_args args = {};
 
-	args.cache_policy = get_cache_policy(ctx, access, true, writeonly_memory);
+	args.cache_policy = get_cache_policy(ctx, access, false, true, writeonly_memory);
 
 	if (dim == GLSL_SAMPLER_DIM_BUF) {
 		LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, true, false);
diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
index 8d6a7dc8d67..455af80e206 100644
--- a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
+++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
@@ -315,7 +315,7 @@ static void image_fetch_coords(
 
 static unsigned get_cache_policy(struct si_shader_context *ctx,
 				 const struct tgsi_full_instruction *inst,
-				 bool atomic, bool may_store_unaligned,
+				 bool load, bool atomic, bool may_store_unaligned,
 				 bool writeonly_memory)
 {
 	unsigned cache_policy = 0;
@@ -330,8 +330,10 @@ static unsigned get_cache_policy(struct si_shader_context *ctx,
 	     * evicting L1 cache lines that may be needed by other
 	     * instructions. */
 	    writeonly_memory ||
-	    inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE)))
-		cache_policy |= ac_glc;
+	    inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE))) {
+		cache_policy |= ac_glc |
+				(ctx->screen->info.chip_class >= GFX10 && load ? ac_dlc : 0);
+	}
 
 	if (inst->Memory.Qualifier & TGSI_MEMORY_STREAM_CACHE_POLICY)
 		cache_policy |= ac_slc;
@@ -530,7 +532,7 @@ static void load_emit(
 			info->uses_bindless_buffer_atomic,
 			info->uses_bindless_image_store |
 			info->uses_bindless_image_atomic);
-	args.cache_policy = get_cache_policy(ctx, inst, false, false, false);
+	args.cache_policy = get_cache_policy(ctx, inst, true, false, false, false);
 
 	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
 		/* Don't use SMEM for shader buffer loads, because LLVM doesn't
@@ -711,6 +713,7 @@ static void store_emit(
 	bool is_image = inst->Dst[0].Register.File != TGSI_FILE_BUFFER;
 
 	args.cache_policy = get_cache_policy(ctx, inst,
+					     false, /* load */
 					     false, /* atomic */
 					     is_image, /* may_store_unaligned */
 					     writeonly_memory);
@@ -833,7 +836,7 @@ static void atomic_emit(
 	args.data[num_data++] =
 		ac_to_integer(&ctx->ac, lp_build_emit_fetch(bld_base, inst, 2, 0));
 
-	args.cache_policy = get_cache_policy(ctx, inst, true, false, false);
+	args.cache_policy = get_cache_policy(ctx, inst, false, true, false, false);
 
 	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
 		args.resource = shader_buffer_fetch_rsrc(ctx, &inst->Src[0], false);
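
A minimal standalone C sketch of the cache-policy bit arithmetic the patch's
get_cache_policy helpers perform, for readers who just want the rule "DLC only
on GFX10, only for loads, and only together with GLC". The gfx10/load/glc/slc
booleans and the *_bit constants are simplified stand-ins for the real
chip_class checks and the ac_image_cache_policy enum, not driver code.

    /* Sketch: not part of the commit; mirrors the logic of the new helpers. */
    #include <assert.h>
    #include <stdbool.h>
    #include <stdio.h>

    enum cache_policy_bits {
       glc_bit = 1 << 0, /* per-CU (vector L0) cache control */
       slc_bit = 1 << 1, /* global L2 cache control */
       dlc_bit = 1 << 2, /* per-shader-array L1 cache control (GFX10+) */
    };

    /* DLC is set only on GFX10+, only for loads, and only when GLC is set. */
    static unsigned cache_policy(bool gfx10, bool load, bool glc, bool slc)
    {
       return (glc ? glc_bit : 0) |
              (slc ? slc_bit : 0) |
              (gfx10 && glc && load ? dlc_bit : 0);
    }

    int main(void)
    {
       /* Coherent load on GFX10: GLC and DLC are both set. */
       assert(cache_policy(true, true, true, false) == (glc_bit | dlc_bit));
       /* Coherent store on GFX10: DLC stays clear. */
       assert(cache_policy(true, false, true, false) == glc_bit);
       /* Pre-GFX10 load: behavior unchanged, only GLC. */
       assert(cache_policy(false, true, true, false) == glc_bit);
       printf("cache-policy sketch OK\n");
       return 0;
    }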