From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Thu, 11 Jun 2020 19:54:01 +0000 (+0200)
Subject: radv: add support for MRTs compaction to avoid holes
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=7a5e6fd25f2e132ef4cacc3a5b714c4e153227b0;p=mesa.git

radv: add support for MRTs compaction to avoid holes

SPI_SHADER_COL_FORMAT allocates export memory and CB_SHADER_MASK
map them to higher MRTs if necessary. The hardware allows to remap
MRTs to avoid holes somehow.

For example, if we have a scenario where MRT0 is unused and only
MRT1 and MRT2 are used, SPI_SHADER_COL_FORMAT is 0x77 and
CB_SHADER_MASK/CB_TARGET_MASK are 0x770 (this assumes
SPI_SHADER_UINT16_ABGR is set).

This allows us to remove one workaround that was added for fixing
GPU hangs with DXVK. I think this is because SPI_SHADER_COL_FORMAT
expects contiguous MRTs to be allocated.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5434>
---

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index de435ff7834..cd2263963fc 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -9979,15 +9979,15 @@ static bool export_fs_mrt_z(isel_context *ctx)
    return true;
 }
 
-static bool export_fs_mrt_color(isel_context *ctx, int slot)
+static bool export_fs_mrt_color(isel_context *ctx, int slot,
+                                unsigned write_mask, Temp *outputs)
 {
    Builder bld(ctx->program, ctx->block);
-   unsigned write_mask = ctx->outputs.mask[slot];
    Operand values[4];
 
    for (unsigned i = 0; i < 4; ++i) {
       if (write_mask & (1 << i)) {
-         values[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
+         values[i] = Operand(outputs[i]);
       } else {
          values[i] = Operand(v1);
       }
@@ -9997,7 +9997,6 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot)
    unsigned enabled_channels = 0;
    aco_opcode compr_op = (aco_opcode)0;
 
-   slot -= FRAG_RESULT_DATA0;
    target = V_008DFC_SQ_EXP_MRT + slot;
    col_format = (ctx->options->key.fs.col_format >> (4 * slot)) & 0xf;
 
@@ -10182,6 +10181,7 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot)
 
 static void create_fs_exports(isel_context *ctx)
 {
+   unsigned compacted_mrt_index = 0;
    bool exported = false;
 
    /* Export depth, stencil and sample mask. */
@@ -10191,9 +10191,15 @@ static void create_fs_exports(isel_context *ctx)
       exported |= export_fs_mrt_z(ctx);
 
    /* Export all color render targets. */
-   for (unsigned i = FRAG_RESULT_DATA0; i < FRAG_RESULT_DATA7 + 1; ++i)
+   for (unsigned i = FRAG_RESULT_DATA0; i < FRAG_RESULT_DATA7 + 1; ++i) {
       if (ctx->outputs.mask[i])
-         exported |= export_fs_mrt_color(ctx, i);
+         if (export_fs_mrt_color(ctx, compacted_mrt_index,
+                                 ctx->outputs.mask[i],
+                                 &ctx->outputs.temps[i * 4u])) {
+            compacted_mrt_index++;
+            exported = true;
+         }
+   }
 
    if (!exported)
       create_null_export(ctx);
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
index 987d1054356..0c81138f63d 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -3586,8 +3586,7 @@ handle_fs_outputs_post(struct radv_shader_context *ctx)
 			values[j] = ac_to_float(&ctx->ac,
 						radv_load_output(ctx, i, j));
 
-		bool ret = si_export_mrt_color(ctx, values,
-					       i - FRAG_RESULT_DATA0,
+		bool ret = si_export_mrt_color(ctx, values, index,
 					       &color_args[index]);
 		if (ret)
 			index++;
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index e5c0723c707..5e7ea7e64ba 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -511,8 +511,10 @@ radv_pipeline_compute_spi_color_formats(struct radv_pipeline *pipeline,
 {
 	RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
 	struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
-	unsigned col_format = 0, is_int8 = 0, is_int10 = 0;
-	unsigned num_targets;
+	unsigned exp_fmt[MAX_RTS] = {0};
+	unsigned is_int8[MAX_RTS] = {0}, is_int10[MAX_RTS] = {0};
+	unsigned col_format = 0;
+	unsigned col_format_is_int8 = 0, col_format_is_int10 = 0;
 
 	for (unsigned i = 0; i < (blend->single_cb_enable ? 1 : subpass->color_count); ++i) {
 		unsigned cf;
@@ -529,42 +531,45 @@ radv_pipeline_compute_spi_color_formats(struct radv_pipeline *pipeline,
 			                                  blend_enable,
 							  blend->need_src_alpha & (1 << i));
 
-			if (format_is_int8(attachment->format))
-				is_int8 |= 1 << i;
-			if (format_is_int10(attachment->format))
-				is_int10 |= 1 << i;
+			is_int8[i] = format_is_int8(attachment->format);
+			is_int10[i] = format_is_int10(attachment->format);
 		}
 
-		col_format |= cf << (4 * i);
+		exp_fmt[i] = cf;
 	}
 
-	if (!(col_format & 0xf) && blend->need_src_alpha & (1 << 0)) {
+	if (!exp_fmt[0] && blend->need_src_alpha & (1 << 0)) {
 		/* When a subpass doesn't have any color attachments, write the
 		 * alpha channel of MRT0 when alpha coverage is enabled because
 		 * the depth attachment needs it.
 		 */
-		col_format |= V_028714_SPI_SHADER_32_AR;
-	}
-
-	/* If the i-th target format is set, all previous target formats must
-	 * be non-zero to avoid hangs.
-	 */
-	num_targets = (util_last_bit(col_format) + 3) / 4;
-	for (unsigned i = 0; i < num_targets; i++) {
-		if (!(col_format & (0xf << (i * 4)))) {
-			col_format |= V_028714_SPI_SHADER_32_R << (i * 4);
-		}
+		exp_fmt[0] = V_028714_SPI_SHADER_32_AR;
 	}
 
 	/* The output for dual source blending should have the same format as
 	 * the first output.
 	 */
-	if (blend->mrt0_is_dual_src)
-		col_format |= (col_format & 0xf) << 4;
+	if (blend->mrt0_is_dual_src) {
+		col_format |= (exp_fmt[0] << 4) | exp_fmt[0];
+		col_format_is_int8 |= (is_int8[0] << 1) | is_int8[0];
+		col_format_is_int10 |= (is_int10[0] << 1) | is_int10[0];
+	} else {
+		/* Remove holes in SPI_SHADER_COL_FORMAT. */
+		unsigned num_color_targets = 0;
+		for (unsigned i = 0; i < MAX_RTS; i++) {
+			if (!exp_fmt[i])
+				continue;
+
+			col_format |= exp_fmt[i] << (4 * num_color_targets);
+			col_format_is_int8 |= is_int8[i] << num_color_targets;
+			col_format_is_int10 |= is_int10[i] << num_color_targets;
+			num_color_targets++;
+		}
+	}
 
 	blend->spi_shader_col_format = col_format;
-	blend->col_format_is_int8 = is_int8;
-	blend->col_format_is_int10 = is_int10;
+	blend->col_format_is_int8 = col_format_is_int8;
+	blend->col_format_is_int10 = col_format_is_int10;
 }
 
 /*
diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c
index eca46c81157..62a5f170f7c 100644
--- a/src/amd/vulkan/radv_shader_info.c
+++ b/src/amd/vulkan/radv_shader_info.c
@@ -846,18 +846,6 @@ radv_nir_shader_info_pass(const struct nir_shader *nir,
 	info->float_controls_mode = nir->info.float_controls_execution_mode;
 
 	if (nir->info.stage == MESA_SHADER_FRAGMENT) {
-		/* If the i-th output is used, all previous outputs must be
-		 * non-zero to match the target format.
-		 * TODO: compact MRT to avoid holes and to remove this
-		 * workaround.
-		 */
-		unsigned num_targets = (util_last_bit(info->ps.cb_shader_mask) + 3) / 4;
-		for (unsigned i = 0; i < num_targets; i++) {
-			if (!(info->ps.cb_shader_mask & (0xf << (i * 4)))) {
-				info->ps.cb_shader_mask |= 0xf << (i * 4);
-			}
-		}
-
 		if (key->fs.is_dual_src) {
 			info->ps.cb_shader_mask |= (info->ps.cb_shader_mask & 0xf) << 4;
 		}