From: Kristian H. Kristensen <hoegsberg@google.com>
Date: Sat, 15 Feb 2020 00:47:06 +0000 (-0800)
Subject: freedreno/ir3: Lower output precision
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=daa4020948867cc2c9b38d7536a1b73bf79d2745;p=mesa.git

freedreno/ir3: Lower output precision

This lowers mediump FS outputs to fp16 in the ir3 backend. For now
this is a modest improvement, which mostly helps us whittle down the
full mediump work.  Once the GLSL level support lands, the right hand
side of the store output intrinsics will be fp16 expressions and we'll
cancel out the fp16 -> fp32 -> fp16 round trip here.

We've had different attempts at implementing this: rewriting stores in
the GLSL IR, lowering GLSL IR outputs to temporaries and inserting
conversions when writing the temporaries to the outputs.  In the end,
GLSL ends up getting in the way a lot and doing it at the nir level is
easier and still possible since we have the output var precisions.

This part of the fp16 work is more of a step on the way towards full
fp16 support and will add a few extra conversion instructions:

total instructions in shared programs: 8151 -> 8163 (0.15%)
instructions in affected programs: 1187 -> 1199 (1.01%)
helped: 4
HURT: 10

total nops in shared programs: 3146 -> 3152 (0.19%)
nops in affected programs: 563 -> 569 (1.07%)
helped: 5
HURT: 10

total non-nops in shared programs: 5005 -> 5011 (0.12%)
non-nops in affected programs: 92 -> 98 (6.52%)
helped: 0
HURT: 3

total dwords in shared programs: 12832 -> 12800 (-0.25%)
dwords in affected programs: 96 -> 64 (-33.33%)
helped: 1
HURT: 0

total last-baryf in shared programs: 118 -> 115 (-2.54%)
last-baryf in affected programs: 21 -> 18 (-14.29%)
helped: 1
HURT: 0

total full in shared programs: 424 -> 417 (-1.65%)
full in affected programs: 15 -> 8 (-46.67%)
helped: 7
HURT: 0

Tested-by: Marge Bot <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3822>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3822>
---

diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c
index d21f33e5bc5..7c762fffa28 100644
--- a/src/freedreno/ir3/ir3_compiler.c
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -42,6 +42,7 @@ static const struct debug_named_value shader_debug_options[] = {
 #ifdef DEBUG
 	{"schedmsgs",  IR3_DBG_SCHEDMSGS,  "Enable scheduler debug messages"},
 #endif
+	{"nofp16",     IR3_DBG_NOFP16,     "Don't lower mediump to fp16"},
 	DEBUG_NAMED_VALUE_END
 };
 
diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h
index c5213a65b08..2824b3fe5f4 100644
--- a/src/freedreno/ir3/ir3_compiler.h
+++ b/src/freedreno/ir3/ir3_compiler.h
@@ -93,6 +93,7 @@ enum ir3_shader_debug {
 	IR3_DBG_FORCES2EN  = 0x100,
 	IR3_DBG_NOUBOOPT   = 0x200,
 	IR3_DBG_SCHEDMSGS  = 0x400,
+	IR3_DBG_NOFP16     = 0x800,
 };
 
 extern enum ir3_shader_debug ir3_shader_debug;
diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c
index 3d28aec4445..efa9359a976 100644
--- a/src/freedreno/ir3/ir3_shader.c
+++ b/src/freedreno/ir3/ir3_shader.c
@@ -284,6 +284,53 @@ ir3_shader_destroy(struct ir3_shader *shader)
 	free(shader);
 }
 
+static bool
+lower_output_var(nir_shader *nir, int location)
+{
+	nir_foreach_variable(var, &nir->outputs) {
+		if (var->data.driver_location == location &&
+				var->data.precision == GLSL_PRECISION_MEDIUM) {
+			if (glsl_get_base_type(var->type) == GLSL_TYPE_FLOAT)
+				var->type = glsl_float16_type(var->type);
+			/* True when the variable is float16, whether just retyped above or already so */
+			return glsl_get_base_type(var->type) == GLSL_TYPE_FLOAT16;
+		}
+	}
+	/* No mediump output variable has this driver_location */
+	return false;
+}
+
+static void
+lower_mediump_outputs(nir_shader *nir)
+{
+	nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+	assert(impl);
+
+	/* Get rid of old derefs before we change the types of the variables */
+	nir_opt_dce(nir);
+
+	nir_builder b;
+	nir_builder_init(&b, impl);
+	/* Visit every store_output in the entrypoint and narrow those lower_output_var() accepts */
+	nir_foreach_block_safe(block, impl) {
+		nir_foreach_instr_safe(instr, block) {
+			if (instr->type != nir_instr_type_intrinsic)
+				continue;
+
+			nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+			if (intr->intrinsic != nir_intrinsic_store_output)
+				continue;
+			/* The store's base is matched against each output variable's driver_location */
+			if (!lower_output_var(nir, nir_intrinsic_base(intr)))
+				continue;
+			/* Emit an f2f16 of src[0] right before the store and make the store consume it */
+			b.cursor = nir_before_instr(&intr->instr);
+			nir_instr_rewrite_src(&intr->instr, &intr->src[0],
+					nir_src_for_ssa(nir_f2f16(&b, intr->src[0].ssa)));
+		}
+	}
+}
+
 struct ir3_shader *
 ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir)
 {
@@ -297,6 +344,11 @@ ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir)
 	NIR_PASS_V(nir, nir_lower_io, nir_var_all, ir3_glsl_type_size,
 			   (nir_lower_io_options)0);
 
+	if (compiler->gpu_id >= 600 &&
+			nir->info.stage == MESA_SHADER_FRAGMENT &&
+			!(ir3_shader_debug & IR3_DBG_NOFP16))
+		lower_mediump_outputs(nir);
+
 	if (nir->info.stage == MESA_SHADER_FRAGMENT) {
 		/* NOTE: lower load_barycentric_at_sample first, since it
 		 * produces load_barycentric_at_offset: