From 3c78215a1cf4e3d58295a6d3171a2c34c51875d5 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Fri, 15 Sep 2017 18:34:48 +0200
Subject: [PATCH] tgsi: clarify the semantics of DFRACEXP
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

The status quo is quite the mess:

1. tgsi_exec will do a per-channel computation, and store the dst[0]
   result (significand) correctly for each channel. The dst[1] result
   (exponent) will be written to the first bit set in the writemask.
   So per-component calculation only works partially.

2. r600 will only do a single computation. It will replicate the
   exponent but not the significand.

3. The docs pretend that there's per-component calculation, but even
   get dst[0] and dst[1] confused.

4. Luckily, st_glsl_to_tgsi only ever emits single-component instructions,
   and kind-of assumes that everything is replicated, generating this for
   the dvec4 case:

     DFRACEXP TEMP[0].xy, TEMP[1].x, CONST[0][0].xyxy
     DFRACEXP TEMP[0].zw, TEMP[1].y, CONST[0][0].zwzw
     DFRACEXP TEMP[2].xy, TEMP[1].z, CONST[0][1].xyxy
     DFRACEXP TEMP[2].zw, TEMP[1].w, CONST[0][1].zwzw

Settle on the simplest behavior, which is single-component calculation
with replication, document it, and adjust tgsi_exec and r600.

Reviewed-by: Marek OlÅ¡Ã¡k <marek.olsak@amd.com>
Tested-by: Dieter NÃ¼tzel <Dieter@nuetzel-hh.de>
---
 src/gallium/auxiliary/tgsi/tgsi_exec.c         | 16 +++++++---------
 src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h |  2 +-
 src/gallium/docs/source/tgsi.rst               | 10 ++++------
 src/gallium/drivers/r600/r600_shader.c         | 14 ++++++++------
 4 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 1264df0c622..2a47f5dfaef 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -3688,17 +3688,15 @@ exec_dfracexp(struct tgsi_exec_machine *mach,
    union tgsi_double_channel dst;
    union tgsi_exec_channel dst_exp;
 
-   if (((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY)) {
-      fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
-      micro_dfracexp(&dst, &dst_exp, &src);
+   fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
+   micro_dfracexp(&dst, &dst_exp, &src);
+   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY)
       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
-      store_dest(mach, &dst_exp, &inst->Dst[1], inst, ffs(inst->Dst[1].Register.WriteMask) - 1, TGSI_EXEC_DATA_INT);
-   }
-   if (((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW)) {
-      fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
-      micro_dfracexp(&dst, &dst_exp, &src);
+   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW)
       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
-      store_dest(mach, &dst_exp, &inst->Dst[1], inst, ffs(inst->Dst[1].Register.WriteMask) - 1, TGSI_EXEC_DATA_INT);
+   for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+      if (inst->Dst[1].Register.WriteMask & (1 << chan))
+         store_dest(mach, &dst_exp, &inst->Dst[1], inst, chan, TGSI_EXEC_DATA_INT);
    }
 }
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h b/src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h
index a4a97711750..3f39afe2196 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h
@@ -212,7 +212,7 @@ OPCODE(1, 1, COMP, DSQRT)
 OPCODE(1, 3, COMP, DMAD)
 OPCODE(1, 1, COMP, DFRAC)
 OPCODE(1, 2, COMP, DLDEXP)
-OPCODE(2, 1, COMP, DFRACEXP)
+OPCODE(2, 1, REPL, DFRACEXP)
 OPCODE(1, 1, COMP, D2I)
 OPCODE(1, 1, COMP, I2D)
 OPCODE(1, 1, COMP, D2U)
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 8633c929b9f..fd78c40ba3c 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -1838,17 +1838,15 @@ two-component vectors with doubled precision in each component.
 
 Like the ``frexp()`` routine in many math libraries, this opcode stores the
 exponent of its source to ``dst0``, and the significand to ``dst1``, such that
-:math:`dst1 \times 2^{dst0} = src` .
+:math:`dst1 \times 2^{dst0} = src` . The results are replicated across
+channels.
 
 .. math::
 
-  dst0.xy = exp(src.xy)
+  dst0.xy = dst.zw = frac(src.xy)
 
-  dst1.xy = frac(src.xy)
+  dst1 = frac(src.xy)
 
-  dst0.zw = exp(src.zw)
-
-  dst1.zw = frac(src.zw)
 
 .. opcode:: DLDEXP - Multiply Number by Integral Power of 2
 
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index c9c922fc02b..188fbc9d47d 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -4045,7 +4045,6 @@ static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
 	struct r600_bytecode_alu alu;
 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
 	int i, j, r;
-	int firsti = write_mask == 0xc ? 2 : 0;
 
 	for (i = 0; i <= 3; i++) {
 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
@@ -4066,15 +4065,18 @@ static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
 			return r;
 	}
 
-	/* MOV first two channels to writemask dst0 */
-	for (i = 0; i <= 1; i++) {
+	/* Replicate significand result across channels. */
+	for (i = 0; i <= 3; i++) {
+		if (!(write_mask & (1 << i)))
+			continue;
+
 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 		alu.op = ALU_OP1_MOV;
-		alu.src[0].chan = i + 2;
+		alu.src[0].chan = (i & 1) + 2;
 		alu.src[0].sel = ctx->temp_reg;
 
-		tgsi_dst(ctx, &inst->Dst[0], firsti + i, &alu.dst);
-		alu.dst.write = (inst->Dst[0].Register.WriteMask >> (firsti + i)) & 1;
+		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+		alu.dst.write = 1;
 		alu.last = 1;
 		r = r600_bytecode_add_alu(ctx->bc, &alu);
 		if (r)
-- 
2.30.2