From eb32dad07c92f6693dd84d7af9cda8602fad12ce Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Wed, 16 Jan 2019 17:40:13 -0600
Subject: [PATCH] intel/fs: Don't touch accumulator destination while applying
 regioning alignment rule

In some shaders, you can end up with a stride in the source of a
SHADER_OPCODE_MULH.  One way this can happen is if the MULH is acting on
the top bits of a 64-bit value due to 64-bit integer lowering.  In this
case, the compiler will produce something like this:

mul(8)    acc0<1>UD   g5<8,4,2>UD   0x0004UW      { align1 1Q };
mach(8)   g6<1>UD     g5<8,4,2>UD   0x00000004UD  { align1 1Q AccWrEnable };

The new region fixup pass looks at the MUL and sees a strided source and
unstrided destination and determines that the sequence is illegal.  It
then attempts to fix the illegal stride by replacing the destination of
the MUL with a temporary and emitting a MOV into the accumulator:

mul(8)    g9<2>UD     g5<8,4,2>UD   0x0004UW      { align1 1Q };
mov(8)    acc0<1>UD   g9<8,4,2>UD                 { align1 1Q };
mach(8)   g6<1>UD     g5<8,4,2>UD   0x00000004UD  { align1 1Q AccWrEnable };

Unfortunately, this new sequence isn't correct because MOV accesses the
accumulator with a different precision to MUL and, instead of filling
the bottom 32 bits with the source and zeroing the top 32 bits, it
leaves the top 32 (or maybe 31) bits alone and full of garbage.  When
the MACH comes along and tries to complete the multiplication, the
result is correct in the bottom 32 bits (which we throw away) and
garbage in the top 32 bits which are actually returned by MACH.

This commit does two things:  First, it adds an assert to ensure that we
don't try to rewrite accumulator destinations of MUL instructions so we
can avoid this precision issue.  Second, it modifies
required_dst_byte_stride to require a tightly packed stride so that we
fix up the sources instead and the actual code which gets emitted is
this:

mov(8)    g9<1>UD     g5<8,4,2>UD                 { align1 1Q };
mul(8)    acc0<1>UD   g9<8,8,1>UD   0x0004UW      { align1 1Q };
mach(8)   g6<1>UD     g5<8,4,2>UD   0x00000004UD  { align1 1Q AccWrEnable };

Fixes: efa4e4bc5fc "intel/fs: Introduce regioning lowering pass"
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/intel/compiler/brw_fs_lower_regioning.cpp | 24 ++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp b/src/intel/compiler/brw_fs_lower_regioning.cpp
index cc4163b4c2c..df50993dee6 100644
--- a/src/intel/compiler/brw_fs_lower_regioning.cpp
+++ b/src/intel/compiler/brw_fs_lower_regioning.cpp
@@ -53,7 +53,21 @@ namespace {
    unsigned
    required_dst_byte_stride(const fs_inst *inst)
    {
-      if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
+      if (inst->dst.is_accumulator()) {
+         /* If the destination is an accumulator, insist that we leave the
+          * stride alone.  We cannot "fix" accumulator destinations by writing
+          * to a temporary and emitting a MOV into the original destination.
+          * For multiply instructions (our one use of the accumulator), the
+          * MUL writes the full 66 bits of the accumulator whereas the MOV we
+          * would emit only writes 33 bits and leaves the top 33 bits
+          * undefined.
+          *
+          * It's safe to just require the original stride here because the
+          * lowering pass will detect the mismatch in has_invalid_src_region
+          * and fix the sources of the multiply instead of the destination.
+          */
+         return inst->dst.stride * type_sz(inst->dst.type);
+      } else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
           !is_byte_raw_mov(inst)) {
          return get_exec_type_size(inst);
       } else {
@@ -316,6 +330,14 @@ namespace {
    bool
    lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
    {
+      /* We cannot replace the result of an integer multiply which writes the
+       * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
+       * value whereas the MOV will act on only 32 or 33 bits of the
+       * accumulator.
+       */
+      assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
+             brw_reg_type_is_floating_point(inst->dst.type));
+
       const fs_builder ibld(v, block, inst);
       const unsigned stride = required_dst_byte_stride(inst) /
                               type_sz(inst->dst.type);
-- 
2.30.2