anv: Improve brw_nir_lower_mem_access_bit_sizes

[mesa.git] / src / intel / compiler / brw_fs_lower_regioning.cpp
diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp b/src/intel/compiler/brw_fs_lower_regioning.cpp

index cc4163b4c2c252ac7d1d872fc5bfb6c572f1efa6..9db2c540afa975827f72ce65b3086f07369dc36c 100644 (file)
--- a/src/intel/compiler/brw_fs_lower_regioning.cpp
+++ b/src/intel/compiler/brw_fs_lower_regioning.cpp
@@ -53,19 +53,51 @@ namespace {
     unsigned
     required_dst_byte_stride(const fs_inst *inst)
     {
-      if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
+      if (inst->dst.is_accumulator()) {
+         /* If the destination is an accumulator, insist that we leave the
+          * stride alone.  We cannot "fix" accumulator destinations by writing
+          * to a temporary and emitting a MOV into the original destination.
+          * For multiply instructions (our one use of the accumulator), the
+          * MUL writes the full 66 bits of the accumulator whereas the MOV we
+          * would emit only writes 33 bits and leaves the top 33 bits
+          * undefined.
+          *
+          * It's safe to just require the original stride here because the
+          * lowering pass will detect the mismatch in has_invalid_src_region
+          * and fix the sources of the multiply instead of the destination.
+          */
+         return inst->dst.stride * type_sz(inst->dst.type);
+      } else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
            !is_byte_raw_mov(inst)) {
           return get_exec_type_size(inst);
        } else {
-         unsigned stride = inst->dst.stride * type_sz(inst->dst.type);
+         /* Calculate the maximum byte stride and the minimum/maximum type
+          * size across all source and destination operands we are required to
+          * lower.
+          */
+         unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type);
+         unsigned min_size = type_sz(inst->dst.type);
+         unsigned max_size = type_sz(inst->dst.type);
  
           for (unsigned i = 0; i < inst->sources; i++) {
-            if (!is_uniform(inst->src[i]))
-               stride = MAX2(stride, inst->src[i].stride *
-                             type_sz(inst->src[i].type));
+            if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
+               const unsigned size = type_sz(inst->src[i].type);
+               max_stride = MAX2(max_stride, inst->src[i].stride * size);
+               min_size = MIN2(min_size, size);
+               max_size = MAX2(max_size, size);
+            }
           }
  
-         return stride;
+         /* All operands involved in lowering need to fit in the calculated
+          * stride.
+          */
+         assert(max_size <= 4 * min_size);
+
+         /* Use the largest byte stride among all present operands, but never
+          * exceed 4 * min_size bytes (4 elements of the smallest type), since
+          * that would lead to illegal destination regions during lowering.
+          */
+         return MIN2(max_stride, 4 * min_size);
        }
     }
  
@@ -78,7 +110,7 @@ namespace {
     required_dst_byte_offset(const fs_inst *inst)
     {
        for (unsigned i = 0; i < inst->sources; i++) {
-         if (!is_uniform(inst->src[i]))
+         if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
              if (reg_offset(inst->src[i]) % REG_SIZE !=
                  reg_offset(inst->dst) % REG_SIZE)
                 return 0;
@@ -95,20 +127,37 @@ namespace {
     has_invalid_src_region(const gen_device_info *devinfo, const fs_inst *inst,
                            unsigned i)
     {
-      if (is_unordered(inst)) {
+      if (is_unordered(inst) || inst->is_control_source(i))
           return false;
-      } else {
-         const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
-         const unsigned src_byte_stride = inst->src[i].stride *
-            type_sz(inst->src[i].type);
-         const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
-         const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE;
  
-         return has_dst_aligned_region_restriction(devinfo, inst) &&
-                !is_uniform(inst->src[i]) &&
-                (src_byte_stride != dst_byte_stride ||
-                 src_byte_offset != dst_byte_offset);
+      /* Empirical testing shows that Broadwell has a bug affecting half-float
+       * MAD instructions when any of their sources has a non-zero offset, such
+       * as:
+       *
+       * mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q };
+       *
+       * We used to generate code like this for SIMD8 executions, where
+       * components Y and W of a vector were packed at offset 16B of a SIMD
+       * register. The problem doesn't occur if the stride of the source is 0.
+       */
+      if (devinfo->gen == 8 &&
+          inst->opcode == BRW_OPCODE_MAD &&
+          inst->src[i].type == BRW_REGISTER_TYPE_HF &&
+          reg_offset(inst->src[i]) % REG_SIZE > 0 &&
+          inst->src[i].stride != 0) {
+         return true;
        }
+
+      const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
+      const unsigned src_byte_stride = inst->src[i].stride *
+         type_sz(inst->src[i].type);
+      const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
+      const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE;
+
+      return has_dst_aligned_region_restriction(devinfo, inst) &&
+             !is_uniform(inst->src[i]) &&
+             (src_byte_stride != dst_byte_stride ||
+              src_byte_offset != dst_byte_offset);
     }
  
     /*
@@ -239,7 +288,9 @@ namespace {
        const unsigned stride =
           type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
           type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
-      const fs_reg tmp = horiz_stride(ibld.vgrf(type, stride), stride);
+      fs_reg tmp = ibld.vgrf(type, stride);
+      ibld.UNDEF(tmp);
+      tmp = horiz_stride(tmp, stride);
  
        /* Emit a MOV taking care of all the destination modifiers. */
        fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
@@ -280,8 +331,9 @@ namespace {
        const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
                                type_sz(inst->src[i].type);
        assert(stride > 0);
-      const fs_reg tmp = horiz_stride(ibld.vgrf(inst->src[i].type, stride),
-                                      stride);
+      fs_reg tmp = ibld.vgrf(inst->src[i].type, stride);
+      ibld.UNDEF(tmp);
+      tmp = horiz_stride(tmp, stride);
  
        /* Emit a series of 32-bit integer copies with any source modifiers
         * cleaned up (because their semantics are dependent on the type).
@@ -316,12 +368,21 @@ namespace {
     bool
     lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
     {
+      /* We cannot replace the result of an integer multiply which writes the
+       * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
+       * value whereas the MOV will act on only 32 or 33 bits of the
+       * accumulator.
+       */
+      assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
+             brw_reg_type_is_floating_point(inst->dst.type));
+
        const fs_builder ibld(v, block, inst);
        const unsigned stride = required_dst_byte_stride(inst) /
                                type_sz(inst->dst.type);
        assert(stride > 0);
-      const fs_reg tmp = horiz_stride(ibld.vgrf(inst->dst.type, stride),
-                                      stride);
+      fs_reg tmp = ibld.vgrf(inst->dst.type, stride);
+      ibld.UNDEF(tmp);
+      tmp = horiz_stride(tmp, stride);
  
        /* Emit a series of 32-bit integer copies from the temporary into the
         * original destination.
@@ -393,7 +454,7 @@ fs_visitor::lower_regioning()
        progress |= lower_instruction(this, block, inst);
  
     if (progress)
-      invalidate_live_intervals();
+      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
  
     return progress;
  }