intel/fs: Use ALIGN16 instructions for all derivatives on gen <= 7

author Jason Ekstrand <jason@jlekstrand.net>

Thu, 25 Jul 2019 23:28:44 +0000 (18:28 -0500)

committer Jason Ekstrand <jason@jlekstrand.net>

Tue, 30 Jul 2019 22:38:19 +0000 (22:38 +0000)
author Jason Ekstrand <jason@jlekstrand.net>
Thu, 25 Jul 2019 23:28:44 +0000 (18:28 -0500)
committer Jason Ekstrand <jason@jlekstrand.net>
Tue, 30 Jul 2019 22:38:19 +0000 (22:38 +0000)
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp

index 89a6a7f6974f50c71789a39a09fb4427e20f8f3b..6d7435c5f3e79578f1c54c94dc65e38ae5e0d6b2 100644 (file)
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6091,9 +6091,6 @@ get_lowered_simd_width(const struct gen_device_info *devinfo,
  
     case FS_OPCODE_LINTERP:
     case SHADER_OPCODE_GET_BUFFER_SIZE:
-   case FS_OPCODE_DDX_COARSE:
-   case FS_OPCODE_DDX_FINE:
-   case FS_OPCODE_DDY_COARSE:
     case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
     case FS_OPCODE_PACK_HALF_2x16_SPLIT:
     case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
@@ -6110,6 +6107,9 @@ get_lowered_simd_width(const struct gen_device_info *devinfo,
         */
        return (devinfo->gen == 4 ? 16 : MIN2(16, inst->exec_size));
  
+   case FS_OPCODE_DDX_COARSE:
+   case FS_OPCODE_DDX_FINE:
+   case FS_OPCODE_DDY_COARSE:
     case FS_OPCODE_DDY_FINE:
        /* The implementation of this virtual opcode may require emitting
         * compressed Align16 instructions, which are severely limited on some
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp

index 14cfdd776414f839ee2271ec2307adcb52058c79..62d19719c3968272818b49963b79a772585f02a4 100644 (file)
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1207,27 +1207,50 @@ fs_generator::generate_ddx(const fs_inst *inst,
  {
     unsigned vstride, width;
  
-   if (inst->opcode == FS_OPCODE_DDX_FINE) {
-      /* produce accurate derivatives */
-      vstride = BRW_VERTICAL_STRIDE_2;
-      width = BRW_WIDTH_2;
-   } else {
-      /* replicate the derivative at the top-left pixel to other pixels */
-      vstride = BRW_VERTICAL_STRIDE_4;
-      width = BRW_WIDTH_4;
-   }
+   if (devinfo->gen >= 8) {
+      if (inst->opcode == FS_OPCODE_DDX_FINE) {
+         /* produce accurate derivatives */
+         vstride = BRW_VERTICAL_STRIDE_2;
+         width = BRW_WIDTH_2;
+      } else {
+         /* replicate the derivative at the top-left pixel to other pixels */
+         vstride = BRW_VERTICAL_STRIDE_4;
+         width = BRW_WIDTH_4;
+      }
+
+      struct brw_reg src0 = byte_offset(src, type_sz(src.type));;
+      struct brw_reg src1 = src;
  
-   struct brw_reg src0 = byte_offset(src, type_sz(src.type));;
-   struct brw_reg src1 = src;
+      src0.vstride = vstride;
+      src0.width   = width;
+      src0.hstride = BRW_HORIZONTAL_STRIDE_0;
+      src1.vstride = vstride;
+      src1.width   = width;
+      src1.hstride = BRW_HORIZONTAL_STRIDE_0;
  
-   src0.vstride = vstride;
-   src0.width   = width;
-   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
-   src1.vstride = vstride;
-   src1.width   = width;
-   src1.hstride = BRW_HORIZONTAL_STRIDE_0;
+      brw_ADD(p, dst, src0, negate(src1));
+   } else {
+      /* On Haswell and earlier, the region used above appears to not work
+       * correctly for compressed instructions.  At least on Haswell and
+       * Iron Lake, compressed ALIGN16 instructions do work.  Since we
+       * would have to split to SIMD8 no matter which method we choose, we
+       * may as well use ALIGN16 on all platforms gen7 and earlier.
+       */
+      struct brw_reg src0 = stride(src, 4, 4, 1);
+      struct brw_reg src1 = stride(src, 4, 4, 1);
+      if (inst->opcode == FS_OPCODE_DDX_FINE) {
+         src0.swizzle = BRW_SWIZZLE_XXZZ;
+         src1.swizzle = BRW_SWIZZLE_YYWW;
+      } else {
+         src0.swizzle = BRW_SWIZZLE_XXXX;
+         src1.swizzle = BRW_SWIZZLE_YYYY;
+      }
  
-   brw_ADD(p, dst, src0, negate(src1));
+      brw_push_insn_state(p);
+      brw_set_default_access_mode(p, BRW_ALIGN_16);
+      brw_ADD(p, dst, negate(src0), src1);
+      brw_pop_insn_state(p);
+   }
  }
  
  /* The negate_value boolean is used to negate the derivative computation for
@@ -1280,10 +1303,28 @@ fs_generator::generate_ddy(const fs_inst *inst,
        }
     } else {
        /* replicate the derivative at the top-left pixel to other pixels */
-      struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size);
-      struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size);
+      if (devinfo->gen >= 8) {
+         struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size);
+         struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size);
  
-      brw_ADD(p, dst, negate(src0), src1);
+         brw_ADD(p, dst, negate(src0), src1);
+      } else {
+         /* On Haswell and earlier, the region used above appears to not work
+          * correctly for compressed instructions.  At least on Haswell and
+          * Iron Lake, compressed ALIGN16 instructions do work.  Since we
+          * would have to split to SIMD8 no matter which method we choose, we
+          * may as well use ALIGN16 on all platforms gen7 and earlier.
+          */
+         struct brw_reg src0 = stride(src, 4, 4, 1);
+         struct brw_reg src1 = stride(src, 4, 4, 1);
+         src0.swizzle = BRW_SWIZZLE_XXXX;
+         src1.swizzle = BRW_SWIZZLE_ZZZZ;
+
+         brw_push_insn_state(p);
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         brw_ADD(p, dst, negate(src0), src1);
+         brw_pop_insn_state(p);
+      }
     }
  }
author	Jason Ekstrand <jason@jlekstrand.net>
	Thu, 25 Jul 2019 23:28:44 +0000 (18:28 -0500)
committer	Jason Ekstrand <jason@jlekstrand.net>
	Tue, 30 Jul 2019 22:38:19 +0000 (22:38 +0000)
src/intel/compiler/brw_fs.cpp		patch | blob | history
src/intel/compiler/brw_fs_generator.cpp		patch | blob | history