intel/fs: Add a helper for emitting scan operations

author Jason Ekstrand <jason.ekstrand@intel.com>

Fri, 1 Sep 2017 04:50:31 +0000 (21:50 -0700)

committer Jason Ekstrand <jason.ekstrand@intel.com>

Wed, 7 Mar 2018 20:13:47 +0000 (12:13 -0800)
author Jason Ekstrand <jason.ekstrand@intel.com>
Fri, 1 Sep 2017 04:50:31 +0000 (21:50 -0700)
committer Jason Ekstrand <jason.ekstrand@intel.com>
Wed, 7 Mar 2018 20:13:47 +0000 (12:13 -0800)
diff --git a/src/intel/compiler/brw_fs_builder.h b/src/intel/compiler/brw_fs_builder.h

index b157e33c39ba63d2c293fede6e4eeaf1771676cf..cf603b0c86891b1feed371cf1c5c0c86172dad09 100644 (file)
--- a/src/intel/compiler/brw_fs_builder.h
+++ b/src/intel/compiler/brw_fs_builder.h
@@ -412,6 +412,115 @@ namespace brw {
           return src_reg(component(dst, 0));
        }
  
+      /**
+       * Emit the instruction sequence for an in-place scan of \p tmp.
+       *
+       * Every combining instruction emitted here has the form
+       * "right = opcode(left, right)", where right is a region of \p tmp
+       * and left is another region of \p tmp (or a temporary copy of one).
+       * The stages are gated on \p cluster_size being greater than 1, 2, 4
+       * and 8 respectively, so smaller clusters emit fewer instructions.
+       *
+       * \param opcode        Opcode used to combine two values.
+       * \param tmp           Register that is read and overwritten in place.
+       * \param cluster_size  Controls which stages are emitted (see above).
+       * \param mod           Conditional modifier handed to set_condmod()
+       *                      for every combining instruction.
+       *
+       * The dispatch width must be at least 8 (asserted below).
+       */
+      void
+      emit_scan(enum opcode opcode, const dst_reg &tmp,
+                unsigned cluster_size, brw_conditional_mod mod) const
+      {
+         assert(dispatch_width() >= 8);
+
+         /* The instruction splitting code isn't advanced enough to split
+          * these so we need to handle that ourselves.
+          */
+         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
+            /* The data spans more than two registers: scan each half on its
+             * own with a half-width builder.
+             */
+            const unsigned half_width = dispatch_width() / 2;
+            const fs_builder ubld = exec_all().group(half_width, 0);
+            dst_reg left = tmp;
+            dst_reg right = horiz_offset(tmp, half_width);
+            ubld.emit_scan(opcode, left, cluster_size, mod);
+            ubld.emit_scan(opcode, right, cluster_size, mod);
+            /* Only when a cluster is larger than one half do the halves
+             * need joining: combine component half_width - 1 of the left
+             * half into the right half.
+             */
+            if (cluster_size > half_width) {
+               src_reg left_comp = component(left, half_width - 1);
+               set_condmod(mod, ubld.emit(opcode, right, left_comp, right));
+            }
+            return;
+         }
+
+         /* Stage 1: left is every other channel of tmp starting at offset
+          * 0 and right is every other channel starting at offset 1, so each
+          * right channel is combined with the channel just before it.
+          */
+         if (cluster_size > 1) {
+            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
+            dst_reg left = horiz_stride(tmp, 2);
+            dst_reg right = horiz_stride(horiz_offset(tmp, 1), 2);
+
+            /* From the Cherryview PRM Vol. 7, "Register Region Restrictions":
+             *
+             *    "When source or destination datatype is 64b or operation is
+             *    integer DWord multiply, regioning in Align1 must follow
+             *    these rules:
+             *
+             *    [...]
+             *
+             *    3. Source and Destination offset must be the same, except
+             *       the case of scalar source."
+             *
+             * In order to work around this, we create a temporary register
+             * and shift left over to match right.  If we have a 64-bit type,
+             * we have to use two integer MOVs instead of a 64-bit MOV.
+             */
+            if (need_matching_subreg_offset(opcode, tmp.type)) {
+               dst_reg tmp2 = vgrf(tmp.type);
+               dst_reg new_left = horiz_stride(horiz_offset(tmp2, 1), 2);
+               if (type_sz(tmp.type) > 4) {
+                  ubld.MOV(subscript(new_left, BRW_REGISTER_TYPE_D, 0),
+                           subscript(left, BRW_REGISTER_TYPE_D, 0));
+                  ubld.MOV(subscript(new_left, BRW_REGISTER_TYPE_D, 1),
+                           subscript(left, BRW_REGISTER_TYPE_D, 1));
+               } else {
+                  ubld.MOV(new_left, left);
+               }
+               left = new_left;
+            }
+            set_condmod(mod, ubld.emit(opcode, right, left, right));
+         }
+
+         /* Stage 2: within every group of four channels, combine the value
+          * at offset 1 into the channels at offsets 2 and 3.
+          */
+         if (cluster_size > 2) {
+            if (type_sz(tmp.type) <= 4 &&
+                !need_matching_subreg_offset(opcode, tmp.type)) {
+               /* Two quarter-width instructions using stride-4 regions. */
+               const fs_builder ubld =
+                  exec_all().group(dispatch_width() / 4, 0);
+               src_reg left = horiz_stride(horiz_offset(tmp, 1), 4);
+
+               dst_reg right = horiz_stride(horiz_offset(tmp, 2), 4);
+               set_condmod(mod, ubld.emit(opcode, right, left, right));
+
+               right = horiz_stride(horiz_offset(tmp, 3), 4);
+               set_condmod(mod, ubld.emit(opcode, right, left, right));
+            } else {
+               /* For 64-bit types, we have to do things differently because
+                * the code above would land us with destination strides that
+                * the hardware can't handle.  Fortunately, we'll only be
+                * 8-wide in that case and it's the same number of
+                * instructions.
+                */
+               const fs_builder ubld = exec_all().group(2, 0);
+
+               /* One 2-wide instruction per group of four channels:
+                * component i + 1 is combined into the two channels starting
+                * at offset i + 2.
+                */
+               for (unsigned i = 0; i < dispatch_width(); i += 4) {
+                  src_reg left = component(tmp, i + 1);
+                  dst_reg right = horiz_offset(tmp, i + 2);
+                  set_condmod(mod, ubld.emit(opcode, right, left, right));
+               }
+            }
+         }
+
+         /* Stage 3: combine component 3 into the four channels starting at
+          * offset 4 and, when wider than 8, component 11 into the four
+          * channels starting at offset 12.
+          */
+         if (cluster_size > 4) {
+            const fs_builder ubld = exec_all().group(4, 0);
+            src_reg left = component(tmp, 3);
+            dst_reg right = horiz_offset(tmp, 4);
+            set_condmod(mod, ubld.emit(opcode, right, left, right));
+
+            if (dispatch_width() > 8) {
+               left = component(tmp, 8 + 3);
+               right = horiz_offset(tmp, 8 + 4);
+               set_condmod(mod, ubld.emit(opcode, right, left, right));
+            }
+         }
+
+         /* Stage 4: combine component 7 into the eight channels starting at
+          * offset 8.  Only possible when there are more than 8 channels.
+          */
+         if (cluster_size > 8 && dispatch_width() > 8) {
+            const fs_builder ubld = exec_all().group(8, 0);
+            src_reg left = component(tmp, 7);
+            dst_reg right = horiz_offset(tmp, 8);
+            set_condmod(mod, ubld.emit(opcode, right, left, right));
+         }
+      }
+
        /**
         * Assorted arithmetic ops.
         * @{
@@ -644,6 +753,38 @@ namespace brw {
           }
        }
  
+
+      /* From the Cherryview PRM Vol. 7, "Register Region Restrictiosn":
+       *
+       *    "When source or destination datatype is 64b or operation is
+       *    integer DWord multiply, regioning in Align1 must follow
+       *    these rules:
+       *
+       *    [...]
+       *
+       *    3. Source and Destination offset must be the same, except
+       *       the case of scalar source."
+       *
+       * This helper just detects when we're in this case.
+       */
+      bool
+      need_matching_subreg_offset(enum opcode opcode,
+                                  enum brw_reg_type type) const
+      {
+         if (!shader->devinfo->is_cherryview &&
+             !gen_device_info_is_9lp(shader->devinfo))
+            return false;
+
+         if (type_sz(type > 4))
+            return true;
+
+         if (opcode == BRW_OPCODE_MUL &&
+             !brw_reg_type_is_floating_point(type))
+            return true;
+
+         return false;
+      }
+
        bblock_t *block;
        exec_node *cursor;
  
diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h

index cd603630a47ae0524cbeeae33c7438cb91c025cb..54797ff0fa2a636ebfbc8e69ee081873e9f72e55 100644 (file)
--- a/src/intel/compiler/brw_ir_fs.h
+++ b/src/intel/compiler/brw_ir_fs.h
@@ -312,6 +312,13 @@ subscript(fs_reg reg, brw_reg_type type, unsigned i)
     return byte_offset(retype(reg, type), i * type_sz(type));
  }
  
+/**
+ * Return a copy of \p reg with its stride multiplied by \p s.  All other
+ * fields of the register are left as they were.
+ */
+static inline fs_reg
+horiz_stride(fs_reg reg, unsigned s)
+{
+   fs_reg strided(reg);
+
+   strided.stride *= s;
+
+   return strided;
+}
+
  static const fs_reg reg_undef;
  
  class fs_inst : public backend_instruction {
author	Jason Ekstrand <jason.ekstrand@intel.com>
	Fri, 1 Sep 2017 04:50:31 +0000 (21:50 -0700)
committer	Jason Ekstrand <jason.ekstrand@intel.com>
	Wed, 7 Mar 2018 20:13:47 +0000 (12:13 -0800)
src/intel/compiler/brw_fs_builder.h		patch \| blob \| history
src/intel/compiler/brw_ir_fs.h		patch \| blob \| history