intel/compiler: implement SIMD16 restrictions for mixed-float instructions

[mesa.git] / src / intel / compiler / brw_fs.cpp
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp

index e7118703358eafe9882f70916005a083b46add14..ac01d137ce1a63b969814743fea959fe763d020e 100644 (file)
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -734,14 +734,33 @@ fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
   * it.
   */
  bool
-fs_inst::is_partial_write() const
+fs_inst::is_partial_reg_write() const
  {
     return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
-           (this->exec_size * type_sz(this->dst.type)) < 32 ||
             !this->dst.is_contiguous() ||
+           (this->exec_size * type_sz(this->dst.type)) < REG_SIZE ||
             this->dst.offset % REG_SIZE != 0);
  }
  
+/**
+ * Returns true if the instruction may not overwrite a whole variable, where
+ * a variable spans MIN2(REG_SIZE, dispatch_width * dst type size) bytes: it
+ * is predicated (SEL excepted), non-contiguous, too small or misaligned.
+ * This only differs from is_partial_reg_write() when that span is smaller
+ * than REG_SIZE, e.g. SIMD8 dispatches of 16-bit (or smaller) types.
+ */
+bool
+fs_inst::is_partial_var_write(uint32_t dispatch_width) const
+{
+   const uint32_t type_size = type_sz(this->dst.type);
+   uint32_t var_size = MIN2(REG_SIZE, dispatch_width * type_size);
+
+   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
+           !this->dst.is_contiguous() ||
+           (this->exec_size * type_sz(this->dst.type)) < var_size ||
+           this->dst.offset % var_size != 0);
+}
+
  unsigned
  fs_inst::components_read(unsigned i) const
  {
@@ -2557,15 +2576,6 @@ fs_visitor::opt_algebraic()
              break;
           }
  
-         /* a * 0.0 = 0.0 */
-         if (inst->src[1].is_zero()) {
-            inst->opcode = BRW_OPCODE_MOV;
-            inst->src[0] = inst->src[1];
-            inst->src[1] = reg_undef;
-            progress = true;
-            break;
-         }
-
           if (inst->src[0].file == IMM) {
              assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
              inst->opcode = BRW_OPCODE_MOV;
@@ -2579,14 +2589,6 @@ fs_visitor::opt_algebraic()
           if (inst->src[1].file != IMM)
              continue;
  
-         /* a + 0.0 = a */
-         if (inst->src[1].is_zero()) {
-            inst->opcode = BRW_OPCODE_MOV;
-            inst->src[1] = reg_undef;
-            progress = true;
-            break;
-         }
-
           if (inst->src[0].file == IMM) {
              assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
              inst->opcode = BRW_OPCODE_MOV;
@@ -2614,16 +2616,6 @@ fs_visitor::opt_algebraic()
              break;
           }
           break;
-      case BRW_OPCODE_LRP:
-         if (inst->src[1].equals(inst->src[2])) {
-            inst->opcode = BRW_OPCODE_MOV;
-            inst->src[0] = inst->src[1];
-            inst->src[1] = reg_undef;
-            inst->src[2] = reg_undef;
-            progress = true;
-            break;
-         }
-         break;
        case BRW_OPCODE_CMP:
           if ((inst->conditional_mod == BRW_CONDITIONAL_Z ||
                inst->conditional_mod == BRW_CONDITIONAL_NZ) &&
@@ -2701,17 +2693,11 @@ fs_visitor::opt_algebraic()
           }
           break;
        case BRW_OPCODE_MAD:
-         if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
-            inst->opcode = BRW_OPCODE_MOV;
-            inst->src[1] = reg_undef;
-            inst->src[2] = reg_undef;
-            progress = true;
-         } else if (inst->src[0].is_zero()) {
-            inst->opcode = BRW_OPCODE_MUL;
-            inst->src[0] = inst->src[2];
-            inst->src[2] = reg_undef;
-            progress = true;
-         } else if (inst->src[1].is_one()) {
+         if (inst->src[0].type != BRW_REGISTER_TYPE_F ||
+             inst->src[1].type != BRW_REGISTER_TYPE_F ||
+             inst->src[2].type != BRW_REGISTER_TYPE_F)
+            break;
+         if (inst->src[1].is_one()) {
              inst->opcode = BRW_OPCODE_ADD;
              inst->src[1] = inst->src[2];
              inst->src[2] = reg_undef;
@@ -2720,11 +2706,6 @@ fs_visitor::opt_algebraic()
              inst->opcode = BRW_OPCODE_ADD;
              inst->src[2] = reg_undef;
              progress = true;
-         } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
-            inst->opcode = BRW_OPCODE_ADD;
-            inst->src[1].f *= inst->src[2].f;
-            inst->src[2] = reg_undef;
-            progress = true;
           }
           break;
        case SHADER_OPCODE_BROADCAST:
@@ -2961,7 +2942,7 @@ fs_visitor::opt_register_renaming()
        if (depth == 0 &&
            inst->dst.file == VGRF &&
            alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written &&
-          !inst->is_partial_write()) {
+          !inst->is_partial_reg_write()) {
           if (remap[dst] == ~0u) {
              remap[dst] = dst;
           } else {
@@ -3165,7 +3146,7 @@ fs_visitor::compute_to_mrf()
        next_ip++;
  
        if (inst->opcode != BRW_OPCODE_MOV ||
-         inst->is_partial_write() ||
+         inst->is_partial_reg_write() ||
           inst->dst.file != MRF || inst->src[0].file != VGRF ||
           inst->dst.type != inst->src[0].type ||
           inst->src[0].abs || inst->src[0].negate ||
@@ -3198,7 +3179,7 @@ fs_visitor::compute_to_mrf()
              * that writes that reg, but it would require smarter
              * tracking.
              */
-           if (scan_inst->is_partial_write())
+           if (scan_inst->is_partial_reg_write())
                break;
  
              /* Handling things not fully contained in the source of the copy
@@ -3516,7 +3497,7 @@ fs_visitor::remove_duplicate_mrf_writes()
        if (inst->opcode == BRW_OPCODE_MOV &&
           inst->dst.file == MRF &&
           inst->src[0].file != ARF &&
-         !inst->is_partial_write()) {
+         !inst->is_partial_reg_write()) {
           last_mrf_move[inst->dst.nr] = inst;
        }
     }
@@ -5623,6 +5604,49 @@ fs_visitor::lower_logical_sends()
     return progress;
  }
  
+static bool
+is_mixed_float_with_fp32_dst(const fs_inst *inst)
+{
+   /* F16TO32 always counts as mixed-float with an F destination: its source
+    * may be typed :W although it holds :HF data, since gen7 lacks :HF types.
+    */
+   if (inst->opcode == BRW_OPCODE_F16TO32)
+      return true;
+
+   if (inst->dst.type != BRW_REGISTER_TYPE_F)
+      return false; /* only F destinations are of interest here */
+
+   for (int i = 0; i < inst->sources; i++) {
+      if (inst->src[i].type == BRW_REGISTER_TYPE_HF)
+         return true; /* F destination fed by at least one HF source */
+   }
+
+   return false;
+}
+
+static bool
+is_mixed_float_with_packed_fp16_dst(const fs_inst *inst)
+{
+   /* F32TO16 writing a packed (stride 1) destination always qualifies: its
+    * destination may be typed :W although it holds :HF data, because gen7
+    * has no support for :HF.
+    */
+   if (inst->opcode == BRW_OPCODE_F32TO16 &&
+       inst->dst.stride == 1)
+      return true;
+
+   if (inst->dst.type != BRW_REGISTER_TYPE_HF ||
+       inst->dst.stride != 1)
+      return false; /* destination is not packed HF */
+
+   for (int i = 0; i < inst->sources; i++) {
+      if (inst->src[i].type == BRW_REGISTER_TYPE_F)
+         return true; /* packed HF destination fed by an F source */
+   }
+
+   return false;
+}
+
  /**
   * Get the closest allowed SIMD width for instruction \p inst accounting for
   * some common regioning and execution control restrictions that apply to FPU
@@ -5785,6 +5809,35 @@ get_fpu_lowered_simd_width(const struct gen_device_info *devinfo,
           max_width = MIN2(max_width, 4);
     }
  
+   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+    * Float Operations:
+    *
+    *    "No SIMD16 in mixed mode when destination is f32. Instruction
+    *     execution size must be no more than 8."
+    *
+    * FIXME: the simulator doesn't seem to complain if we don't do this, and
+    * empirical testing with existing CTS tests shows that they pass just
+    * fine without it. However, our interpretation of the PRM is that
+    * conversion MOVs between HF and F are still mixed-float instructions
+    * (and therefore subject to this restriction), so we decided to split
+    * them to be safe. It might be worth investigating further whether the
+    * restriction can be lifted safely, because these conversions are
+    * common when half-float types are involved: many instructions do not
+    * support HF types, so conversions from/to F are
+    * required.
+    */
+   if (is_mixed_float_with_fp32_dst(inst))
+      max_width = MIN2(max_width, 8);
+
+   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+    * Float Operations:
+    *
+    *    "No SIMD16 in mixed mode when destination is packed f16 for both
+    *     Align1 and Align16."
+    */
+   if (is_mixed_float_with_packed_fp16_dst(inst))
+      max_width = MIN2(max_width, 8);
+
     /* Only power-of-two execution sizes are representable in the instruction
      * control fields.
      */
@@ -5941,18 +5994,27 @@ get_lowered_simd_width(const struct gen_device_info *devinfo,
     case SHADER_OPCODE_EXP2:
     case SHADER_OPCODE_LOG2:
     case SHADER_OPCODE_SIN:
-   case SHADER_OPCODE_COS:
+   case SHADER_OPCODE_COS: {
        /* Unary extended math instructions are limited to SIMD8 on Gen4 and
-       * Gen6.
+       * Gen6. They are also limited to SIMD8 with a half-float destination.
         */
-      return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
-              devinfo->gen == 5 || devinfo->is_g4x ? MIN2(16, inst->exec_size) :
-              MIN2(8, inst->exec_size));
+      if (devinfo->gen == 6 || (devinfo->gen == 4 && !devinfo->is_g4x))
+         return MIN2(8, inst->exec_size);
+      if (inst->dst.type == BRW_REGISTER_TYPE_HF)
+         return MIN2(8, inst->exec_size);
+      return MIN2(16, inst->exec_size);
+   }
  
-   case SHADER_OPCODE_POW:
-      /* SIMD16 is only allowed on Gen7+. */
-      return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
-              MIN2(8, inst->exec_size));
+   case SHADER_OPCODE_POW: {
+      /* SIMD16 is only allowed on Gen7+, and extended math is limited to
+       * SIMD8 when the destination is half-float.
+       */
+      if (devinfo->gen < 7)
+         return MIN2(8, inst->exec_size);
+      if (inst->dst.type == BRW_REGISTER_TYPE_HF)
+         return MIN2(8, inst->exec_size);
+      return MIN2(16, inst->exec_size);
+   }
  
     case SHADER_OPCODE_INT_QUOTIENT:
     case SHADER_OPCODE_INT_REMAINDER: