nir/glsl: Add another way of doing lower_imul64 for gen8+

author Sagar Ghuge <sagar.ghuge@intel.com>

Fri, 15 Feb 2019 07:08:39 +0000 (23:08 -0800)

committer Sagar Ghuge <sagar.ghuge@intel.com>

Mon, 4 Mar 2019 23:50:25 +0000 (15:50 -0800)
author Sagar Ghuge <sagar.ghuge@intel.com>
Fri, 15 Feb 2019 07:08:39 +0000 (23:08 -0800)
committer Sagar Ghuge <sagar.ghuge@intel.com>
Mon, 4 Mar 2019 23:50:25 +0000 (15:50 -0800)
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h

index 0c96bdfbc569cfd85fce4d407a1cdea43b23ba04..0b10fb2e2b42e94faf01db57051b189228c8a57b 100644 (file)
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2118,6 +2118,7 @@ typedef enum {
     nir_lower_logic64 = (1 << 9),
     nir_lower_minmax64 = (1 << 10),
     nir_lower_shift64 = (1 << 11),
+   nir_lower_imul_2x32_64 = (1 << 12),
  } nir_lower_int64_options;
  
  typedef enum {
@@ -2259,6 +2260,9 @@ typedef struct nir_shader_compiler_options {
      */
     bool use_interpolated_input_intrinsics;
  
+   /* Lowers when 32x32->64 bit multiplication is not supported */
+   bool lower_mul_2x32_64;
+
     unsigned max_unroll_iterations;
  
     nir_lower_int64_options lower_int64_options;
diff --git a/src/compiler/nir/nir_lower_int64.c b/src/compiler/nir/nir_lower_int64.c

index 1c4b4b337973bbc8b211c8188844c7ebd619cacd..6aae1816bd24a1a0a73f614370ff061df7754da9 100644 (file)
--- a/src/compiler/nir/nir_lower_int64.c
+++ b/src/compiler/nir/nir_lower_int64.c
@@ -383,6 +383,16 @@ lower_imin64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
     return nir_bcsel(b, lower_int64_compare(b, nir_op_ilt, x, y), x, y);
  }
  
+static nir_ssa_def *
+lower_mul_2x32_64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y,
+                  bool sign_extend)
+{
+   nir_ssa_def *res_hi = sign_extend ? nir_imul_high(b, x, y)
+                                     : nir_umul_high(b, x, y);
+
+   return nir_pack_64_2x32_split(b, nir_imul(b, x, y), res_hi);
+}
+
  static nir_ssa_def *
  lower_imul64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
  {
@@ -391,12 +401,13 @@ lower_imul64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
     nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
     nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);
  
-   nir_ssa_def *res_lo = nir_imul(b, x_lo, y_lo);
-   nir_ssa_def *res_hi = nir_iadd(b, nir_umul_high(b, x_lo, y_lo),
+   nir_ssa_def *mul_lo = nir_umul_2x32_64(b, x_lo, y_lo);
+   nir_ssa_def *res_hi = nir_iadd(b, nir_unpack_64_2x32_split_y(b, mul_lo),
                           nir_iadd(b, nir_imul(b, x_lo, y_hi),
                                       nir_imul(b, x_hi, y_lo)));
  
-   return nir_pack_64_2x32_split(b, res_lo, res_hi);
+   return nir_pack_64_2x32_split(b, nir_unpack_64_2x32_split_x(b, mul_lo),
+                                 res_hi);
  }
  
  static nir_ssa_def *
@@ -441,9 +452,8 @@ lower_mul_high64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y,
            * so we're guaranteed that we can add in two more 32-bit values
            * without overflowing tmp.
            */
-         nir_ssa_def *tmp =
-            nir_pack_64_2x32_split(b, nir_imul(b, x32[i], y32[j]),
-                                      nir_umul_high(b, x32[i], y32[j]));
+         nir_ssa_def *tmp = nir_umul_2x32_64(b, x32[i], y32[j]);
+
           if (res[i + j])
              tmp = nir_iadd(b, tmp, nir_u2u64(b, res[i + j]));
           if (carry)
@@ -626,6 +636,9 @@ opcode_to_options_mask(nir_op opcode)
     switch (opcode) {
     case nir_op_imul:
        return nir_lower_imul64;
+   case nir_op_imul_2x32_64:
+   case nir_op_umul_2x32_64:
+      return nir_lower_imul_2x32_64;
     case nir_op_imul_high:
     case nir_op_umul_high:
        return nir_lower_imul_high64;
@@ -688,6 +701,10 @@ lower_int64_alu_instr(nir_builder *b, nir_alu_instr *alu)
     switch (alu->op) {
     case nir_op_imul:
        return lower_imul64(b, src[0], src[1]);
+   case nir_op_imul_2x32_64:
+      return lower_mul_2x32_64(b, src[0], src[1], true);
+   case nir_op_umul_2x32_64:
+      return lower_mul_2x32_64(b, src[0], src[1], false);
     case nir_op_imul_high:
        return lower_mul_high64(b, src[0], src[1], true);
     case nir_op_umul_high:
diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py

index 499deb947e88db3994b89afb76048e711be70c80..42f8662352e2c8d9f1ead8602845a285601859a0 100644 (file)
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -475,6 +475,12 @@ binop("fmul", tfloat, commutative + associative, "src0 * src1")
  # low 32-bits of signed/unsigned integer multiply
  binop("imul", tint, commutative + associative, "src0 * src1")
  
+# Generate a 64-bit result from two 32-bit quantities
+binop_convert("imul_2x32_64", tint64, tint32, commutative,
+              "(int64_t)src0 * (int64_t)src1")
+binop_convert("umul_2x32_64", tuint64, tuint32, commutative,
+              "(uint64_t)src0 * (uint64_t)src1")
+
  # high 32-bits of signed integer multiply
  binop("imul_high", tint, commutative, """
  if (bit_size == 64) {
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py

index 617ca0ea93349d27c4dfb9f435a19d5658948461..53cfa94ae93c4e9d6c170f9004e148f839e7f3d8 100644 (file)
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -70,6 +70,8 @@ optimizations = [
  
     (('imul', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b))),
     (('imul', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b))))),
+   (('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'),
+   (('umul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('umul_high', a, b)), 'options->lower_mul_2x32_64'),
     (('udiv', a, 1), a),
     (('idiv', a, 1), a),
     (('umod', a, 1), 0),
diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c

index b3df0d9fa23bd1060be66bd8bf0885cabab15d1b..28793b1f0e6b9d5f127d6347a73f49a8b3b2d750 100644 (file)
--- a/src/intel/compiler/brw_compiler.c
+++ b/src/intel/compiler/brw_compiler.c
@@ -171,6 +171,13 @@ brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo)
        fp64_options |= nir_lower_fp64_full_software;
     }
  
+   /* The Bspec's section titled "Instruction_multiply[DevBDW+]" claims that
+    * destination type can be Quadword and source type Doubleword for Gen8 and
+    * Gen9. So, lower 64 bit multiply instruction on rest of the platforms.
+    */
+   if (devinfo->gen < 8 || devinfo->gen > 9)
+      int64_options |= nir_lower_imul_2x32_64;
+
     /* We want the GLSL compiler to emit code that uses condition codes */
     for (int i = 0; i < MESA_SHADER_STAGES; i++) {
        compiler->glsl_compiler_options[i].MaxUnrollIterations = 0;
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp

index a4fd29ff9ec445980cc8981d58dec1a654b80983..a7abaf742e28edf2747b5a456ef4fd2cb988ae85 100644 (file)
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -1055,6 +1055,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
        inst->saturate = instr->dest.saturate;
        break;
  
+   case nir_op_imul_2x32_64:
+   case nir_op_umul_2x32_64:
+      bld.MUL(result, op[0], op[1]);
+      break;
+
     case nir_op_imul:
        assert(nir_dest_bit_size(instr->dest.dest) < 64);
        bld.MUL(result, op[0], op[1]);
author	Sagar Ghuge <sagar.ghuge@intel.com>
	Fri, 15 Feb 2019 07:08:39 +0000 (23:08 -0800)
committer	Sagar Ghuge <sagar.ghuge@intel.com>
	Mon, 4 Mar 2019 23:50:25 +0000 (15:50 -0800)
src/compiler/nir/nir.h		patch \| blob \| history
src/compiler/nir/nir_lower_int64.c		patch \| blob \| history
src/compiler/nir/nir_opcodes.py		patch \| blob \| history
src/compiler/nir/nir_opt_algebraic.py		patch \| blob \| history
src/intel/compiler/brw_compiler.c		patch \| blob \| history
src/intel/compiler/brw_fs_nir.cpp		patch \| blob \| history