glsl: [u/i]mulExtended optimization for GLSL
author Sagar Ghuge <sagar.ghuge@intel.com>
Wed, 27 Feb 2019 22:02:54 +0000 (14:02 -0800)
committer Sagar Ghuge <sagar.ghuge@intel.com>
Mon, 4 Mar 2019 23:50:25 +0000 (15:50 -0800)
Optimize mulExtended to use 32x32->64 multiplication.

Drivers that are not based on NIR can set the
MUL64_TO_MUL_AND_MUL_HIGH lowering flag to keep the old behavior
(separate mul and imul_high operations).

v2: Add missing condition check (Jason Ekstrand)

Signed-off-by: Sagar Ghuge <sagar.ghuge@intel.com>
Suggested-by: Matt Turner <mattst88@gmail.com>
Suggested-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
src/compiler/glsl/builtin_functions.cpp
src/compiler/glsl/glsl_to_nir.cpp
src/compiler/glsl/ir_optimization.h
src/compiler/glsl/ir_validate.cpp
src/compiler/glsl/lower_instructions.cpp
src/mesa/program/ir_to_mesa.cpp
src/mesa/state_tracker/st_glsl_to_tgsi.cpp

index aba1a14bd9025981cccfe9e11577e477d13bd9b3..386cbc0ccd79f25be1bb624e5ac5dcfcc0ca91db 100644 (file)
@@ -5866,14 +5866,42 @@ builtin_builder::_usubBorrow(const glsl_type *type)
 ir_function_signature *
 builtin_builder::_mulExtended(const glsl_type *type)
 {
+   const glsl_type *mul_type, *unpack_type;
+   ir_expression_operation unpack_op;
+
+   if (type->base_type == GLSL_TYPE_INT) {
+      unpack_op = ir_unop_unpack_int_2x32;
+      mul_type = glsl_type::get_instance(GLSL_TYPE_INT64, type->vector_elements, 1);
+      unpack_type = glsl_type::ivec2_type;
+   } else {
+      unpack_op = ir_unop_unpack_uint_2x32;
+      mul_type = glsl_type::get_instance(GLSL_TYPE_UINT64, type->vector_elements, 1);
+      unpack_type = glsl_type::uvec2_type;
+   }
+
    ir_variable *x = in_var(type, "x");
    ir_variable *y = in_var(type, "y");
    ir_variable *msb = out_var(type, "msb");
    ir_variable *lsb = out_var(type, "lsb");
    MAKE_SIG(glsl_type::void_type, gpu_shader5_or_es31_or_integer_functions, 4, x, y, msb, lsb);
 
-   body.emit(assign(msb, imul_high(x, y)));
-   body.emit(assign(lsb, mul(x, y)));
+   ir_variable *unpack_val = body.make_temp(unpack_type, "_unpack_val");
+
+   ir_expression *mul_res = new(mem_ctx) ir_expression(ir_binop_mul, mul_type,
+                                                       new(mem_ctx)ir_dereference_variable(x),
+                                                       new(mem_ctx)ir_dereference_variable(y));
+
+   if (type->vector_elements == 1) {
+      body.emit(assign(unpack_val, expr(unpack_op, mul_res)));
+      body.emit(assign(msb, swizzle_y(unpack_val)));
+      body.emit(assign(lsb, swizzle_x(unpack_val)));
+   } else {
+      for (int i = 0; i < type->vector_elements; i++) {
+         body.emit(assign(unpack_val, expr(unpack_op, swizzle(mul_res, i, 1))));
+         body.emit(assign(array_ref(msb, i), swizzle_y(unpack_val)));
+         body.emit(assign(array_ref(lsb, i), swizzle_x(unpack_val)));
+      }
+   }
 
    return sig;
 }
index 09a4f19f6f2955cae9b1ded71efd228745848497..f7df91d887d3513f8a31a46063b4e45b09c2555a 100644 (file)
@@ -1865,8 +1865,18 @@ nir_visitor::visit(ir_expression *ir)
                                        : nir_isub(&b, srcs[0], srcs[1]);
       break;
    case ir_binop_mul:
-      result = type_is_float(out_type) ? nir_fmul(&b, srcs[0], srcs[1])
-                                       : nir_imul(&b, srcs[0], srcs[1]);
+      if (type_is_float(out_type))
+         result = nir_fmul(&b, srcs[0], srcs[1]);
+      else if (out_type == GLSL_TYPE_INT64 &&
+               (ir->operands[0]->type->base_type == GLSL_TYPE_INT ||
+                ir->operands[1]->type->base_type == GLSL_TYPE_INT))
+         result = nir_imul_2x32_64(&b, srcs[0], srcs[1]);
+      else if (out_type == GLSL_TYPE_UINT64 &&
+               (ir->operands[0]->type->base_type == GLSL_TYPE_UINT ||
+                ir->operands[1]->type->base_type == GLSL_TYPE_UINT))
+         result = nir_umul_2x32_64(&b, srcs[0], srcs[1]);
+      else
+         result = nir_imul(&b, srcs[0], srcs[1]);
       break;
    case ir_binop_div:
       if (type_is_float(out_type))
index ef68b93c09e43daa1dfb8cd3e13a3f92e4b5a30b..e027654d3a02b8a57a2950da22cf6ecfe95026ac 100644 (file)
@@ -57,6 +57,7 @@ struct gl_shader_program;
 #define DDIV_TO_MUL_RCP           0x100000
 #define DIV_TO_MUL_RCP            (FDIV_TO_MUL_RCP | DDIV_TO_MUL_RCP)
 #define SQRT_TO_ABS_SQRT          0x200000
+#define MUL64_TO_MUL_AND_MUL_HIGH 0x400000
 
 /* Opertaions for lower_64bit_integer_instructions() */
 #define MUL64                     (1U << 0)
index 819e8aa60dd3e5a6e0dc8b1a60f777fb01fd7c9a..18d27cbf6b17085707cb0b6b217926edda561d0b 100644 (file)
@@ -621,6 +621,17 @@ ir_validate::visit_leave(ir_expression *ir)
       assert(ir->operands[0]->type->base_type ==
              ir->operands[1]->type->base_type);
 
+      if (ir->operation == ir_binop_mul &&
+          (ir->type->base_type == GLSL_TYPE_UINT64 ||
+           ir->type->base_type == GLSL_TYPE_INT64) &&
+          (ir->operands[0]->type->base_type == GLSL_TYPE_INT ||
+           ir->operands[1]->type->base_type == GLSL_TYPE_INT ||
+           ir->operands[0]->type->base_type == GLSL_TYPE_UINT ||
+           ir->operands[1]->type->base_type == GLSL_TYPE_UINT)) {
+         assert(ir->operands[0]->type == ir->operands[1]->type);
+         break;
+      }
+
       if (ir->operands[0]->type->is_scalar())
         assert(ir->operands[1]->type == ir->type);
       else if (ir->operands[1]->type->is_scalar())
index 91f71b37619af38c11a43e32c53ab19fd5673f1a..8e0c8744048fc575d875e340abb6337f5643bbc1 100644 (file)
@@ -169,6 +169,7 @@ private:
    void find_msb_to_float_cast(ir_expression *ir);
    void imul_high_to_mul(ir_expression *ir);
    void sqrt_to_abs_sqrt(ir_expression *ir);
+   void mul64_to_mul_and_mul_high(ir_expression *ir);
 
    ir_expression *_carry(operand a, operand b);
 };
@@ -1666,6 +1667,66 @@ lower_instructions_visitor::sqrt_to_abs_sqrt(ir_expression *ir)
    this->progress = true;
 }
 
+void
+lower_instructions_visitor::mul64_to_mul_and_mul_high(ir_expression *ir)
+{
+   /* Lower 32x32-> 64 to
+    *    msb = imul_high(x_lo, y_lo)
+    *    lsb = mul(x_lo, y_lo)
+    */
+   const unsigned elements = ir->operands[0]->type->vector_elements;
+
+   const ir_expression_operation operation =
+      ir->type->base_type == GLSL_TYPE_UINT64 ? ir_unop_pack_uint_2x32
+                                              : ir_unop_pack_int_2x32;
+
+   const glsl_type *var_type = ir->type->base_type == GLSL_TYPE_UINT64
+                               ? glsl_type::uvec(elements)
+                               : glsl_type::ivec(elements);
+
+   const glsl_type *ret_type = ir->type->base_type == GLSL_TYPE_UINT64
+                               ? glsl_type::uvec2_type
+                               : glsl_type::ivec2_type;
+
+   ir_instruction &i = *base_ir;
+
+   ir_variable *msb =
+      new(ir) ir_variable(var_type, "msb", ir_var_temporary);
+   ir_variable *lsb =
+      new(ir) ir_variable(var_type, "lsb", ir_var_temporary);
+   ir_variable *x =
+      new(ir) ir_variable(var_type, "x", ir_var_temporary);
+   ir_variable *y =
+      new(ir) ir_variable(var_type, "y", ir_var_temporary);
+
+   i.insert_before(x);
+   i.insert_before(assign(x, ir->operands[0]));
+   i.insert_before(y);
+   i.insert_before(assign(y, ir->operands[1]));
+   i.insert_before(msb);
+   i.insert_before(lsb);
+
+   i.insert_before(assign(msb, imul_high(x, y)));
+   i.insert_before(assign(lsb, mul(x, y)));
+
+   ir_rvalue *result[4] = {NULL};
+   for (unsigned elem = 0; elem < elements; elem++) {
+      ir_rvalue *val = new(ir) ir_expression(ir_quadop_vector, ret_type,
+                                             swizzle(lsb, elem, 1),
+                                             swizzle(msb, elem, 1), NULL, NULL);
+      result[elem] = expr(operation, val);
+   }
+
+   ir->operation = ir_quadop_vector;
+   ir->init_num_operands();
+   ir->operands[0] = result[0];
+   ir->operands[1] = result[1];
+   ir->operands[2] = result[2];
+   ir->operands[3] = result[3];
+
+   this->progress = true;
+}
+
 ir_visitor_status
 lower_instructions_visitor::visit_leave(ir_expression *ir)
 {
@@ -1803,6 +1864,15 @@ lower_instructions_visitor::visit_leave(ir_expression *ir)
          imul_high_to_mul(ir);
       break;
 
+   case ir_binop_mul:
+      if (lowering(MUL64_TO_MUL_AND_MUL_HIGH) &&
+          (ir->type->base_type == GLSL_TYPE_INT64 ||
+           ir->type->base_type == GLSL_TYPE_UINT64) &&
+          (ir->operands[0]->type->base_type == GLSL_TYPE_INT ||
+           ir->operands[1]->type->base_type == GLSL_TYPE_UINT))
+         mul64_to_mul_and_mul_high(ir);
+      break;
+
    case ir_unop_rsq:
    case ir_unop_sqrt:
       if (lowering(SQRT_TO_ABS_SQRT))
index e65a6743353f091268009da3313efc7c3bf3a6c6..ed194eb13a3fbea00be1e967a6e308117419112d 100644 (file)
@@ -3053,6 +3053,7 @@ _mesa_ir_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
         do_mat_op_to_vec(ir);
         lower_instructions(ir, (MOD_TO_FLOOR | DIV_TO_MUL_RCP | EXP_TO_EXP2
                                 | LOG_TO_LOG2 | INT_DIV_TO_MUL_RCP
+                                | MUL64_TO_MUL_AND_MUL_HIGH
                                 | ((options->EmitNoPow) ? POW_TO_EXP2 : 0)));
 
         progress = do_common_optimization(ir, true, true,
index 484a5329455c8d5182661105e3486530bd7849de..264557c9f58249396bdaf6a5859bf17a24c24777 100644 (file)
@@ -7379,6 +7379,7 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
                          FDIV_TO_MUL_RCP |
                          EXP_TO_EXP2 |
                          LOG_TO_LOG2 |
+                         MUL64_TO_MUL_AND_MUL_HIGH |
                          (have_ldexp ? 0 : LDEXP_TO_ARITH) |
                          (have_dfrexp ? 0 : DFREXP_DLDEXP_TO_ARITH) |
                          CARRY_TO_ARITH |