Optimize mulExtended to use 32x32->64 multiplication.
Drivers which are not based on NIR can set the
MUL64_TO_MUL_AND_MUL_HIGH lowering flag in order to keep the
same old behavior.
v2: Add missing condition check (Jason Ekstrand)
Signed-off-by: Sagar Ghuge <sagar.ghuge@intel.com>
Suggested-by: Matt Turner <mattst88@gmail.com>
Suggested-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
ir_function_signature *
builtin_builder::_mulExtended(const glsl_type *type)
{
+ /* Compute the full product with a single 32x32->64 multiply, then split
+ * each 64-bit result into its high/low 32-bit halves with unpack_*_2x32
+ * instead of emitting separate imul_high/mul instructions.
+ */
+ /* Select the signed vs. unsigned 64-bit result type and matching unpack op. */
+ const glsl_type *mul_type, *unpack_type;
+ ir_expression_operation unpack_op;
+
+ if (type->base_type == GLSL_TYPE_INT) {
+ unpack_op = ir_unop_unpack_int_2x32;
+ mul_type = glsl_type::get_instance(GLSL_TYPE_INT64, type->vector_elements, 1);
+ unpack_type = glsl_type::ivec2_type;
+ } else {
+ unpack_op = ir_unop_unpack_uint_2x32;
+ mul_type = glsl_type::get_instance(GLSL_TYPE_UINT64, type->vector_elements, 1);
+ unpack_type = glsl_type::uvec2_type;
+ }
+
ir_variable *x = in_var(type, "x");
ir_variable *y = in_var(type, "y");
ir_variable *msb = out_var(type, "msb");
ir_variable *lsb = out_var(type, "lsb");
MAKE_SIG(glsl_type::void_type, gpu_shader5_or_es31_or_integer_functions, 4, x, y, msb, lsb);
+ ir_variable *unpack_val = body.make_temp(unpack_type, "_unpack_val");
+
+ /* Build the multiply by hand: its result type (64-bit) differs from the
+ * 32-bit operand type, so the generic mul() helper cannot be used here.
+ */
+ ir_expression *mul_res = new(mem_ctx) ir_expression(ir_binop_mul, mul_type,
+ new(mem_ctx)ir_dereference_variable(x),
+ new(mem_ctx)ir_dereference_variable(y));
+
+ if (type->vector_elements == 1) {
+ /* Scalar: unpack once; .y holds the high word (msb), .x the low (lsb). */
+ body.emit(assign(unpack_val, expr(unpack_op, mul_res)));
+ body.emit(assign(msb, swizzle_y(unpack_val)));
+ body.emit(assign(lsb, swizzle_x(unpack_val)));
+ } else {
+ /* Vector: unpack each component's 64-bit product separately. */
+ for (int i = 0; i < type->vector_elements; i++) {
+ body.emit(assign(unpack_val, expr(unpack_op, swizzle(mul_res, i, 1))));
+ body.emit(assign(array_ref(msb, i), swizzle_y(unpack_val)));
+ body.emit(assign(array_ref(lsb, i), swizzle_x(unpack_val)));
+ }
+ }
return sig;
}
: nir_isub(&b, srcs[0], srcs[1]);
break;
case ir_binop_mul:
- result = type_is_float(out_type) ? nir_fmul(&b, srcs[0], srcs[1])
- : nir_imul(&b, srcs[0], srcs[1]);
+ if (type_is_float(out_type))
+ result = nir_fmul(&b, srcs[0], srcs[1]);
+ else if (out_type == GLSL_TYPE_INT64 &&
+ (ir->operands[0]->type->base_type == GLSL_TYPE_INT ||
+ ir->operands[1]->type->base_type == GLSL_TYPE_INT))
+ result = nir_imul_2x32_64(&b, srcs[0], srcs[1]);
+ else if (out_type == GLSL_TYPE_UINT64 &&
+ (ir->operands[0]->type->base_type == GLSL_TYPE_UINT ||
+ ir->operands[1]->type->base_type == GLSL_TYPE_UINT))
+ result = nir_umul_2x32_64(&b, srcs[0], srcs[1]);
+ else
+ result = nir_imul(&b, srcs[0], srcs[1]);
break;
case ir_binop_div:
if (type_is_float(out_type))
#define DDIV_TO_MUL_RCP 0x100000
#define DIV_TO_MUL_RCP (FDIV_TO_MUL_RCP | DDIV_TO_MUL_RCP)
#define SQRT_TO_ABS_SQRT 0x200000
+#define MUL64_TO_MUL_AND_MUL_HIGH 0x400000
/* Operations for lower_64bit_integer_instructions() */
#define MUL64 (1U << 0)
assert(ir->operands[0]->type->base_type ==
ir->operands[1]->type->base_type);
+ if (ir->operation == ir_binop_mul &&
+ (ir->type->base_type == GLSL_TYPE_UINT64 ||
+ ir->type->base_type == GLSL_TYPE_INT64) &&
+ (ir->operands[0]->type->base_type == GLSL_TYPE_INT ||
+ ir->operands[1]->type->base_type == GLSL_TYPE_INT ||
+ ir->operands[0]->type->base_type == GLSL_TYPE_UINT ||
+ ir->operands[1]->type->base_type == GLSL_TYPE_UINT)) {
+ assert(ir->operands[0]->type == ir->operands[1]->type);
+ break;
+ }
+
if (ir->operands[0]->type->is_scalar())
assert(ir->operands[1]->type == ir->type);
else if (ir->operands[1]->type->is_scalar())
void find_msb_to_float_cast(ir_expression *ir);
void imul_high_to_mul(ir_expression *ir);
void sqrt_to_abs_sqrt(ir_expression *ir);
+ void mul64_to_mul_and_mul_high(ir_expression *ir);
ir_expression *_carry(operand a, operand b);
};
this->progress = true;
}
+void
+lower_instructions_visitor::mul64_to_mul_and_mul_high(ir_expression *ir)
+{
+ /* Lower a 32x32->64 multiply into a mul/imul_high pair:
+ * msb = imul_high(x_lo, y_lo)
+ * lsb = mul(x_lo, y_lo)
+ * and reassemble each 64-bit component by packing (lsb, msb).
+ */
+ const unsigned elements = ir->operands[0]->type->vector_elements;
+
+ const ir_expression_operation operation =
+ ir->type->base_type == GLSL_TYPE_UINT64 ? ir_unop_pack_uint_2x32
+ : ir_unop_pack_int_2x32;
+
+ /* 32-bit (u)int type matching the operands' signedness and width. */
+ const glsl_type *var_type = ir->type->base_type == GLSL_TYPE_UINT64
+ ? glsl_type::uvec(elements)
+ : glsl_type::ivec(elements);
+
+ /* 2-component type fed to the pack op: (low word, high word). */
+ const glsl_type *ret_type = ir->type->base_type == GLSL_TYPE_UINT64
+ ? glsl_type::uvec2_type
+ : glsl_type::ivec2_type;
+
+ /* Insert the lowered temporaries and assignments before the
+ * instruction that contains this expression.
+ */
+ ir_instruction &i = *base_ir;
+
+ ir_variable *msb =
+ new(ir) ir_variable(var_type, "msb", ir_var_temporary);
+ ir_variable *lsb =
+ new(ir) ir_variable(var_type, "lsb", ir_var_temporary);
+ ir_variable *x =
+ new(ir) ir_variable(var_type, "x", ir_var_temporary);
+ ir_variable *y =
+ new(ir) ir_variable(var_type, "y", ir_var_temporary);
+
+ i.insert_before(x);
+ i.insert_before(assign(x, ir->operands[0]));
+ i.insert_before(y);
+ i.insert_before(assign(y, ir->operands[1]));
+ i.insert_before(msb);
+ i.insert_before(lsb);
+
+ i.insert_before(assign(msb, imul_high(x, y)));
+ i.insert_before(assign(lsb, mul(x, y)));
+
+ /* Pack each component's (lsb, msb) pair back into one 64-bit value. */
+ ir_rvalue *result[4] = {NULL};
+ for (unsigned elem = 0; elem < elements; elem++) {
+ ir_rvalue *val = new(ir) ir_expression(ir_quadop_vector, ret_type,
+ swizzle(lsb, elem, 1),
+ swizzle(msb, elem, 1), NULL, NULL);
+ result[elem] = expr(operation, val);
+ }
+
+ /* Rewrite the expression in place as a vector of the packed results.
+ * NOTE(review): for elements < 4 the trailing operands remain NULL;
+ * presumably init_num_operands() shrinks num_operands so they are never
+ * read — confirm against ir_expression::init_num_operands().
+ */
+ ir->operation = ir_quadop_vector;
+ ir->init_num_operands();
+ ir->operands[0] = result[0];
+ ir->operands[1] = result[1];
+ ir->operands[2] = result[2];
+ ir->operands[3] = result[3];
+
+ this->progress = true;
+}
+
+
ir_visitor_status
lower_instructions_visitor::visit_leave(ir_expression *ir)
{
imul_high_to_mul(ir);
break;
+ /* Lower a 32x32->64 multiply (64-bit result, 32-bit operands) back to a
+ * mul + imul_high pair for drivers that request it.
+ * NOTE(review): the operand test is asymmetric — operand[0] is checked
+ * for INT but operand[1] for UINT. Elsewhere the same lowering path
+ * asserts both operands share a base type, so one test per slot may be
+ * deliberate; confirm this is not a typo for (INT || UINT) on each
+ * operand.
+ */
+ case ir_binop_mul:
+ if (lowering(MUL64_TO_MUL_AND_MUL_HIGH) &&
+ (ir->type->base_type == GLSL_TYPE_INT64 ||
+ ir->type->base_type == GLSL_TYPE_UINT64) &&
+ (ir->operands[0]->type->base_type == GLSL_TYPE_INT ||
+ ir->operands[1]->type->base_type == GLSL_TYPE_UINT))
+ mul64_to_mul_and_mul_high(ir);
+ break;
+
case ir_unop_rsq:
case ir_unop_sqrt:
if (lowering(SQRT_TO_ABS_SQRT))
do_mat_op_to_vec(ir);
lower_instructions(ir, (MOD_TO_FLOOR | DIV_TO_MUL_RCP | EXP_TO_EXP2
| LOG_TO_LOG2 | INT_DIV_TO_MUL_RCP
+ | MUL64_TO_MUL_AND_MUL_HIGH
| ((options->EmitNoPow) ? POW_TO_EXP2 : 0)));
progress = do_common_optimization(ir, true, true,
FDIV_TO_MUL_RCP |
EXP_TO_EXP2 |
LOG_TO_LOG2 |
+ MUL64_TO_MUL_AND_MUL_HIGH |
(have_ldexp ? 0 : LDEXP_TO_ARITH) |
(have_dfrexp ? 0 : DFREXP_DLDEXP_TO_ARITH) |
CARRY_TO_ARITH |