From: Sagar Ghuge Date: Fri, 15 Feb 2019 07:08:39 +0000 (-0800) Subject: nir/glsl: Add another way of doing lower_imul64 for gen8+ X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=e551040c602d392019e68f54d9a3a310d2a937a3;p=mesa.git nir/glsl: Add another way of doing lower_imul64 for gen8+ On Gen 8 and 9, "mul" instruction supports 64 bit destination type. We can reduce our 64x64 int multiplication from 4 instructions to 3. Also instead of emitting two mul instructions, we can emit a single mul instruction and extract low/high 32 bits from 64 bit result for [i/u]mulExtended v2: 1) Allow lower_mul_high64 to use new opcode (Jason Ekstrand) 2) Add lower_mul_2x32_64 flag (Matt Turner) 3) Remove associative property as bit size is different (Connor Abbott) v3: Fix indentation and variable naming convention (Jason Ekstrand) Signed-off-by: Sagar Ghuge Reviewed-by: Jason Ekstrand --- diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 0c96bdfbc56..0b10fb2e2b4 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -2118,6 +2118,7 @@ typedef enum { nir_lower_logic64 = (1 << 9), nir_lower_minmax64 = (1 << 10), nir_lower_shift64 = (1 << 11), + nir_lower_imul_2x32_64 = (1 << 12), } nir_lower_int64_options; typedef enum { @@ -2259,6 +2260,9 @@ typedef struct nir_shader_compiler_options { */ bool use_interpolated_input_intrinsics; + /* Lowers when 32x32->64 bit multiplication is not supported */ + bool lower_mul_2x32_64; + unsigned max_unroll_iterations; nir_lower_int64_options lower_int64_options; diff --git a/src/compiler/nir/nir_lower_int64.c b/src/compiler/nir/nir_lower_int64.c index 1c4b4b33797..6aae1816bd2 100644 --- a/src/compiler/nir/nir_lower_int64.c +++ b/src/compiler/nir/nir_lower_int64.c @@ -383,6 +383,16 @@ lower_imin64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y) return nir_bcsel(b, lower_int64_compare(b, nir_op_ilt, x, y), x, y); } +static nir_ssa_def * +lower_mul_2x32_64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y, + bool 
sign_extend) +{ + nir_ssa_def *res_hi = sign_extend ? nir_imul_high(b, x, y) + : nir_umul_high(b, x, y); + + return nir_pack_64_2x32_split(b, nir_imul(b, x, y), res_hi); +} + static nir_ssa_def * lower_imul64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y) { @@ -391,12 +401,13 @@ lower_imul64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y) nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y); nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y); - nir_ssa_def *res_lo = nir_imul(b, x_lo, y_lo); - nir_ssa_def *res_hi = nir_iadd(b, nir_umul_high(b, x_lo, y_lo), + nir_ssa_def *mul_lo = nir_umul_2x32_64(b, x_lo, y_lo); + nir_ssa_def *res_hi = nir_iadd(b, nir_unpack_64_2x32_split_y(b, mul_lo), nir_iadd(b, nir_imul(b, x_lo, y_hi), nir_imul(b, x_hi, y_lo))); - return nir_pack_64_2x32_split(b, res_lo, res_hi); + return nir_pack_64_2x32_split(b, nir_unpack_64_2x32_split_x(b, mul_lo), + res_hi); } static nir_ssa_def * @@ -441,9 +452,8 @@ lower_mul_high64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y, * so we're guaranteed that we can add in two more 32-bit values * without overflowing tmp. 
*/ - nir_ssa_def *tmp = - nir_pack_64_2x32_split(b, nir_imul(b, x32[i], y32[j]), - nir_umul_high(b, x32[i], y32[j])); + nir_ssa_def *tmp = nir_umul_2x32_64(b, x32[i], y32[j]); + if (res[i + j]) tmp = nir_iadd(b, tmp, nir_u2u64(b, res[i + j])); if (carry) @@ -626,6 +636,9 @@ opcode_to_options_mask(nir_op opcode) switch (opcode) { case nir_op_imul: return nir_lower_imul64; + case nir_op_imul_2x32_64: + case nir_op_umul_2x32_64: + return nir_lower_imul_2x32_64; case nir_op_imul_high: case nir_op_umul_high: return nir_lower_imul_high64; @@ -688,6 +701,10 @@ lower_int64_alu_instr(nir_builder *b, nir_alu_instr *alu) switch (alu->op) { case nir_op_imul: return lower_imul64(b, src[0], src[1]); + case nir_op_imul_2x32_64: + return lower_mul_2x32_64(b, src[0], src[1], true); + case nir_op_umul_2x32_64: + return lower_mul_2x32_64(b, src[0], src[1], false); case nir_op_imul_high: return lower_mul_high64(b, src[0], src[1], true); case nir_op_umul_high: diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py index 499deb947e8..42f8662352e 100644 --- a/src/compiler/nir/nir_opcodes.py +++ b/src/compiler/nir/nir_opcodes.py @@ -475,6 +475,12 @@ binop("fmul", tfloat, commutative + associative, "src0 * src1") # low 32-bits of signed/unsigned integer multiply binop("imul", tint, commutative + associative, "src0 * src1") +# Generate a 64-bit result from two 32-bit quantities +binop_convert("imul_2x32_64", tint64, tint32, commutative, + "(int64_t)src0 * (int64_t)src1") +binop_convert("umul_2x32_64", tuint64, tuint32, commutative, + "(uint64_t)src0 * (uint64_t)src1") + # high 32-bits of signed integer multiply binop("imul_high", tint, commutative, """ if (bit_size == 64) { diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index 617ca0ea933..53cfa94ae93 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -70,6 +70,8 @@ optimizations = [ (('imul', a, '#b@32(is_pos_power_of_two)'), 
('ishl', a, ('find_lsb', b))), (('imul', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b))))), + (('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'), + (('umul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('umul_high', a, b)), 'options->lower_mul_2x32_64'), (('udiv', a, 1), a), (('idiv', a, 1), a), (('umod', a, 1), 0), diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c index b3df0d9fa23..28793b1f0e6 100644 --- a/src/intel/compiler/brw_compiler.c +++ b/src/intel/compiler/brw_compiler.c @@ -171,6 +171,13 @@ brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo) fp64_options |= nir_lower_fp64_full_software; } + /* The Bspec's section titled "Instruction_multiply[DevBDW+]" claims that + * destination type can be Quadword and source type Doubleword for Gen8 and + * Gen9. So, lower 64 bit multiply instruction on the rest of the platforms. + */ + if (devinfo->gen < 8 || devinfo->gen > 9) + int64_options |= nir_lower_imul_2x32_64; + /* We want the GLSL compiler to emit code that uses condition codes */ for (int i = 0; i < MESA_SHADER_STAGES; i++) { compiler->glsl_compiler_options[i].MaxUnrollIterations = 0; diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index a4fd29ff9ec..a7abaf742e2 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -1055,6 +1055,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) inst->saturate = instr->dest.saturate; break; + case nir_op_imul_2x32_64: + case nir_op_umul_2x32_64: + bld.MUL(result, op[0], op[1]); + break; + case nir_op_imul: assert(nir_dest_bit_size(instr->dest.dest) < 64); bld.MUL(result, op[0], op[1]);