From 8b98d0954e6168484479cf51d56bface448d00d5 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Tue, 5 Feb 2019 15:56:24 +0000 Subject: [PATCH] nir/lower_idiv: add new llvm-based path MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit v2: make variable names snake_case v2: minor cleanups in emit_udiv() v2: fix Panfrost build failure v3: use an enum instead of a boolean flag in nir_lower_idiv()'s signature v4: remove nir_op_urcp v5: drop nv50 path v5: rebase v6: add back nv50 path v6: add comment for nir_lower_idiv_path enum v7: rename _nv50/_llvm to _fast/_precise v8: fix etnaviv build failure Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann --- .../aco_instruction_selection_setup.cpp | 2 +- src/broadcom/compiler/vir.c | 2 +- src/compiler/nir/nir.h | 14 +- src/compiler/nir/nir_lower_idiv.c | 127 ++++++++++++++++-- src/freedreno/ir3/ir3_nir.c | 2 +- .../drivers/etnaviv/etnaviv_compiler_nir.c | 2 +- src/gallium/drivers/vc4/vc4_program.c | 2 +- src/panfrost/midgard/midgard_compile.c | 2 +- 8 files changed, 136 insertions(+), 17 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index 35383671c21..fa457e2e246 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -1328,7 +1328,7 @@ setup_isel_context(Program* program, nir_lower_iabs64)); nir_opt_idiv_const(nir, 32); - nir_lower_idiv(nir); // TODO: use the LLVM path once !1239 is merged + nir_lower_idiv(nir, nir_lower_idiv_fast); // TODO: use the LLVM path once !1239 is merged /* optimize the lowered ALU operations */ nir_copy_prop(nir); diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index dc5d3fe3bed..af8c7aab1a7 100644 --- a/src/broadcom/compiler/vir.c +++ b/src/broadcom/compiler/vir.c @@ -932,7 +932,7 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler, NIR_PASS_V(c->s, v3d_nir_lower_io, 
c); NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c); NIR_PASS_V(c->s, v3d_nir_lower_image_load_store); - NIR_PASS_V(c->s, nir_lower_idiv); + NIR_PASS_V(c->s, nir_lower_idiv, nir_lower_idiv_fast); v3d_optimize_nir(c->s); diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index d002102cad8..6b437040dcc 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -3917,7 +3917,19 @@ enum nir_lower_non_uniform_access_type { bool nir_lower_non_uniform_access(nir_shader *shader, enum nir_lower_non_uniform_access_type); -bool nir_lower_idiv(nir_shader *shader); +enum nir_lower_idiv_path { + /* This path is based on NV50LegalizeSSA::handleDIV(). It is the faster of + * the two but it is not exact in some cases (for example, 1091317713u / + * 1034u gives 5209173 instead of 1055432) */ + nir_lower_idiv_fast, + /* This path is based on AMDGPUTargetLowering::LowerUDIVREM() and + * AMDGPUTargetLowering::LowerSDIVREM(). It requires more instructions than + * the nv50 path and many of them are integer multiplications, so it is + * probably slower. It should always return the correct result, though. */ + nir_lower_idiv_precise, +}; + +bool nir_lower_idiv(nir_shader *shader, enum nir_lower_idiv_path path); bool nir_lower_input_attachments(nir_shader *shader, bool use_fragcoord_sysval); diff --git a/src/compiler/nir/nir_lower_idiv.c b/src/compiler/nir/nir_lower_idiv.c index c59a3eb8b3d..ed8cebd2963 100644 --- a/src/compiler/nir/nir_lower_idiv.c +++ b/src/compiler/nir/nir_lower_idiv.c @@ -27,13 +27,17 @@ #include "nir.h" #include "nir_builder.h" -/* Lowers idiv/udiv/umod - * Based on NV50LegalizeSSA::handleDIV() +/* Has two paths + * One (nir_lower_idiv_fast) lowers idiv/udiv/umod and is based on + * NV50LegalizeSSA::handleDIV() * - * Note that this is probably not enough precision for compute shaders. - * Perhaps we want a second higher precision (looping) version of this? 
- * Or perhaps we assume if you can do compute shaders you can also - * branch out to a pre-optimized shader library routine.. + * Note that this path probably does not have enough precision for + * compute shaders. Perhaps we want a second higher precision (looping) + * version of this? Or perhaps we assume if you can do compute shaders you + * can also branch out to a pre-optimized shader library routine.. + * + * The other path (nir_lower_idiv_precise) is based off of code used by LLVM's + * AMDGPU target. It should handle 32-bit idiv/irem/imod/udiv/umod exactly. */ static bool @@ -130,8 +134,109 @@ convert_instr(nir_builder *bld, nir_alu_instr *alu) return true; } +/* ported from LLVM's AMDGPUTargetLowering::LowerUDIVREM */ +static nir_ssa_def * +emit_udiv(nir_builder *bld, nir_ssa_def *numer, nir_ssa_def *denom, bool modulo) +{ + nir_ssa_def *rcp = nir_frcp(bld, nir_u2f32(bld, denom)); + rcp = nir_f2u32(bld, nir_fmul_imm(bld, rcp, 4294967296.0)); + nir_ssa_def *rcp_lo = nir_imul(bld, rcp, denom); + nir_ssa_def *rcp_hi = nir_umul_high(bld, rcp, denom); + nir_ssa_def *rcp_hi_ne_zero = nir_ine(bld, rcp_hi, nir_imm_int(bld, 0)); + nir_ssa_def *neg_rcp_lo = nir_ineg(bld, rcp_lo); + nir_ssa_def *abs_rcp_lo = nir_bcsel(bld, rcp_hi_ne_zero, rcp_lo, neg_rcp_lo); + nir_ssa_def *e = nir_umul_high(bld, abs_rcp_lo, rcp); + nir_ssa_def *rcp_plus_e = nir_iadd(bld, rcp, e); + nir_ssa_def *rcp_minus_e = nir_isub(bld, rcp, e); + nir_ssa_def *tmp0 = nir_bcsel(bld, rcp_hi_ne_zero, rcp_minus_e, rcp_plus_e); + nir_ssa_def *quotient = nir_umul_high(bld, tmp0, numer); + nir_ssa_def *num_s_remainder = nir_imul(bld, quotient, denom); + nir_ssa_def *remainder = nir_isub(bld, numer, num_s_remainder); + nir_ssa_def *remainder_ge_den = nir_uge(bld, remainder, denom); + nir_ssa_def *remainder_ge_zero = nir_uge(bld, numer, num_s_remainder); + nir_ssa_def *tmp1 = nir_iand(bld, remainder_ge_den, remainder_ge_zero); + + if (modulo) { + nir_ssa_def *rem = nir_bcsel(bld, tmp1, + nir_isub(bld,
remainder, denom), remainder); + return nir_bcsel(bld, remainder_ge_zero, + rem, nir_iadd(bld, remainder, denom)); + } else { + nir_ssa_def *one = nir_imm_int(bld, 1); + nir_ssa_def *div = nir_bcsel(bld, tmp1, + nir_iadd(bld, quotient, one), quotient); + return nir_bcsel(bld, remainder_ge_zero, + div, nir_isub(bld, quotient, one)); + } +} + +/* ported from LLVM's AMDGPUTargetLowering::LowerSDIVREM */ +static nir_ssa_def * +emit_idiv(nir_builder *bld, nir_ssa_def *numer, nir_ssa_def *denom, nir_op op) +{ + nir_ssa_def *lh_sign = nir_ilt(bld, numer, nir_imm_int(bld, 0)); + nir_ssa_def *rh_sign = nir_ilt(bld, denom, nir_imm_int(bld, 0)); + lh_sign = nir_bcsel(bld, lh_sign, nir_imm_int(bld, -1), nir_imm_int(bld, 0)); + rh_sign = nir_bcsel(bld, rh_sign, nir_imm_int(bld, -1), nir_imm_int(bld, 0)); + + nir_ssa_def *lhs = nir_iadd(bld, numer, lh_sign); + nir_ssa_def *rhs = nir_iadd(bld, denom, rh_sign); + lhs = nir_ixor(bld, lhs, lh_sign); + rhs = nir_ixor(bld, rhs, rh_sign); + + if (op == nir_op_idiv) { + nir_ssa_def *d_sign = nir_ixor(bld, lh_sign, rh_sign); + nir_ssa_def *res = emit_udiv(bld, lhs, rhs, false); + res = nir_ixor(bld, res, d_sign); + return nir_isub(bld, res, d_sign); + } else { + nir_ssa_def *res = emit_udiv(bld, lhs, rhs, true); + res = nir_ixor(bld, res, lh_sign); + res = nir_isub(bld, res, lh_sign); + if (op == nir_op_imod) { + nir_ssa_def *cond = nir_ieq(bld, res, nir_imm_int(bld, 0)); + cond = nir_ior(bld, nir_ieq(bld, lh_sign, rh_sign), cond); + res = nir_bcsel(bld, cond, res, nir_iadd(bld, res, denom)); + } + return res; + } +} + +static bool +convert_instr_precise(nir_builder *bld, nir_alu_instr *alu) +{ + nir_op op = alu->op; + + if ((op != nir_op_idiv) && + (op != nir_op_imod) && + (op != nir_op_irem) && + (op != nir_op_udiv) && + (op != nir_op_umod)) + return false; + + if (alu->dest.dest.ssa.bit_size != 32) + return false; + + bld->cursor = nir_before_instr(&alu->instr); + + nir_ssa_def *numer = nir_ssa_for_alu_src(bld, alu, 0); + nir_ssa_def 
*denom = nir_ssa_for_alu_src(bld, alu, 1); + + nir_ssa_def *res = NULL; + + if (op == nir_op_udiv || op == nir_op_umod) + res = emit_udiv(bld, numer, denom, op == nir_op_umod); + else + res = emit_idiv(bld, numer, denom, op); + + assert(alu->dest.dest.is_ssa); + nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(res)); + + return true; +} + static bool -convert_impl(nir_function_impl *impl) +convert_impl(nir_function_impl *impl, enum nir_lower_idiv_path path) { nir_builder b; nir_builder_init(&b, impl); @@ -139,7 +244,9 @@ convert_impl(nir_function_impl *impl) nir_foreach_block(block, impl) { nir_foreach_instr_safe(instr, block) { - if (instr->type == nir_instr_type_alu) + if (instr->type == nir_instr_type_alu && path == nir_lower_idiv_precise) + progress |= convert_instr_precise(&b, nir_instr_as_alu(instr)); + else if (instr->type == nir_instr_type_alu) progress |= convert_instr(&b, nir_instr_as_alu(instr)); } } @@ -151,13 +258,13 @@ convert_impl(nir_function_impl *impl) } bool -nir_lower_idiv(nir_shader *shader) +nir_lower_idiv(nir_shader *shader, enum nir_lower_idiv_path path) { bool progress = false; nir_foreach_function(function, shader) { if (function->impl) - progress |= convert_impl(function->impl); + progress |= convert_impl(function->impl, path); } return progress; diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index f2fc46db7cc..99659a7ddef 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -277,7 +277,7 @@ ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s, * NOTE that UBO analysis pass should only be done once, before variants */ const bool ubo_progress = !key && OPT(s, ir3_nir_analyze_ubo_ranges, shader); - const bool idiv_progress = OPT(s, nir_lower_idiv); + const bool idiv_progress = OPT(s, nir_lower_idiv, nir_lower_idiv_fast); if (ubo_progress || idiv_progress) ir3_optimize_loop(s); diff --git a/src/gallium/drivers/etnaviv/etnaviv_compiler_nir.c 
b/src/gallium/drivers/etnaviv/etnaviv_compiler_nir.c index 96b669787fe..4cf48b46e78 100644 --- a/src/gallium/drivers/etnaviv/etnaviv_compiler_nir.c +++ b/src/gallium/drivers/etnaviv/etnaviv_compiler_nir.c @@ -762,7 +762,7 @@ etna_compile_shader_nir(struct etna_shader_variant *v) OPT_V(s, nir_opt_algebraic); OPT_V(s, nir_lower_bool_to_float); } else { - OPT_V(s, nir_lower_idiv); + OPT_V(s, nir_lower_idiv, nir_lower_idiv_fast); OPT_V(s, nir_lower_bool_to_int32); } diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index fc148861989..8ec181bc8fa 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -2321,7 +2321,7 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, NIR_PASS_V(c->s, vc4_nir_lower_io, c); NIR_PASS_V(c->s, vc4_nir_lower_txf_ms, c); - NIR_PASS_V(c->s, nir_lower_idiv); + NIR_PASS_V(c->s, nir_lower_idiv, nir_lower_idiv_fast); vc4_optimize_nir(c->s); diff --git a/src/panfrost/midgard/midgard_compile.c b/src/panfrost/midgard/midgard_compile.c index cd8a8165157..d73d6007e7b 100644 --- a/src/panfrost/midgard/midgard_compile.c +++ b/src/panfrost/midgard/midgard_compile.c @@ -482,7 +482,7 @@ optimise_nir(nir_shader *nir) NIR_PASS(progress, nir, nir_lower_regs_to_ssa); NIR_PASS(progress, nir, midgard_nir_lower_fdot2); - NIR_PASS(progress, nir, nir_lower_idiv); + NIR_PASS(progress, nir, nir_lower_idiv, nir_lower_idiv_fast); nir_lower_tex_options lower_tex_options = { .lower_txs_lod = true, -- 2.30.2