From 58907568ec526df87fa87177441743fa0d1d0a66 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Wed, 19 Sep 2018 01:28:06 -0700 Subject: [PATCH] intel/fs: Add SHADER_OPCODE_[IU]SUB_SAT pseudo-ops v2: Add a big comment explaining the [IU]SUB_SAT lowering. Suggested by Caio. v3: Use get_fpu_lowered_simd_width in get_lowered_simd_width. Suggested by Ken on IRC. v4: Fix a typo in a comment. Noticed by Caio. Reviewed-by: Caio Marcelo de Oliveira Filho Part-of: --- src/intel/compiler/brw_eu_defines.h | 6 ++ src/intel/compiler/brw_fs.cpp | 94 +++++++++++++++++++++++++++++ src/intel/compiler/brw_fs.h | 1 + 3 files changed, 101 insertions(+) diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h index 1b56903e213..c83a478b908 100644 --- a/src/intel/compiler/brw_eu_defines.h +++ b/src/intel/compiler/brw_eu_defines.h @@ -742,6 +742,12 @@ enum opcode { */ SHADER_OPCODE_MULH, + /** Signed subtraction with saturation. */ + SHADER_OPCODE_ISUB_SAT, + + /** Unsigned subtraction with saturation. */ + SHADER_OPCODE_USUB_SAT, + /** * A MOV that uses VxH indirect addressing. * diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index b09203cc87b..855c99dcb9a 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -4179,6 +4179,95 @@ fs_visitor::lower_minmax() return progress; } +bool +fs_visitor::lower_sub_sat() +{ + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { + const fs_builder ibld(this, block, inst); + + if (inst->opcode == SHADER_OPCODE_USUB_SAT || + inst->opcode == SHADER_OPCODE_ISUB_SAT) { + /* The fundamental problem is the hardware performs source negation + * at the bit width of the source. If the source is 0x80000000D, the + * negation is 0x80000000D. As a result, subtractSaturate(0, + * 0x80000000) will produce 0x80000000 instead of 0x7fffffff. There + * are at least three ways to resolve this: + * + * 1. Use the accumulator for the negated source. The accumulator is + * 33 bits, so our source 0x80000000 is sign-extended to + * 0x1800000000. The negation of which is 0x080000000. This + * doesn't help for 64-bit integers (which are already bigger than + * 33 bits). There are also only 8 accumulators, so SIMD16 or + * SIMD32 instructions would have to be split into multiple SIMD8 + * instructions. + * + * 2. Use slightly different math. For any n-bit value x, we know (x + * >> 1) != -(x >> 1). We can use this fact to only do + * subtractions involving (x >> 1). subtractSaturate(a, b) == + * subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)). + * + * 3. For unsigned sources, it is sufficient to replace the + * subtractSaturate with (a > b) ? a - b : 0. + * + * It may also be possible to use the SUBB instruction. This + * implicitly writes the accumulator, so it could only be used in the + * same situations as #1 above. It is further limited by only + * allowing UD sources. + */ + if (inst->exec_size == 8 && inst->src[0].type != BRW_REGISTER_TYPE_Q && + inst->src[0].type != BRW_REGISTER_TYPE_UQ) { + fs_reg acc(ARF, BRW_ARF_ACCUMULATOR, inst->src[1].type); + + ibld.MOV(acc, inst->src[1]); + fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]); + add->saturate = true; + add->src[0].negate = true; + } else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) { + /* tmp = src1 >> 1; + * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp)); + */ + fs_reg tmp1 = ibld.vgrf(inst->src[0].type); + fs_reg tmp2 = ibld.vgrf(inst->src[0].type); + fs_reg tmp3 = ibld.vgrf(inst->src[0].type); + fs_inst *add; + + ibld.SHR(tmp1, inst->src[1], brw_imm_d(1)); + + add = ibld.ADD(tmp2, inst->src[1], tmp1); + add->src[1].negate = true; + + add = ibld.ADD(tmp3, inst->src[0], tmp1); + add->src[1].negate = true; + add->saturate = true; + + add = ibld.ADD(inst->dst, tmp3, tmp2); + add->src[1].negate = true; + add->saturate = true; + } else { + /* a > b ? a - b : 0 */ + ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1], + BRW_CONDITIONAL_G); + + fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]); + add->src[1].negate = !add->src[1].negate; + + ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0)) + ->predicate = BRW_PREDICATE_NORMAL; + } + + inst->remove(block); + progress = true; + } + } + + if (progress) + invalidate_live_intervals(); + + return progress; +} + static void setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key, fs_reg *dst, fs_reg color, unsigned components) @@ -6279,6 +6368,10 @@ get_lowered_simd_width(const struct gen_device_info *devinfo, return MIN2(16, inst->exec_size); } + case SHADER_OPCODE_USUB_SAT: + case SHADER_OPCODE_ISUB_SAT: + return get_fpu_lowered_simd_width(devinfo, inst); + case SHADER_OPCODE_INT_QUOTIENT: case SHADER_OPCODE_INT_REMAINDER: /* Integer division is limited to SIMD8 on all generations. */ @@ -7390,6 +7483,7 @@ fs_visitor::optimize() OPT(opt_combine_constants); OPT(lower_integer_multiplication); + OPT(lower_sub_sat); if (devinfo->gen <= 5 && OPT(lower_minmax)) { OPT(opt_cmod_propagation); diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 7fac6d51c9e..5784da7535c 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -169,6 +169,7 @@ public: bool lower_simd_width(); bool lower_barycentrics(); bool lower_scoreboard(); + bool lower_sub_sat(); bool opt_combine_constants(); void emit_dummy_fs(); -- 2.30.2