From 75b3868dcc81e9b629e997f354aae691b7933f32 Mon Sep 17 00:00:00 2001 From: Paulo Zanoni Date: Thu, 11 Jul 2019 16:56:05 -0700 Subject: [PATCH] intel/compiler: extract subfunctions of lower_integer_multiplication() The lower_integer_multiplication() function is already a little too big. I want to add more to it, so let's reorganize the existing code first. Let's start with just extracting the current code to subfunctions. Later we'll change them a little more. v2: Make private functions private (Caio). v3: Fix typo (Caio). Reviewed-by: Matt Turner Reviewed-by: Caio Marcelo de Oliveira Filho Signed-off-by: Paulo Zanoni --- src/intel/compiler/brw_fs.cpp | 379 +++++++++++++++++----------------- src/intel/compiler/brw_fs.h | 4 + 2 files changed, 197 insertions(+), 186 deletions(-) diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 1a45a0f63d8..755157cf1cf 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -3862,6 +3862,197 @@ fs_visitor::lower_load_payload() return progress; } +void +fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block, + const fs_builder &ibld) +{ + if (inst->src[1].file == IMM && inst->src[1].ud < (1 << 16)) { + /* The MUL instruction isn't commutative. On Gen <= 6, only the low + * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of + * src1 are used. + * + * If multiplying by an immediate value that fits in 16-bits, do a + * single MUL instruction with that value in the proper location. + */ + if (devinfo->gen < 7) { + fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type); + ibld.MOV(imm, inst->src[1]); + ibld.MUL(inst->dst, imm, inst->src[0]); + } else { + const bool ud = (inst->src[1].type == BRW_REGISTER_TYPE_UD); + ibld.MUL(inst->dst, inst->src[0], + ud ? brw_imm_uw(inst->src[1].ud) + : brw_imm_w(inst->src[1].d)); + } + } else { + /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot + * do 32-bit integer multiplication in one instruction, but instead + * must do a sequence (which actually calculates a 64-bit result): + * + * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D + * mach(8) null g3<8,8,1>D g4<8,8,1>D + * mov(8) g2<1>D acc0<8,8,1>D + * + * But on Gen > 6, the ability to use second accumulator register + * (acc1) for non-float data types was removed, preventing a simple + * implementation in SIMD16. A 16-channel result can be calculated by + * executing the three instructions twice in SIMD8, once with quarter + * control of 1Q for the first eight channels and again with 2Q for + * the second eight channels. + * + * Which accumulator register is implicitly accessed (by AccWrEnable + * for instance) is determined by the quarter control. Unfortunately + * Ivybridge (and presumably Baytrail) has a hardware bug in which an + * implicit accumulator access by an instruction with 2Q will access + * acc1 regardless of whether the data type is usable in acc1. + * + * Specifically, the 2Q mach(8) writes acc1 which does not exist for + * integer data types. + * + * Since we only want the low 32-bits of the result, we can do two + * 32-bit x 16-bit multiplies (like the mul and mach are doing), and + * adjust the high result and add them (like the mach is doing): + * + * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW + * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW + * shl(8) g9<1>D g8<8,8,1>D 16D + * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D + * + * We avoid the shl instruction by realizing that we only want to add + * the low 16-bits of the "high" result to the high 16-bits of the + * "low" result and using proper regioning on the add: + * + * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW + * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW + * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW + * + * Since it does not use the (single) accumulator register, we can + * schedule multi-component multiplications much better. + */ + + bool needs_mov = false; + fs_reg orig_dst = inst->dst; + + /* Get a new VGRF for the "low" 32x16-bit multiplication result if + * reusing the original destination is impossible due to hardware + * restrictions, source/destination overlap, or it being the null + * register. + */ + fs_reg low = inst->dst; + if (orig_dst.is_null() || orig_dst.file == MRF || + regions_overlap(inst->dst, inst->size_written, + inst->src[0], inst->size_read(0)) || + regions_overlap(inst->dst, inst->size_written, + inst->src[1], inst->size_read(1)) || + inst->dst.stride >= 4) { + needs_mov = true; + low = fs_reg(VGRF, alloc.allocate(regs_written(inst)), + inst->dst.type); + } + + /* Get a new VGRF but keep the same stride as inst->dst */ + fs_reg high(VGRF, alloc.allocate(regs_written(inst)), inst->dst.type); + high.stride = inst->dst.stride; + high.offset = inst->dst.offset % REG_SIZE; + + if (devinfo->gen >= 7) { + if (inst->src[1].abs) + lower_src_modifiers(this, block, inst, 1); + + if (inst->src[1].file == IMM) { + ibld.MUL(low, inst->src[0], + brw_imm_uw(inst->src[1].ud & 0xffff)); + ibld.MUL(high, inst->src[0], + brw_imm_uw(inst->src[1].ud >> 16)); + } else { + ibld.MUL(low, inst->src[0], + subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0)); + ibld.MUL(high, inst->src[0], + subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 1)); + } + } else { + if (inst->src[0].abs) + lower_src_modifiers(this, block, inst, 0); + + ibld.MUL(low, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 0), + inst->src[1]); + ibld.MUL(high, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 1), + inst->src[1]); + } + + ibld.ADD(subscript(low, BRW_REGISTER_TYPE_UW, 1), + subscript(low, BRW_REGISTER_TYPE_UW, 1), + subscript(high, BRW_REGISTER_TYPE_UW, 0)); + + if (needs_mov || inst->conditional_mod) + set_condmod(inst->conditional_mod, ibld.MOV(orig_dst, low)); + } +} + +void +fs_visitor::lower_mulh_inst(fs_inst *inst, bblock_t *block, + const fs_builder &ibld) +{ + /* According to the BDW+ BSpec page for the "Multiply Accumulate + * High" instruction: + * + * "An added preliminary mov is required for source modification on + * src1: + * mov (8) r3.0<1>:d -r3<8;8,1>:d + * mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw + * mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d" + */ + if (devinfo->gen >= 8 && (inst->src[1].negate || inst->src[1].abs)) + lower_src_modifiers(this, block, inst, 1); + + /* Should have been lowered to 8-wide. */ + assert(inst->exec_size <= get_lowered_simd_width(devinfo, inst)); + const fs_reg acc = retype(brw_acc_reg(inst->exec_size), inst->dst.type); + fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]); + fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]); + + if (devinfo->gen >= 8) { + /* Until Gen8, integer multiplies read 32-bits from one source, + * and 16-bits from the other, and relying on the MACH instruction + * to generate the high bits of the result. + * + * On Gen8, the multiply instruction does a full 32x32-bit + * multiply, but in order to do a 64-bit multiply we can simulate + * the previous behavior and then use a MACH instruction. + */ + assert(mul->src[1].type == BRW_REGISTER_TYPE_D || + mul->src[1].type == BRW_REGISTER_TYPE_UD); + mul->src[1].type = BRW_REGISTER_TYPE_UW; + mul->src[1].stride *= 2; + + if (mul->src[1].file == IMM) { + mul->src[1] = brw_imm_uw(mul->src[1].ud); + } + } else if (devinfo->gen == 7 && !devinfo->is_haswell && + inst->group > 0) { + /* Among other things the quarter control bits influence which + * accumulator register is used by the hardware for instructions + * that access the accumulator implicitly (e.g. MACH). A + * second-half instruction would normally map to acc1, which + * doesn't exist on Gen7 and up (the hardware does emulate it for + * floating-point instructions *only* by taking advantage of the + * extra precision of acc0 not normally used for floating point + * arithmetic). + * + * HSW and up are careful enough not to try to access an + * accumulator register that doesn't exist, but on earlier Gen7 + * hardware we need to make sure that the quarter control bits are + * zero to avoid non-deterministic behaviour and emit an extra MOV + * to get the result masked correctly according to the current + * channel enables. + */ + mach->group = 0; + mach->force_writemask_all = true; + mach->dst = ibld.vgrf(inst->dst.type); + ibld.MOV(inst->dst, mach->dst); + } +} + bool fs_visitor::lower_integer_multiplication() { @@ -3879,193 +4070,9 @@ fs_visitor::lower_integer_multiplication() if (devinfo->has_integer_dword_mul) continue; - if (inst->src[1].file == IMM && - inst->src[1].ud < (1 << 16)) { - /* The MUL instruction isn't commutative. On Gen <= 6, only the low - * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of - * src1 are used. - * - * If multiplying by an immediate value that fits in 16-bits, do a - * single MUL instruction with that value in the proper location. - */ - if (devinfo->gen < 7) { - fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8), - inst->dst.type); - ibld.MOV(imm, inst->src[1]); - ibld.MUL(inst->dst, imm, inst->src[0]); - } else { - const bool ud = (inst->src[1].type == BRW_REGISTER_TYPE_UD); - ibld.MUL(inst->dst, inst->src[0], - ud ? brw_imm_uw(inst->src[1].ud) - : brw_imm_w(inst->src[1].d)); - } - } else { - /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot - * do 32-bit integer multiplication in one instruction, but instead - * must do a sequence (which actually calculates a 64-bit result): - * - * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D - * mach(8) null g3<8,8,1>D g4<8,8,1>D - * mov(8) g2<1>D acc0<8,8,1>D - * - * But on Gen > 6, the ability to use second accumulator register - * (acc1) for non-float data types was removed, preventing a simple - * implementation in SIMD16. A 16-channel result can be calculated by - * executing the three instructions twice in SIMD8, once with quarter - * control of 1Q for the first eight channels and again with 2Q for - * the second eight channels. - * - * Which accumulator register is implicitly accessed (by AccWrEnable - * for instance) is determined by the quarter control. Unfortunately - * Ivybridge (and presumably Baytrail) has a hardware bug in which an - * implicit accumulator access by an instruction with 2Q will access - * acc1 regardless of whether the data type is usable in acc1. - * - * Specifically, the 2Q mach(8) writes acc1 which does not exist for - * integer data types. - * - * Since we only want the low 32-bits of the result, we can do two - * 32-bit x 16-bit multiplies (like the mul and mach are doing), and - * adjust the high result and add them (like the mach is doing): - * - * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW - * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW - * shl(8) g9<1>D g8<8,8,1>D 16D - * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D - * - * We avoid the shl instruction by realizing that we only want to add - * the low 16-bits of the "high" result to the high 16-bits of the - * "low" result and using proper regioning on the add: - * - * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW - * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW - * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW - * - * Since it does not use the (single) accumulator register, we can - * schedule multi-component multiplications much better. - */ - - bool needs_mov = false; - fs_reg orig_dst = inst->dst; - - /* Get a new VGRF for the "low" 32x16-bit multiplication result if - * reusing the original destination is impossible due to hardware - * restrictions, source/destination overlap, or it being the null - * register. - */ - fs_reg low = inst->dst; - if (orig_dst.is_null() || orig_dst.file == MRF || - regions_overlap(inst->dst, inst->size_written, - inst->src[0], inst->size_read(0)) || - regions_overlap(inst->dst, inst->size_written, - inst->src[1], inst->size_read(1)) || - inst->dst.stride >= 4) { - needs_mov = true; - low = fs_reg(VGRF, alloc.allocate(regs_written(inst)), - inst->dst.type); - } - - /* Get a new VGRF but keep the same stride as inst->dst */ - fs_reg high(VGRF, alloc.allocate(regs_written(inst)), - inst->dst.type); - high.stride = inst->dst.stride; - high.offset = inst->dst.offset % REG_SIZE; - - if (devinfo->gen >= 7) { - if (inst->src[1].abs) - lower_src_modifiers(this, block, inst, 1); - - if (inst->src[1].file == IMM) { - ibld.MUL(low, inst->src[0], - brw_imm_uw(inst->src[1].ud & 0xffff)); - ibld.MUL(high, inst->src[0], - brw_imm_uw(inst->src[1].ud >> 16)); - } else { - ibld.MUL(low, inst->src[0], - subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0)); - ibld.MUL(high, inst->src[0], - subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 1)); - } - } else { - if (inst->src[0].abs) - lower_src_modifiers(this, block, inst, 0); - - ibld.MUL(low, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 0), - inst->src[1]); - ibld.MUL(high, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 1), - inst->src[1]); - } - - ibld.ADD(subscript(low, BRW_REGISTER_TYPE_UW, 1), - subscript(low, BRW_REGISTER_TYPE_UW, 1), - subscript(high, BRW_REGISTER_TYPE_UW, 0)); - - if (needs_mov || inst->conditional_mod) { - set_condmod(inst->conditional_mod, - ibld.MOV(orig_dst, low)); - } - } - + lower_mul_dword_inst(inst, block, ibld); } else if (inst->opcode == SHADER_OPCODE_MULH) { - /* According to the BDW+ BSpec page for the "Multiply Accumulate - * High" instruction: - * - * "An added preliminary mov is required for source modification on - * src1: - * mov (8) r3.0<1>:d -r3<8;8,1>:d - * mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw - * mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d" - */ - if (devinfo->gen >= 8 && (inst->src[1].negate || inst->src[1].abs)) - lower_src_modifiers(this, block, inst, 1); - - /* Should have been lowered to 8-wide. */ - assert(inst->exec_size <= get_lowered_simd_width(devinfo, inst)); - const fs_reg acc = retype(brw_acc_reg(inst->exec_size), - inst->dst.type); - fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]); - fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]); - - if (devinfo->gen >= 8) { - /* Until Gen8, integer multiplies read 32-bits from one source, - * and 16-bits from the other, and relying on the MACH instruction - * to generate the high bits of the result. - * - * On Gen8, the multiply instruction does a full 32x32-bit - * multiply, but in order to do a 64-bit multiply we can simulate - * the previous behavior and then use a MACH instruction. - */ - assert(mul->src[1].type == BRW_REGISTER_TYPE_D || - mul->src[1].type == BRW_REGISTER_TYPE_UD); - mul->src[1].type = BRW_REGISTER_TYPE_UW; - mul->src[1].stride *= 2; - - if (mul->src[1].file == IMM) { - mul->src[1] = brw_imm_uw(mul->src[1].ud); - } - } else if (devinfo->gen == 7 && !devinfo->is_haswell && - inst->group > 0) { - /* Among other things the quarter control bits influence which - * accumulator register is used by the hardware for instructions - * that access the accumulator implicitly (e.g. MACH). A - * second-half instruction would normally map to acc1, which - * doesn't exist on Gen7 and up (the hardware does emulate it for - * floating-point instructions *only* by taking advantage of the - * extra precision of acc0 not normally used for floating point - * arithmetic). - * - * HSW and up are careful enough not to try to access an - * accumulator register that doesn't exist, but on earlier Gen7 - * hardware we need to make sure that the quarter control bits are - * zero to avoid non-deterministic behaviour and emit an extra MOV - * to get the result masked correctly according to the current - * channel enables. - */ - mach->group = 0; - mach->force_writemask_all = true; - mach->dst = ibld.vgrf(inst->dst.type); - ibld.MOV(inst->dst, mach->dst); - } + lower_mulh_inst(inst, block, ibld); } else { continue; } diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 1dde4c9c7d2..82d572916ce 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -406,6 +406,10 @@ private: void resolve_inot_sources(const brw::fs_builder &bld, nir_alu_instr *instr, fs_reg *op); + void lower_mul_dword_inst(fs_inst *inst, bblock_t *block, + const brw::fs_builder &ibld); + void lower_mulh_inst(fs_inst *inst, bblock_t *block, + const brw::fs_builder &ibld); }; /** -- 2.30.2