From 866bb775de7942ee120fb1f33c7a92ce47f8904b Mon Sep 17 00:00:00 2001
From: Paulo Zanoni
Date: Thu, 11 Jul 2019 15:08:03 -0700
Subject: [PATCH] intel/fs: add 64 bit integer multiplication lowering

While NIR's lower_imul64() solves the case of 64 bit integer
multiplications generated early, we don't have a way to lower such
instructions when they are generated by our own backend, such as the
scan/reduce intrinsics. We'll need this soon, so implement it now.

An easy way to test this is to simply disable nir_lower_imul64 to let
those operations reach the backend.

v2:
 - Fix Q/UQ copy/paste errors (Caio).
 - Transform an 'if' into 'else if' (Caio).
 - Add an extra comment to clarify the need for 64b = 32b * 32b
   (Caio).
 - Make private functions private (Caio).
v3:
 - Remove ambiguity with 'b' and 'd' variables (Caio).
 - Allocate potentially less regs for the dwords (Caio).

Cc: Jason Ekstrand
Cc: Matt Turner
Cc: Caio Marcelo de Oliveira Filho
Reviewed-by: Matt Turner
Reviewed-by: Caio Marcelo de Oliveira Filho
Signed-off-by: Paulo Zanoni
---
 src/intel/compiler/brw_fs.cpp | 73 +++++++++++++++++++++++++++++++++--
 src/intel/compiler/brw_fs.h   |  1 +
 2 files changed, 70 insertions(+), 4 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index ccf6c955202..a4ebae336f5 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -3990,6 +3990,62 @@ fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block)
    }
 }
 
+void
+fs_visitor::lower_mul_qword_inst(fs_inst *inst, bblock_t *block)
+{
+   const fs_builder ibld(this, block, inst);
+
+   /* Considering two 64-bit integers ab and cd where each letter        ab
+    * corresponds to 32 bits, we get a 128-bit result WXYZ. We         * cd
+    * only need to provide the YZ part of the result.               -------
+    *                                                                    BD
+    * Only BD needs to be 64 bits. For AD and BC we only care        +  AD
+    * about the lower 32 bits (since they are part of the upper      +  BC
+    * 32 bits of our result). AC is not needed since it starts       + AC
+    * on the 65th bit of the result.                                -------
+    *                                                                  WXYZ
+    */
+   unsigned int q_regs = regs_written(inst);
+   unsigned int d_regs = (q_regs + 1) / 2;
+
+   fs_reg bd(VGRF, alloc.allocate(q_regs), BRW_REGISTER_TYPE_UQ);
+   fs_reg ad(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
+   fs_reg bc(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
+
+   /* Here we need the full 64 bit result for 32b * 32b. */
+   if (devinfo->has_integer_dword_mul) {
+      ibld.MUL(bd, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
+               subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
+   } else {
+      fs_reg bd_high(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
+      fs_reg bd_low(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
+      fs_reg acc = retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD);
+
+      fs_inst *mul = ibld.MUL(acc,
+                            subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
+                            subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0));
+      mul->writes_accumulator = true;
+
+      ibld.MACH(bd_high, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
+                subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
+      ibld.MOV(bd_low, acc);
+
+      ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 0), bd_low);
+      ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 1), bd_high);
+   }
+
+   ibld.MUL(ad, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1),
+            subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
+   ibld.MUL(bc, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
+            subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 1));
+
+   ibld.ADD(ad, ad, bc);
+   ibld.ADD(subscript(bd, BRW_REGISTER_TYPE_UD, 1),
+            subscript(bd, BRW_REGISTER_TYPE_UD, 1), ad);
+
+   ibld.MOV(inst->dst, bd);
+}
+
 void
 fs_visitor::lower_mulh_inst(fs_inst *inst, bblock_t *block)
 {
@@ -4062,10 +4118,19 @@ fs_visitor::lower_integer_multiplication()
 
    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
       if (inst->opcode == BRW_OPCODE_MUL) {
-         if (!inst->dst.is_accumulator() &&
-             (inst->dst.type == BRW_REGISTER_TYPE_D ||
-              inst->dst.type == BRW_REGISTER_TYPE_UD) &&
-             !devinfo->has_integer_dword_mul) {
+         if ((inst->dst.type == BRW_REGISTER_TYPE_Q ||
+              inst->dst.type == BRW_REGISTER_TYPE_UQ) &&
+             (inst->src[0].type == BRW_REGISTER_TYPE_Q ||
+              inst->src[0].type == BRW_REGISTER_TYPE_UQ) &&
+             (inst->src[1].type == BRW_REGISTER_TYPE_Q ||
+              inst->src[1].type == BRW_REGISTER_TYPE_UQ)) {
+            lower_mul_qword_inst(inst, block);
+            inst->remove(block);
+            progress = true;
+         } else if (!inst->dst.is_accumulator() &&
+                    (inst->dst.type == BRW_REGISTER_TYPE_D ||
+                     inst->dst.type == BRW_REGISTER_TYPE_UD) &&
+                    !devinfo->has_integer_dword_mul) {
             lower_mul_dword_inst(inst, block);
             inst->remove(block);
             progress = true;
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index fb78fa829a0..a3604aef509 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -407,6 +407,7 @@ private:
    void resolve_inot_sources(const brw::fs_builder &bld, nir_alu_instr *instr,
                              fs_reg *op);
    void lower_mul_dword_inst(fs_inst *inst, bblock_t *block);
+   void lower_mul_qword_inst(fs_inst *inst, bblock_t *block);
    void lower_mulh_inst(fs_inst *inst, bblock_t *block);
 };
 
-- 
2.30.2