From: Rhys Perry Date: Thu, 14 May 2020 20:09:36 +0000 (+0100) Subject: aco: create 16-bit mad/fma X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=7f511efa16adb5e820c4535473de9bfb59f5e470;p=mesa.git aco: create 16-bit mad/fma fossil-db (Navi, fp16 enabled): Totals from 1 (0.00% of 127638) affected shaders: CodeSize: 4868 -> 4552 (-6.49%) Instrs: 956 -> 863 (-9.73%) Cycles: 3824 -> 3452 (-9.73%) VMEM: 504 -> 490 (-2.78%) SMEM: 109 -> 107 (-1.83%) VClause: 19 -> 20 (+5.26%) Copies: 54 -> 58 (+7.41%) PreVGPRs: 43 -> 41 (-4.65%) Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 67d18231319..f56ef5f5170 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -1103,6 +1103,10 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) } break; } + case aco_opcode::v_mul_f16: { + ctx.info[instr->definitions[0].tempId()].set_mul(instr.get()); + break; + } case aco_opcode::v_and_b32: /* abs */ if (!instr->usesModifiers() && instr->operands[0].constantEquals(0x7FFFFFFF) && instr->operands[1].isTemp() && instr->operands[1].getTemp().type() == RegType::vgpr) @@ -2415,11 +2419,15 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr bool mad32 = instr->opcode == aco_opcode::v_add_f32 || instr->opcode == aco_opcode::v_sub_f32 || instr->opcode == aco_opcode::v_subrev_f32; - if (mad32) { - bool need_fma = block.fp_mode.denorm32 != 0; + bool mad16 = instr->opcode == aco_opcode::v_add_f16 || + instr->opcode == aco_opcode::v_sub_f16 || + instr->opcode == aco_opcode::v_subrev_f16; + if (mad16 || mad32) { + bool need_fma = mad32 ? block.fp_mode.denorm32 != 0 : + (block.fp_mode.denorm16_64 != 0 || ctx.program->chip_class >= GFX10); if (need_fma && instr->definitions[0].isPrecise()) return; - if (need_fma && !ctx.program->has_fast_fma32) + if (need_fma && mad32 && !ctx.program->has_fast_fma32) return; uint32_t uses_src0 = UINT32_MAX; @@ -2500,12 +2508,15 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr /* neg of the multiplication result */ neg[1] = neg[1] ^ vop3->neg[1 - add_op_idx]; } - if (instr->opcode == aco_opcode::v_sub_f32) + if (instr->opcode == aco_opcode::v_sub_f32 || instr->opcode == aco_opcode::v_sub_f16) neg[1 + add_op_idx] = neg[1 + add_op_idx] ^ true; - else if (instr->opcode == aco_opcode::v_subrev_f32) + else if (instr->opcode == aco_opcode::v_subrev_f32 || instr->opcode == aco_opcode::v_subrev_f16) neg[2 - add_op_idx] = neg[2 - add_op_idx] ^ true; aco_opcode mad_op = need_fma ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32; + if (mad16) + mad_op = need_fma ? (ctx.program->chip_class == GFX8 ? aco_opcode::v_fma_legacy_f16 : aco_opcode::v_fma_f16) : + (ctx.program->chip_class == GFX8 ? aco_opcode::v_mad_legacy_f16 : aco_opcode::v_mad_f16); aco_ptr mad{create_instruction(mad_op, Format::VOP3A, 3, 1)}; for (unsigned i = 0; i < 3; i++) @@ -2730,7 +2741,8 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) /* check literals */ else if (!instr->usesModifiers()) { /* FMA can only take literals on GFX10+ */ - if (instr->opcode == aco_opcode::v_fma_f32 && ctx.program->chip_class < GFX10) + if ((instr->opcode == aco_opcode::v_fma_f32 || instr->opcode == aco_opcode::v_fma_f16) && + ctx.program->chip_class < GFX10) return; bool sgpr_used = false; @@ -2903,6 +2915,10 @@ void apply_literals(opt_ctx &ctx, aco_ptr& instr) aco_opcode new_op = info->literal_idx == 2 ? aco_opcode::v_madak_f32 : aco_opcode::v_madmk_f32; if (instr->opcode == aco_opcode::v_fma_f32) new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_fmamk_f32; + else if (instr->opcode == aco_opcode::v_mad_f16 || instr->opcode == aco_opcode::v_mad_legacy_f16) + new_op = info->literal_idx == 2 ? aco_opcode::v_madak_f16 : aco_opcode::v_madmk_f16; + else if (instr->opcode == aco_opcode::v_fma_f16) + new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f16 : aco_opcode::v_fmamk_f16; new_mad.reset(create_instruction(new_op, Format::VOP2, 3, 1)); if (info->literal_idx == 2) { /* add literal -> madak */ diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index a824e8b546c..505f5cb613d 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -1735,7 +1735,10 @@ void register_allocation(Program *program, std::vector& live_out_per_bl if (!def.isFixed() && instr->opcode == aco_opcode::p_parallelcopy) op = instr->operands[i]; else if ((instr->opcode == aco_opcode::v_mad_f32 || - (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10)) && !instr->usesModifiers()) + (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10) || + instr->opcode == aco_opcode::v_mad_f16 || + instr->opcode == aco_opcode::v_mad_legacy_f16 || + (instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10)) && !instr->usesModifiers()) op = instr->operands[2]; if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) { @@ -2011,13 +2014,19 @@ void register_allocation(Program *program, std::vector& live_out_per_bl /* try to optimize v_mad_f32 -> v_mac_f32 */ if ((instr->opcode == aco_opcode::v_mad_f32 || - (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10)) && + (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10) || + instr->opcode == aco_opcode::v_mad_f16 || + instr->opcode == aco_opcode::v_mad_legacy_f16 || + (instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10)) && instr->operands[2].isTemp() && instr->operands[2].isKillBeforeDef() && instr->operands[2].getTemp().type() == RegType::vgpr && instr->operands[1].isTemp() && instr->operands[1].getTemp().type() == RegType::vgpr && - !instr->usesModifiers()) { + !instr->usesModifiers() && + instr->operands[0].physReg().byte() == 0 && + instr->operands[1].physReg().byte() == 0 && + instr->operands[2].physReg().byte() == 0) { unsigned def_id = instr->definitions[0].tempId(); auto it = ctx.affinities.find(def_id); if (it == ctx.affinities.end() || !ctx.assignments[it->second].assigned || @@ -2031,6 +2040,13 @@ void register_allocation(Program *program, std::vector& live_out_per_bl case aco_opcode::v_fma_f32: instr->opcode = aco_opcode::v_fmac_f32; break; + case aco_opcode::v_mad_f16: + case aco_opcode::v_mad_legacy_f16: + instr->opcode = aco_opcode::v_mac_f16; + break; + case aco_opcode::v_fma_f16: + instr->opcode = aco_opcode::v_fmac_f16; + break; default: break; } @@ -2041,6 +2057,8 @@ void register_allocation(Program *program, std::vector& live_out_per_bl if (instr->opcode == aco_opcode::v_interp_p2_f32 || instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_fmac_f32 || + instr->opcode == aco_opcode::v_mac_f16 || + instr->opcode == aco_opcode::v_fmac_f16 || instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64) { instr->definitions[0].setFixed(instr->operands[2].physReg());