X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=blobdiff_plain;f=src%2Famd%2Fcompiler%2Faco_optimizer.cpp;h=c1f0bc20f9e41a38acd841cda76952346c1d235c;hp=5e68e3f3b4328a807589dea96e8090bbc725ccc0;hb=3af2b9e3de2ba9878c03b192d3b1574172d1923d;hpb=3c1b55962e0da924bc96a16689c9421888891959 diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 5e68e3f3b43..c1f0bc20f9e 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -52,7 +52,7 @@ namespace aco { struct mad_info { aco_ptr add_instr; uint32_t mul_temp_id; - uint32_t literal_idx; + uint16_t literal_idx; bool check_literal; mad_info(aco_ptr instr, uint32_t id) @@ -83,7 +83,7 @@ enum Label { label_add_sub = 1 << 17, label_bitwise = 1 << 18, label_minmax = 1 << 19, - label_fcmp = 1 << 20, + label_vopc = 1 << 20, label_uniform_bool = 1 << 21, label_constant_64bit = 1 << 22, label_uniform_bitwise = 1 << 23, @@ -94,19 +94,19 @@ enum Label { label_constant_16bit = 1 << 29, }; -static constexpr uint32_t instr_labels = label_vec | label_mul | label_mad | label_omod_success | label_clamp_success | - label_add_sub | label_bitwise | label_uniform_bitwise | label_minmax | label_fcmp; -static constexpr uint32_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f | label_uniform_bool | +static constexpr uint64_t instr_labels = label_vec | label_mul | label_mad | label_omod_success | label_clamp_success | + label_add_sub | label_bitwise | label_uniform_bitwise | label_minmax | label_vopc; +static constexpr uint64_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f | label_uniform_bool | label_omod2 | label_omod4 | label_omod5 | label_clamp | label_scc_invert | label_b2i; -static constexpr uint32_t val_labels = label_constant_32bit | label_constant_64bit | label_constant_16bit | label_literal | label_mad; +static constexpr uint32_t val_labels = label_constant_32bit | label_constant_64bit | label_constant_16bit | label_literal; struct ssa_info { - uint32_t val; + uint64_t label; union { + uint32_t val; Temp temp; Instruction* instr; }; - uint32_t label; ssa_info() : label(0) {} @@ -116,18 +116,21 @@ struct ssa_info { * (indicating the defining instruction), there is no need to clear * any other instr labels. */ if (new_label & instr_labels) - label &= ~temp_labels; /* instr and temp alias */ + label &= ~(temp_labels | val_labels); /* instr, temp and val alias */ if (new_label & temp_labels) { label &= ~temp_labels; - label &= ~instr_labels; /* instr and temp alias */ + label &= ~(instr_labels | val_labels); /* instr, temp and val alias */ } uint32_t const_labels = label_literal | label_constant_32bit | label_constant_64bit | label_constant_16bit; - if (new_label & const_labels) + if (new_label & const_labels) { label &= ~val_labels | const_labels; - else if (new_label & val_labels) + label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */ + } else if (new_label & val_labels) { label &= ~val_labels; + label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */ + } label |= new_label; } @@ -277,7 +280,7 @@ struct ssa_info { void set_mad(Instruction* mad, uint32_t mad_info_idx) { add_label(label_mad); - val = mad_info_idx; + mad->pass_flags = mad_info_idx; instr = mad; } @@ -427,15 +430,15 @@ struct ssa_info { return label & label_minmax; } - void set_fcmp(Instruction *fcmp_instr) + void set_vopc(Instruction *vopc_instr) { - add_label(label_fcmp); - instr = fcmp_instr; + add_label(label_vopc); + instr = vopc_instr; } - bool is_fcmp() + bool is_vopc() { - return label & label_fcmp; + return label & label_vopc; } void set_scc_needed() @@ -502,6 +505,18 @@ struct opt_ctx { std::vector uses; }; +struct CmpInfo { + aco_opcode ordered; + aco_opcode unordered; + aco_opcode ordered_swapped; + aco_opcode unordered_swapped; + aco_opcode inverse; + aco_opcode f32; + unsigned size; +}; + +ALWAYS_INLINE bool get_cmp_info(aco_opcode op, CmpInfo *info); + bool can_swap_operands(aco_ptr& instr) { if (instr->operands[0].isConstant() || @@ -509,6 +524,10 @@ bool can_swap_operands(aco_ptr& instr) return false; switch (instr->opcode) { + case aco_opcode::v_add_u32: + case aco_opcode::v_add_co_u32: + case aco_opcode::v_add_co_u32_e64: + case aco_opcode::v_add_i32: case aco_opcode::v_add_f16: case aco_opcode::v_add_f32: case aco_opcode::v_mul_f16: @@ -524,10 +543,14 @@ bool can_swap_operands(aco_ptr& instr) case aco_opcode::v_min_i32: case aco_opcode::v_max_u32: case aco_opcode::v_min_u32: - case aco_opcode::v_cmp_eq_f16: - case aco_opcode::v_cmp_eq_f32: - case aco_opcode::v_cmp_lg_f16: - case aco_opcode::v_cmp_lg_f32: + case aco_opcode::v_max_i16: + case aco_opcode::v_min_i16: + case aco_opcode::v_max_u16: + case aco_opcode::v_min_u16: + case aco_opcode::v_max_i16_e64: + case aco_opcode::v_min_i16_e64: + case aco_opcode::v_max_u16_e64: + case aco_opcode::v_min_u16_e64: return true; case aco_opcode::v_sub_f16: instr->opcode = aco_opcode::v_subrev_f16; @@ -535,24 +558,29 @@ bool can_swap_operands(aco_ptr& instr) case aco_opcode::v_sub_f32: instr->opcode = aco_opcode::v_subrev_f32; return true; - case aco_opcode::v_cmp_lt_f16: - instr->opcode = aco_opcode::v_cmp_gt_f16; - return true; - case aco_opcode::v_cmp_lt_f32: - instr->opcode = aco_opcode::v_cmp_gt_f32; - return true; - case aco_opcode::v_cmp_ge_f16: - instr->opcode = aco_opcode::v_cmp_le_f16; + case aco_opcode::v_sub_co_u32: + instr->opcode = aco_opcode::v_subrev_co_u32; return true; - case aco_opcode::v_cmp_ge_f32: - instr->opcode = aco_opcode::v_cmp_le_f32; + case aco_opcode::v_sub_u16: + instr->opcode = aco_opcode::v_subrev_u16; return true; - case aco_opcode::v_cmp_lt_i32: - instr->opcode = aco_opcode::v_cmp_gt_i32; + case aco_opcode::v_sub_u32: + instr->opcode = aco_opcode::v_subrev_u32; return true; - default: + default: { + CmpInfo info; + get_cmp_info(instr->opcode, &info); + if (info.ordered == instr->opcode) { + instr->opcode = info.ordered_swapped; + return true; + } + if (info.unordered == instr->opcode) { + instr->opcode = info.unordered_swapped; + return true; + } return false; } + } } bool can_use_VOP3(opt_ctx& ctx, const aco_ptr& instr) @@ -687,7 +715,7 @@ bool check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand *operands) return true; } -bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset) +bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset, bool prevent_overflow) { Operand op = instr->operands[op_index]; @@ -709,6 +737,8 @@ bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp default: return false; } + if (prevent_overflow && !add_instr->definitions[0].isNUW()) + return false; if (add_instr->usesModifiers()) return false; @@ -726,7 +756,7 @@ bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp continue; uint32_t offset2 = 0; - if (parse_base_offset(ctx, add_instr, !i, base, &offset2)) { + if (parse_base_offset(ctx, add_instr, !i, base, &offset2, prevent_overflow)) { *offset += offset2; } else { *base = add_instr->operands[!i].getTemp(); @@ -899,6 +929,15 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) while (info.is_temp()) info = ctx.info[info.temp.id()]; + /* According to AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(), vaddr + * overflow for scratch accesses works only on GFX9+ and saddr overflow + * never works. Since swizzling is the only thing that separates + * scratch accesses and other accesses and swizzling changing how + * addressing works significantly, this probably applies to swizzled + * MUBUF accesses. */ + bool vaddr_prevent_overflow = mubuf->swizzled && ctx.program->chip_class < GFX9; + bool saddr_prevent_overflow = mubuf->swizzled; + if (mubuf->offen && i == 1 && info.is_constant_or_literal(32) && mubuf->offset + info.val < 4096) { assert(!mubuf->idxen); instr->operands[1] = Operand(v1); @@ -909,12 +948,14 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) instr->operands[2] = Operand((uint32_t) 0); mubuf->offset += info.val; continue; - } else if (mubuf->offen && i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == v1 && mubuf->offset + offset < 4096) { + } else if (mubuf->offen && i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, vaddr_prevent_overflow) && + base.regClass() == v1 && mubuf->offset + offset < 4096) { assert(!mubuf->idxen); instr->operands[1].setTemp(base); mubuf->offset += offset; continue; - } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && mubuf->offset + offset < 4096) { + } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset, saddr_prevent_overflow) && + base.regClass() == s1 && mubuf->offset + offset < 4096) { instr->operands[i].setTemp(base); mubuf->offset += offset; continue; @@ -929,7 +970,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) uint32_t offset; bool has_usable_ds_offset = ctx.program->chip_class >= GFX7; if (has_usable_ds_offset && - i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && + i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset, false) && base.regClass() == instr->operands[i].regClass() && instr->opcode != aco_opcode::ds_swizzle_b32) { if (instr->opcode == aco_opcode::ds_write2_b32 || instr->opcode == aco_opcode::ds_read2_b32 || @@ -959,13 +1000,14 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) SMEM_instruction *smem = static_cast(instr.get()); Temp base; uint32_t offset; + bool prevent_overflow = smem->operands[0].size() > 2 || smem->prevent_overflow; if (i == 1 && info.is_constant_or_literal(32) && ((ctx.program->chip_class == GFX6 && info.val <= 0x3FF) || (ctx.program->chip_class == GFX7 && info.val <= 0xFFFFFFFF) || (ctx.program->chip_class >= GFX8 && info.val <= 0xFFFFF))) { instr->operands[i] = Operand(info.val); continue; - } else if (i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) { + } else if (i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, prevent_overflow) && base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) { bool soe = smem->operands.size() >= (!smem->definitions.empty() ? 3 : 4); if (soe && (!ctx.info[smem->operands.back().tempId()].is_constant_or_literal(32) || @@ -984,8 +1026,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) new_instr->operands.back() = Operand(base); if (!smem->definitions.empty()) new_instr->definitions[0] = smem->definitions[0]; - new_instr->can_reorder = smem->can_reorder; - new_instr->barrier = smem->barrier; + new_instr->sync = smem->sync; new_instr->glc = smem->glc; new_instr->dlc = smem->dlc; new_instr->nv = smem->nv; @@ -1010,6 +1051,11 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) if (instr->definitions.empty()) return; + if ((uint16_t) instr->format & (uint16_t) Format::VOPC) { + ctx.info[instr->definitions[0].tempId()].set_vopc(instr.get()); + return; + } + switch (instr->opcode) { case aco_opcode::p_create_vector: { bool copy_prop = instr->operands.size() == 1 && instr->operands[0].isTemp() && @@ -1327,6 +1373,14 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) ctx.info[instr->definitions[1].tempId()].set_temp(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp()); ctx.info[instr->definitions[0].tempId()].set_uniform_bool(ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp()); break; + } else if (ctx.info[instr->operands[0].tempId()].is_vopc()) { + Instruction* vopc_instr = ctx.info[instr->operands[0].tempId()].instr; + /* Remove superfluous s_and when the VOPC instruction uses the same exec and thus already produces the same result */ + if (vopc_instr->pass_flags == instr->pass_flags) { + assert(instr->pass_flags > 0); + ctx.info[instr->definitions[0].tempId()].set_temp(vopc_instr->definitions[0].getTemp()); + break; + } } } /* fallthrough */ @@ -1359,28 +1413,6 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) case aco_opcode::v_max_i16: ctx.info[instr->definitions[0].tempId()].set_minmax(instr.get()); break; - #define CMP(cmp) \ - case aco_opcode::v_cmp_##cmp##_f16:\ - case aco_opcode::v_cmp_##cmp##_f32:\ - case aco_opcode::v_cmp_##cmp##_f64:\ - case aco_opcode::v_cmp_n##cmp##_f16:\ - case aco_opcode::v_cmp_n##cmp##_f32:\ - case aco_opcode::v_cmp_n##cmp##_f64: - CMP(lt) - CMP(eq) - CMP(le) - CMP(gt) - CMP(lg) - CMP(ge) - case aco_opcode::v_cmp_o_f16: - case aco_opcode::v_cmp_u_f16: - case aco_opcode::v_cmp_o_f32: - case aco_opcode::v_cmp_u_f32: - case aco_opcode::v_cmp_o_f64: - case aco_opcode::v_cmp_u_f64: - #undef CMP - ctx.info[instr->definitions[0].tempId()].set_fcmp(instr.get()); - break; case aco_opcode::s_cselect_b64: case aco_opcode::s_cselect_b32: if (instr->operands[0].constantEquals((unsigned) -1) && @@ -1405,38 +1437,34 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) } } -struct CmpInfo { - aco_opcode ordered; - aco_opcode unordered; - aco_opcode inverse; - aco_opcode f32; - unsigned size; -}; - ALWAYS_INLINE bool get_cmp_info(aco_opcode op, CmpInfo *info) { info->ordered = aco_opcode::num_opcodes; info->unordered = aco_opcode::num_opcodes; + info->ordered_swapped = aco_opcode::num_opcodes; + info->unordered_swapped = aco_opcode::num_opcodes; switch (op) { - #define CMP2(ord, unord, sz) \ + #define CMP2(ord, unord, ord_swap, unord_swap, sz) \ case aco_opcode::v_cmp_##ord##_f##sz:\ case aco_opcode::v_cmp_n##unord##_f##sz:\ info->ordered = aco_opcode::v_cmp_##ord##_f##sz;\ info->unordered = aco_opcode::v_cmp_n##unord##_f##sz;\ + info->ordered_swapped = aco_opcode::v_cmp_##ord_swap##_f##sz;\ + info->unordered_swapped = aco_opcode::v_cmp_n##unord_swap##_f##sz;\ info->inverse = op == aco_opcode::v_cmp_n##unord##_f##sz ? aco_opcode::v_cmp_##unord##_f##sz : aco_opcode::v_cmp_n##ord##_f##sz;\ info->f32 = op == aco_opcode::v_cmp_##ord##_f##sz ? aco_opcode::v_cmp_##ord##_f32 : aco_opcode::v_cmp_n##unord##_f32;\ info->size = sz;\ return true; - #define CMP(ord, unord) \ - CMP2(ord, unord, 16)\ - CMP2(ord, unord, 32)\ - CMP2(ord, unord, 64) - CMP(lt, /*n*/ge) - CMP(eq, /*n*/lg) - CMP(le, /*n*/gt) - CMP(gt, /*n*/le) - CMP(lg, /*n*/eq) - CMP(ge, /*n*/lt) + #define CMP(ord, unord, ord_swap, unord_swap) \ + CMP2(ord, unord, ord_swap, unord_swap, 16)\ + CMP2(ord, unord, ord_swap, unord_swap, 32)\ + CMP2(ord, unord, ord_swap, unord_swap, 64) + CMP(lt, /*n*/ge, gt, /*n*/le) + CMP(eq, /*n*/lg, eq, /*n*/lg) + CMP(le, /*n*/gt, ge, /*n*/lt) + CMP(gt, /*n*/le, lt, /*n*/le) + CMP(lg, /*n*/eq, lg, /*n*/eq) + CMP(ge, /*n*/lt, le, /*n*/gt) #undef CMP #undef CMP2 #define ORD_TEST(sz) \ @@ -1622,7 +1650,7 @@ bool combine_ordering_test(opt_ctx &ctx, aco_ptr& instr) new_instr->definitions[0] = instr->definitions[0]; ctx.info[instr->definitions[0].tempId()].label = 0; - ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr); + ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr); instr.reset(new_instr); @@ -1692,7 +1720,7 @@ bool combine_comparison_ordering(opt_ctx &ctx, aco_ptr& instr) new_instr->definitions[0] = instr->definitions[0]; ctx.info[instr->definitions[0].tempId()].label = 0; - ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr); + ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr); instr.reset(new_instr); @@ -1795,7 +1823,7 @@ bool combine_constant_comparison_ordering(opt_ctx &ctx, aco_ptr& in new_instr->definitions[0] = instr->definitions[0]; ctx.info[instr->definitions[0].tempId()].label = 0; - ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr); + ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr); instr.reset(new_instr); @@ -1844,7 +1872,7 @@ bool combine_inverse_comparison(opt_ctx &ctx, aco_ptr& instr) new_instr->definitions[0] = instr->definitions[0]; ctx.info[instr->definitions[0].tempId()].label = 0; - ctx.info[instr->definitions[0].tempId()].set_fcmp(new_instr); + ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr); instr.reset(new_instr); @@ -1942,7 +1970,7 @@ void create_vop3_for_op3(opt_ctx& ctx, aco_opcode opcode, aco_ptr& bool combine_three_valu_op(opt_ctx& ctx, aco_ptr& instr, aco_opcode op2, aco_opcode new_op, const char *shuffle, uint8_t ops) { - uint32_t omod_clamp = ctx.info[instr->definitions[0].tempId()].label & + uint64_t omod_clamp = ctx.info[instr->definitions[0].tempId()].label & (label_omod_success | label_clamp_success); for (unsigned swap = 0; swap < 2; swap++) { @@ -1973,7 +2001,7 @@ bool combine_minmax(opt_ctx& ctx, aco_ptr& instr, aco_opcode opposi if (combine_three_valu_op(ctx, instr, instr->opcode, minmax3, "012", 1 | 2)) return true; - uint32_t omod_clamp = ctx.info[instr->definitions[0].tempId()].label & + uint64_t omod_clamp = ctx.info[instr->definitions[0].tempId()].label & (label_omod_success | label_clamp_success); /* min(-max(a, b), c) -> min3(-a, -b, c) * @@ -2227,7 +2255,7 @@ bool combine_clamp(opt_ctx& ctx, aco_ptr& instr, else return false; - uint32_t omod_clamp = ctx.info[instr->definitions[0].tempId()].label & + uint64_t omod_clamp = ctx.info[instr->definitions[0].tempId()].label & (label_omod_success | label_clamp_success); for (unsigned swap = 0; swap < 2; swap++) { @@ -2422,7 +2450,7 @@ bool apply_omod_clamp(opt_ctx &ctx, Block& block, aco_ptr& instr) /* omod was successfully applied */ /* if the omod instruction is v_mad, we also have to change the original add */ if (ctx.info[instr->operands[idx].tempId()].is_mad()) { - Instruction* add_instr = ctx.mad_infos[ctx.info[instr->operands[idx].tempId()].val].add_instr.get(); + Instruction* add_instr = ctx.mad_infos[ctx.info[instr->operands[idx].tempId()].instr->pass_flags].add_instr.get(); if (ctx.info[instr->definitions[0].tempId()].is_clamp()) static_cast(add_instr)->clamp = true; add_instr->definitions[0] = instr->definitions[0]; @@ -2468,7 +2496,7 @@ bool apply_omod_clamp(opt_ctx &ctx, Block& block, aco_ptr& instr) /* clamp was successfully applied */ /* if the clamp instruction is v_mad, we also have to change the original add */ if (ctx.info[instr->operands[idx].tempId()].is_mad()) { - Instruction* add_instr = ctx.mad_infos[ctx.info[instr->operands[idx].tempId()].val].add_instr.get(); + Instruction* add_instr = ctx.mad_infos[ctx.info[instr->operands[idx].tempId()].instr->pass_flags].add_instr.get(); add_instr->definitions[0] = instr->definitions[0]; } Instruction* clamp_instr = ctx.info[instr->operands[idx].tempId()].instr; @@ -2890,7 +2918,7 @@ void select_instruction(opt_ctx &ctx, aco_ptr& instr) mad_info* mad_info = NULL; if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) { - mad_info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].val]; + mad_info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].instr->pass_flags]; /* re-check mad instructions */ if (ctx.uses[mad_info->mul_temp_id]) { ctx.uses[mad_info->mul_temp_id]++; @@ -3072,7 +3100,7 @@ void apply_literals(opt_ctx &ctx, aco_ptr& instr) /* apply literals on MAD */ if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) { - mad_info* info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].val]; + mad_info* info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].instr->pass_flags]; if (info->check_literal && (ctx.uses[instr->operands[info->literal_idx].tempId()] == 0 || info->literal_idx == 2)) { aco_ptr new_mad;