X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fcompiler%2Faco_opt_value_numbering.cpp;h=93668442d329848b223a347883ef38bd55e6bacf;hb=51bc11abc206ae5ea0946f5a79c68527701c24e0;hp=8071ace1f97449502f5c6f4327cb59adc7d0ced5;hpb=93c8ebfa780ebd1495095e794731881aef29e7d3;p=mesa.git diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp index 8071ace1f97..93668442d32 100644 --- a/src/amd/compiler/aco_opt_value_numbering.cpp +++ b/src/amd/compiler/aco_opt_value_numbering.cpp @@ -23,8 +23,7 @@ */ #include -#include - +#include #include "aco_ir.h" /* @@ -35,41 +34,89 @@ namespace aco { namespace { +inline +uint32_t murmur_32_scramble(uint32_t h, uint32_t k) { + k *= 0xcc9e2d51; + k = (k << 15) | (k >> 17); + h ^= k * 0x1b873593; + h = (h << 13) | (h >> 19); + h = h * 5 + 0xe6546b64; + return h; +} + +template +uint32_t hash_murmur_32(Instruction* instr) +{ + uint32_t hash = uint32_t(instr->format) << 16 | uint32_t(instr->opcode); + + for (const Operand& op : instr->operands) + hash = murmur_32_scramble(hash, op.constantValue()); + + /* skip format, opcode and pass_flags */ + for (unsigned i = 2; i < (sizeof(T) >> 2); i++) { + uint32_t u; + /* Accesses it though a byte array, so doesn't violate the strict aliasing rule */ + memcpy(&u, reinterpret_cast(instr) + i * 4, 4); + hash = murmur_32_scramble(hash, u); + } + + /* Finalize. */ + uint32_t len = instr->operands.size() + instr->definitions.size() + sizeof(T); + hash ^= len; + hash ^= hash >> 16; + hash *= 0x85ebca6b; + hash ^= hash >> 13; + hash *= 0xc2b2ae35; + hash ^= hash >> 16; + return hash; +} + struct InstrHash { + /* This hash function uses the Murmur3 algorithm written by Austin Appleby + * https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp + * + * In order to calculate the expression set, only the right-hand-side of an + * instruction is used for the hash, i.e. everything except the definitions. + */ std::size_t operator()(Instruction* instr) const { - uint64_t hash = (uint64_t) instr->opcode + (uint64_t) instr->format; - for (unsigned i = 0; i < instr->operands.size(); i++) { - Operand op = instr->operands[i]; - uint64_t val = op.isTemp() ? op.tempId() : op.isFixed() ? op.physReg() : op.constantValue(); - hash |= val << (i+1) * 8; - } - if (instr->isVOP3()) { - VOP3A_instruction* vop3 = static_cast(instr); - for (unsigned i = 0; i < 3; i++) { - hash ^= vop3->abs[i] << (i*3 + 0); - hash ^= vop3->opsel[i] << (i*3 + 1); - hash ^= vop3->neg[i] << (i*3 + 2); - } - hash ^= (vop3->clamp << 28) * 13; - hash += vop3->omod << 19; - } + if (instr->isVOP3()) + return hash_murmur_32(instr); + + if (instr->isDPP()) + return hash_murmur_32(instr); + + if (instr->isSDWA()) + return hash_murmur_32(instr); + switch (instr->format) { case Format::SMEM: - break; - case Format::VINTRP: { - Interp_instruction* interp = static_cast(instr); - hash ^= interp->attribute << 13; - hash ^= interp->component << 27; - break; - } + return hash_murmur_32(instr); + case Format::VINTRP: + return hash_murmur_32(instr); case Format::DS: - break; + return hash_murmur_32(instr); + case Format::SOPP: + return hash_murmur_32(instr); + case Format::SOPK: + return hash_murmur_32(instr); + case Format::EXP: + return hash_murmur_32(instr); + case Format::MUBUF: + return hash_murmur_32(instr); + case Format::MIMG: + return hash_murmur_32(instr); + case Format::MTBUF: + return hash_murmur_32(instr); + case Format::FLAT: + return hash_murmur_32(instr); + case Format::PSEUDO_BRANCH: + return hash_murmur_32(instr); + case Format::PSEUDO_REDUCTION: + return hash_murmur_32(instr); default: - break; + return hash_murmur_32(instr); } - - return hash; } }; @@ -98,11 +145,11 @@ struct InstrPred { else if (a->operands[i].isUndefined() ^ b->operands[i].isUndefined()) return false; if (a->operands[i].isFixed()) { - if (a->operands[i].physReg() == exec) - return false; if (!b->operands[i].isFixed()) return false; - if (!(a->operands[i].physReg() == b->operands[i].physReg())) + if (a->operands[i].physReg() != b->operands[i].physReg()) + return false; + if (a->operands[i].physReg() == exec && a->pass_flags != b->pass_flags) return false; } } @@ -116,28 +163,37 @@ struct InstrPred { if (a->definitions[i].isFixed()) { if (!b->definitions[i].isFixed()) return false; - if (!(a->definitions[i].physReg() == b->definitions[i].physReg())) + if (a->definitions[i].physReg() != b->definitions[i].physReg()) + return false; + if (a->definitions[i].physReg() == exec) return false; } } - if (a->format == Format::PSEUDO_BRANCH) + + if (a->opcode == aco_opcode::v_readfirstlane_b32) + return a->pass_flags == b->pass_flags; + + /* The results of VOPC depend on the exec mask if used for subgroup operations. */ + if ((uint32_t) a->format & (uint32_t) Format::VOPC && a->pass_flags != b->pass_flags) return false; + if (a->isVOP3()) { VOP3A_instruction* a3 = static_cast(a); VOP3A_instruction* b3 = static_cast(b); for (unsigned i = 0; i < 3; i++) { if (a3->abs[i] != b3->abs[i] || - a3->opsel[i] != b3->opsel[i] || a3->neg[i] != b3->neg[i]) return false; } return a3->clamp == b3->clamp && - a3->omod == b3->omod; + a3->omod == b3->omod && + a3->opsel == b3->opsel; } if (a->isDPP()) { DPP_instruction* aDPP = static_cast(a); DPP_instruction* bDPP = static_cast(b); - return aDPP->dpp_ctrl == bDPP->dpp_ctrl && + return aDPP->pass_flags == bDPP->pass_flags && + aDPP->dpp_ctrl == bDPP->dpp_ctrl && aDPP->bank_mask == bDPP->bank_mask && aDPP->row_mask == bDPP->row_mask && aDPP->bound_ctrl == bDPP->bound_ctrl && @@ -146,12 +202,22 @@ struct InstrPred { aDPP->neg[0] == bDPP->neg[0] && aDPP->neg[1] == bDPP->neg[1]; } + if (a->isSDWA()) { + SDWA_instruction* aSDWA = static_cast(a); + SDWA_instruction* bSDWA = static_cast(b); + return aSDWA->sel[0] == bSDWA->sel[0] && + aSDWA->sel[1] == bSDWA->sel[1] && + aSDWA->dst_sel == bSDWA->dst_sel && + aSDWA->abs[0] == bSDWA->abs[0] && + aSDWA->abs[1] == bSDWA->abs[1] && + aSDWA->neg[0] == bSDWA->neg[0] && + aSDWA->neg[1] == bSDWA->neg[1] && + aSDWA->dst_preserve == bSDWA->dst_preserve && + aSDWA->clamp == bSDWA->clamp && + aSDWA->omod == bSDWA->omod; + } + switch (a->format) { - case Format::VOPC: { - /* Since the results depend on the exec mask, these shouldn't - * be value numbered (this is especially useful for subgroupBallot()). */ - return false; - } case Format::SOPK: { SOPK_instruction* aK = static_cast(a); SOPK_instruction* bK = static_cast(b); @@ -172,33 +238,70 @@ struct InstrPred { return false; return true; } - case Format::PSEUDO_REDUCTION: - return false; + case Format::PSEUDO_REDUCTION: { + Pseudo_reduction_instruction *aR = static_cast(a); + Pseudo_reduction_instruction *bR = static_cast(b); + return aR->pass_flags == bR->pass_flags && + aR->reduce_op == bR->reduce_op && + aR->cluster_size == bR->cluster_size; + } case Format::MTBUF: { - /* this is fine since they are only used for vertex input fetches */ MTBUF_instruction* aM = static_cast(a); MTBUF_instruction* bM = static_cast(b); - return aM->dfmt == bM->dfmt && + return aM->can_reorder && bM->can_reorder && + aM->barrier == bM->barrier && + aM->dfmt == bM->dfmt && aM->nfmt == bM->nfmt && aM->offset == bM->offset && aM->offen == bM->offen && aM->idxen == bM->idxen && aM->glc == bM->glc && + aM->dlc == bM->dlc && + aM->slc == bM->slc && + aM->tfe == bM->tfe && + aM->disable_wqm == bM->disable_wqm; + } + case Format::MUBUF: { + MUBUF_instruction* aM = static_cast(a); + MUBUF_instruction* bM = static_cast(b); + return aM->can_reorder && bM->can_reorder && + aM->barrier == bM->barrier && + aM->offset == bM->offset && + aM->offen == bM->offen && + aM->idxen == bM->idxen && + aM->glc == bM->glc && + aM->dlc == bM->dlc && aM->slc == bM->slc && aM->tfe == bM->tfe && + aM->lds == bM->lds && aM->disable_wqm == bM->disable_wqm; } /* we want to optimize these in NIR and don't hassle with load-store dependencies */ - case Format::MUBUF: case Format::FLAT: case Format::GLOBAL: case Format::SCRATCH: - case Format::DS: + case Format::EXP: + case Format::SOPP: + case Format::PSEUDO_BRANCH: + case Format::PSEUDO_BARRIER: return false; + case Format::DS: { + if (a->opcode != aco_opcode::ds_bpermute_b32 && + a->opcode != aco_opcode::ds_permute_b32 && + a->opcode != aco_opcode::ds_swizzle_b32) + return false; + DS_instruction* aD = static_cast(a); + DS_instruction* bD = static_cast(b); + return aD->pass_flags == bD->pass_flags && + aD->gds == bD->gds && + aD->offset0 == bD->offset0 && + aD->offset1 == bD->offset1; + } case Format::MIMG: { MIMG_instruction* aM = static_cast(a); MIMG_instruction* bM = static_cast(b); return aM->can_reorder && bM->can_reorder && + aM->barrier == bM->barrier && aM->dmask == bM->dmask && aM->unrm == bM->unrm && aM->glc == bM->glc && @@ -217,46 +320,63 @@ struct InstrPred { } }; +using expr_set = std::unordered_map; + +struct vn_ctx { + Program* program; + expr_set expr_values; + std::map renames; + + /* The exec id should be the same on the same level of control flow depth. + * Together with the check for dominator relations, it is safe to assume + * that the same exec_id also means the same execution mask. + * Discards increment the exec_id, so that it won't return to the previous value. + */ + uint32_t exec_id = 1; + + vn_ctx(Program* program) : program(program) { + static_assert(sizeof(Temp) == 4, "Temp must fit in 32bits"); + unsigned size = 0; + for (Block& block : program->blocks) + size += block.instructions.size(); + expr_values.reserve(size); + } +}; -typedef std::unordered_set expr_set; -void process_block(Block& block, - expr_set& expr_values, - std::map& renames) +/* dominates() returns true if the parent block dominates the child block and + * if the parent block is part of the same loop or has a smaller loop nest depth. + */ +bool dominates(vn_ctx& ctx, uint32_t parent, uint32_t child) +{ + unsigned parent_loop_nest_depth = ctx.program->blocks[parent].loop_nest_depth; + while (parent < child && parent_loop_nest_depth <= ctx.program->blocks[child].loop_nest_depth) + child = ctx.program->blocks[child].logical_idom; + + return parent == child; +} + +void process_block(vn_ctx& ctx, Block& block) { - bool run = false; - std::vector>::iterator it = block.instructions.begin(); std::vector> new_instructions; new_instructions.reserve(block.instructions.size()); - expr_set phi_values; - while (it != block.instructions.end()) { - aco_ptr& instr = *it; + for (aco_ptr& instr : block.instructions) { /* first, rename operands */ for (Operand& op : instr->operands) { if (!op.isTemp()) continue; - auto it = renames.find(op.tempId()); - if (it != renames.end()) + auto it = ctx.renames.find(op.tempId()); + if (it != ctx.renames.end()) op.setTemp(it->second); } - if (instr->definitions.empty() || !run) { - if (instr->opcode == aco_opcode::p_logical_start) - run = true; - else if (instr->opcode == aco_opcode::p_logical_end) - run = false; - else if (instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi) { - std::pair res = phi_values.emplace(instr.get()); - if (!res.second) { - Instruction* orig_phi = *(res.first); - renames.emplace(instr->definitions[0].tempId(), orig_phi->definitions[0].getTemp()).second; - ++it; - continue; - } - } + if (instr->opcode == aco_opcode::p_discard_if || + instr->opcode == aco_opcode::p_demote_to_helper) + ctx.exec_id++; + + if (instr->definitions.empty() || instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi) { new_instructions.emplace_back(std::move(instr)); - ++it; continue; } @@ -264,26 +384,37 @@ void process_block(Block& block, if ((instr->opcode == aco_opcode::s_mov_b32 || instr->opcode == aco_opcode::s_mov_b64 || instr->opcode == aco_opcode::v_mov_b32) && !instr->definitions[0].isFixed() && instr->operands[0].isTemp() && instr->operands[0].regClass() == instr->definitions[0].regClass() && !instr->isDPP() && !((int)instr->format & (int)Format::SDWA)) { - renames[instr->definitions[0].tempId()] = instr->operands[0].getTemp(); + ctx.renames[instr->definitions[0].tempId()] = instr->operands[0].getTemp(); } - std::pair res = expr_values.emplace(instr.get()); + instr->pass_flags = ctx.exec_id; + std::pair res = ctx.expr_values.emplace(instr.get(), block.index); /* if there was already an expression with the same value number */ if (!res.second) { - Instruction* orig_instr = *(res.first); + Instruction* orig_instr = res.first->first; assert(instr->definitions.size() == orig_instr->definitions.size()); - for (unsigned i = 0; i < instr->definitions.size(); i++) { - assert(instr->definitions[i].regClass() == orig_instr->definitions[i].regClass()); - renames.emplace(instr->definitions[i].tempId(), orig_instr->definitions[i].getTemp()).second; + /* check if the original instruction dominates the current one */ + if (dominates(ctx, res.first->second, block.index) && + ctx.program->blocks[res.first->second].fp_mode.canReplace(block.fp_mode)) { + for (unsigned i = 0; i < instr->definitions.size(); i++) { + assert(instr->definitions[i].regClass() == orig_instr->definitions[i].regClass()); + assert(instr->definitions[i].isTemp()); + ctx.renames[instr->definitions[i].tempId()] = orig_instr->definitions[i].getTemp(); + if (instr->definitions[i].isPrecise()) + orig_instr->definitions[i].setPrecise(true); + } + } else { + ctx.expr_values.erase(res.first); + ctx.expr_values.emplace(instr.get(), block.index); + new_instructions.emplace_back(std::move(instr)); } } else { new_instructions.emplace_back(std::move(instr)); } - ++it; } - block.instructions.swap(new_instructions); + block.instructions = std::move(new_instructions); } void rename_phi_operands(Block& block, std::map& renames) @@ -306,22 +437,43 @@ void rename_phi_operands(Block& block, std::map& renames) void value_numbering(Program* program) { - std::vector expr_values(program->blocks.size()); - std::map renames; + vn_ctx ctx(program); + std::vector loop_headers; for (Block& block : program->blocks) { - if (block.logical_idom != -1) { - /* initialize expr_values from idom */ - expr_values[block.index] = expr_values[block.logical_idom]; - process_block(block, expr_values[block.index], renames); - } else { - expr_set empty; - process_block(block, empty, renames); + assert(ctx.exec_id > 0); + /* decrement exec_id when leaving nested control flow */ + if (block.kind & block_kind_loop_header) + loop_headers.push_back(block.index); + if (block.kind & block_kind_merge) { + ctx.exec_id--; + } else if (block.kind & block_kind_loop_exit) { + ctx.exec_id -= program->blocks[loop_headers.back()].linear_preds.size(); + ctx.exec_id -= block.linear_preds.size(); + loop_headers.pop_back(); } + + if (block.logical_idom != -1) + process_block(ctx, block); + else + rename_phi_operands(block, ctx.renames); + + /* increment exec_id when entering nested control flow */ + if (block.kind & block_kind_branch || + block.kind & block_kind_loop_preheader || + block.kind & block_kind_break || + block.kind & block_kind_continue || + block.kind & block_kind_discard) + ctx.exec_id++; + else if (block.kind & block_kind_continue_or_break) + ctx.exec_id += 2; } - for (Block& block : program->blocks) - rename_phi_operands(block, renames); + /* rename loop header phi operands */ + for (Block& block : program->blocks) { + if (block.kind & block_kind_loop_header) + rename_phi_operands(block, ctx.renames); + } } }