From: Christoph Bumiller Date: Sun, 29 Apr 2012 15:56:57 +0000 (+0200) Subject: nvc0/ir: implement better placement of texture barriers X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=00fe442253744c4c4e7e68da44d6983da053968b;p=mesa.git nvc0/ir: implement better placement of texture barriers Put them before first uses instead of right after the texturing instruction and cull unnecessary barriers. --- diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir.h b/src/gallium/drivers/nv50/codegen/nv50_ir.h index c299cab3f52..da9042066ad 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir.h +++ b/src/gallium/drivers/nv50/codegen/nv50_ir.h @@ -135,6 +135,8 @@ enum operation OP_LAST }; +// various instruction-specific modifier definitions Instruction::subOp +// MOV_FINAL marks a MOV originating from an EXPORT (used for placing TEXBARs) #define NV50_IR_SUBOP_MUL_HIGH 1 #define NV50_IR_SUBOP_EMIT_RESTART 1 #define NV50_IR_SUBOP_LDC_IL 1 @@ -143,6 +145,7 @@ enum operation #define NV50_IR_SUBOP_SHIFT_WRAP 1 #define NV50_IR_SUBOP_EMU_PRERET 1 #define NV50_IR_SUBOP_TEXBAR(n) n +#define NV50_IR_SUBOP_MOV_FINAL 1 enum DataType { diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_graph.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_graph.cpp index 90147b6bfb8..f1bff973636 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_graph.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_graph.cpp @@ -21,6 +21,9 @@ */ #include "nv50_ir_graph.h" +#include +#include +#include "nv50_ir.h" namespace nv50_ir { @@ -228,8 +231,7 @@ public: virtual bool end() const { return pos >= count; } virtual void next() { if (pos < count) ++pos; } virtual void *get() const { return nodes[pos]; } - - void reset() { pos = 0; } + virtual void reset() { pos = 0; } protected: Graph::Node **nodes; @@ -274,6 +276,7 @@ public: virtual void *get() const { return nodes[pos]; } virtual bool end() const { return pos >= count; } virtual void next() { if (pos < count) ++pos; } + virtual void reset() { pos = 0; } private: void search(Graph::Node *node, const int sequence) @@ -389,4 +392,43 @@ void Graph::classifyDFS(Node *curr, int& seq) curr->tag = 0; } +// @dist is indexed by Node::tag, returns -1 if no path found +int +Graph::findLightestPathWeight(Node *a, Node *b, const std::vector &weight) +{ + std::vector path(weight.size(), std::numeric_limits::max()); + std::list nodeList; + const int seq = nextSequence(); + + path[a->tag] = 0; + for (Node *c = a; c && c != b;) { + const int p = path[c->tag] + weight[c->tag]; + for (EdgeIterator ei = c->outgoing(); !ei.end(); ei.next()) { + Node *t = ei.getNode(); + if (t->getSequence() < seq) { + if (path[t->tag] == std::numeric_limits::max()) + nodeList.push_front(t); + if (p < path[t->tag]) + path[t->tag] = p; + } + } + c->visit(seq); + Node *next = NULL; + for (std::list::iterator n = nodeList.begin(); + n != nodeList.end(); ++n) { + if (!next || path[(*n)->tag] < path[next->tag]) + next = *n; + if ((*n) == c) { + // erase visited + n = nodeList.erase(n); + --n; + } + } + c = next; + } + if (path[b->tag] == std::numeric_limits::max()) + return -1; + return path[b->tag]; +} + } // namespace nv50_ir diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_graph.h b/src/gallium/drivers/nv50/codegen/nv50_ir_graph.h index cfa73c3dd1a..9ef317f943c 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_graph.h +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_graph.h @@ -24,6 +24,7 @@ #define __NV50_IR_GRAPH_H__ #include "nv50_ir_util.h" +#include namespace nv50_ir { @@ -165,6 +166,9 @@ public: void classifyEdges(); + // @weights: indexed by Node::tag + int findLightestPathWeight(Node *, Node *, const std::vector& weights); + private: void classifyDFS(Node *, int&); diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp index 30d8acee3bc..27373b4cc47 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp @@ -998,6 +998,7 @@ NV50LoweringPreSSA::handleEXPORT(Instruction *i) int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units i->op = OP_MOV; + i->subOp = NV50_IR_SUBOP_MOV_FINAL; i->src(0).set(i->src(1)); i->setSrc(1, NULL); i->setDef(0, new_LValue(func, FILE_GPR)); diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp index 7d0ebc69710..5bc3a450779 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp @@ -33,7 +33,7 @@ namespace nv50_ir { bool Instruction::isNop() const { - if (op == OP_CONSTRAINT || op == OP_PHI) + if (op == OP_PHI || op == OP_SPLIT || op == OP_MERGE || op == OP_CONSTRAINT) return true; if (terminator || join) // XXX: should terminator imply flow ? return false; @@ -1855,7 +1855,10 @@ FlatteningPass::visit(BasicBlock *bb) Instruction *insn = bb->getExit(); if (insn && insn->op == OP_JOIN && !insn->getPredicate()) { insn = insn->prev; - if (insn && !insn->getPredicate() && !insn->asFlow() && !insn->isNop()) { + if (insn && !insn->getPredicate() && + !insn->asFlow() && + insn->op != OP_TEXBAR && + !insn->isNop()) { insn->join = 1; bb->remove(bb->getExit()); return true; diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp index 5d2aa587a31..77edaa6067a 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp @@ -1615,7 +1615,6 @@ GCRA::resolveSplitsAndMerges() v->join = v; reg += v->reg.size; } - delete_Instruction(prog, split); } splits.clear(); @@ -1630,7 +1629,6 @@ GCRA::resolveSplitsAndMerges() v->join = v; reg += v->reg.size; } - delete_Instruction(prog, merge); } merges.clear(); } diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_util.h b/src/gallium/drivers/nv50/codegen/nv50_ir_util.h index f348f1811b8..a0020789c11 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_util.h +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_util.h @@ -91,6 +91,7 @@ public: virtual void next() = 0; virtual void *get() const = 0; virtual bool end() const = 0; // if true, get will return 0 + virtual void reset() { assert(0); } // only for graph iterators }; typedef std::auto_ptr IteratorRef; diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp index 318d345efdb..02ae9fd5d0e 100644 --- a/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp @@ -25,6 +25,8 @@ #include "nv50_ir_target_nvc0.h" +#include + namespace nv50_ir { #define QOP_ADD 0 @@ -129,8 +131,26 @@ private: bool tryReplaceContWithBra(BasicBlock *); void propagateJoin(BasicBlock *); - LValue *r63; + struct TexUse + { + TexUse(Instruction *use, const Instruction *tex) + : insn(use), tex(tex), level(-1) { } + Instruction *insn; + const Instruction *tex; // or split / mov + int level; + }; + struct Limits + { + Limits() { } + Limits(int min, int max) : min(min), max(max) { } + int min, max; + }; + bool insertTextureBarriers(Function *); + inline bool insnDominatedBy(const Instruction *, const Instruction *) const; + void findFirstUses(const Instruction *, std::list&); +private: + LValue *r63; const bool needTexBar; }; @@ -139,9 +159,256 @@ NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog) { } +bool +NVC0LegalizePostRA::insnDominatedBy(const Instruction *later, + const Instruction *early) const +{ + if (early->bb == later->bb) + return early->serial < later->serial; + return later->bb->dominatedBy(early->bb); +} + +void +NVC0LegalizePostRA::findFirstUses(const Instruction *insn, + std::list &uses) +{ + for (int d = 0; insn->defExists(d); ++d) { + Value *v = insn->getDef(d); + for (Value::UseIterator u = v->uses.begin(); u != v->uses.end(); ++u) { + Instruction *usei = (*u)->getInsn(); + if (usei->op == OP_SPLIT || + usei->op == OP_PHI || + usei->op == OP_UNION) { + // these uses don't manifest in the machine code + findFirstUses(usei, uses); + } else + if (usei->op == OP_MOV && usei->getDef(0)->equals(usei->getSrc(0)) && + usei->subOp != NV50_IR_SUBOP_MOV_FINAL) { + findFirstUses(usei, uses); + } else { + bool add = true; + for (std::list::iterator it = uses.begin(); + it != uses.end();) { + if (insnDominatedBy(usei, it->insn)) { + add = false; + break; + } + if (insnDominatedBy(it->insn, usei)) + it = uses.erase(it); + else + ++it; + } + if (add) + uses.push_back(TexUse(usei, insn)); + } + } + } +} + +// Texture barriers: +// This pass is a bit long and ugly and can probably be optimized. +// +// 1. obtain a list of TEXes and their outputs' first use(s) +// 2. calculate the barrier level of each first use (minimal number of TEXes, +// over all paths, between the TEX and the use in question) +// 3. for each barrier, if all paths from the source TEX to that barrier +// contain a barrier of lesser level, it can be culled +bool +NVC0LegalizePostRA::insertTextureBarriers(Function *fn) +{ + std::list *uses; + std::vector texes; + std::vector bbFirstTex; + std::vector bbFirstUse; + std::vector texCounts; + std::vector useVec; + ArrayList insns; + + fn->orderInstructions(insns); + + texCounts.resize(fn->allBBlocks.getSize(), 0); + bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize()); + bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize()); + + // tag BB CFG nodes by their id for later + for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) { + BasicBlock *bb = reinterpret_cast(i.get()); + if (bb) + bb->cfg.tag = bb->getId(); + } + + // gather the first uses for each TEX + for (int i = 0; i < insns.getSize(); ++i) { + Instruction *tex = reinterpret_cast(insns.get(i)); + if (isTextureOp(tex->op)) { + texes.push_back(tex); + if (!texCounts.at(tex->bb->getId())) + bbFirstTex[tex->bb->getId()] = texes.size() - 1; + texCounts[tex->bb->getId()]++; + } + } + insns.clear(); + if (texes.empty()) + return false; + uses = new std::list[texes.size()]; + if (!uses) + return false; + for (size_t i = 0; i < texes.size(); ++i) + findFirstUses(texes[i], uses[i]); + + // determine the barrier level at each use + for (size_t i = 0; i < texes.size(); ++i) { + for (std::list::iterator u = uses[i].begin(); u != uses[i].end(); + ++u) { + BasicBlock *tb = texes[i]->bb; + BasicBlock *ub = u->insn->bb; + if (tb == ub) { + u->level = 0; + for (size_t j = i + 1; j < texes.size() && + texes[j]->bb == tb && texes[j]->serial < u->insn->serial; + ++j) + u->level++; + } else { + u->level = fn->cfg.findLightestPathWeight(&tb->cfg, + &ub->cfg, texCounts); + if (u->level < 0) { + WARN("Failed to find path TEX -> TEXBAR\n"); + u->level = 0; + continue; + } + // this counted all TEXes in the origin block, correct that + u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */; + // and did not count the TEXes in the destination block, add those + for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() && + texes[j]->bb == ub && texes[j]->serial < u->insn->serial; + ++j) + u->level++; + } + assert(u->level >= 0); + useVec.push_back(*u); + } + } + delete[] uses; + uses = NULL; + + // insert the barriers + for (size_t i = 0; i < useVec.size(); ++i) { + Instruction *prev = useVec[i].insn->prev; + if (useVec[i].level < 0) + continue; + if (prev && prev->op == OP_TEXBAR) { + if (prev->subOp > useVec[i].level) + prev->subOp = useVec[i].level; + prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0)); + } else { + Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE); + bar->fixed = 1; + bar->subOp = useVec[i].level; + // make use explicit to ease latency calculation + bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0)); + useVec[i].insn->bb->insertBefore(useVec[i].insn, bar); + } + } + + if (fn->getProgram()->optLevel < 3) { + if (uses) + delete[] uses; + return true; + } + + std::vector limitT, limitB, limitS; // entry, exit, single + + limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0)); + limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0)); + limitS.resize(fn->allBBlocks.getSize()); + + // cull unneeded barriers (should do that earlier, but for simplicity) + IteratorRef bi = fn->cfg.iteratorDFS(true); + // first calculate min/max outstanding TEXes for each BB + for (bi->reset(); !bi->end(); bi->next()) { + Graph::Node *n = reinterpret_cast(bi->get()); + BasicBlock *bb = BasicBlock::get(n); + int min = 0; + int max = std::numeric_limits::max(); + for (Instruction *i = bb->getFirst(); i; i = i->next) { + if (isTextureOp(i->op)) { + min++; + if (max < std::numeric_limits::max()) + max++; + } else + if (i->op == OP_TEXBAR) { + min = MIN2(min, i->subOp); + max = MIN2(max, i->subOp); + } + } + // limits when looking at an isolated block + limitS[bb->getId()].min = min; + limitS[bb->getId()].max = max; + } + // propagate the min/max values + for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) { + for (bi->reset(); !bi->end(); bi->next()) { + Graph::Node *n = reinterpret_cast(bi->get()); + BasicBlock *bb = BasicBlock::get(n); + const int bbId = bb->getId(); + for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) { + BasicBlock *in = BasicBlock::get(ei.getNode()); + const int inId = in->getId(); + limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min); + limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max); + } + // I just hope this is correct ... + if (limitS[bbId].max == std::numeric_limits::max()) { + // no barrier + limitB[bbId].min = limitT[bbId].min + limitS[bbId].min; + limitB[bbId].max = limitT[bbId].max + limitS[bbId].min; + } else { + // block contained a barrier + limitB[bbId].min = MIN2(limitS[bbId].max, + limitT[bbId].min + limitS[bbId].min); + limitB[bbId].max = MIN2(limitS[bbId].max, + limitT[bbId].max + limitS[bbId].min); + } + } + } + // finally delete unnecessary barriers + for (bi->reset(); !bi->end(); bi->next()) { + Graph::Node *n = reinterpret_cast(bi->get()); + BasicBlock *bb = BasicBlock::get(n); + Instruction *prev = NULL; + Instruction *next; + int max = limitT[bb->getId()].max; + for (Instruction *i = bb->getFirst(); i; i = next) { + next = i->next; + if (i->op == OP_TEXBAR) { + if (i->subOp >= max) { + delete_Instruction(prog, i); + } else { + max = i->subOp; + if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) { + delete_Instruction(prog, prev); + prev = NULL; + } + } + } else + if (isTextureOp(i->op)) { + max++; + } + if (!i->isNop()) + prev = i; + } + } + if (uses) + delete[] uses; + return true; +} + bool NVC0LegalizePostRA::visit(Function *fn) { + if (needTexBar) + insertTextureBarriers(fn); + r63 = new_LValue(fn, FILE_GPR); r63->reg.data.id = 63; return true; @@ -235,12 +502,6 @@ NVC0LegalizePostRA::visit(BasicBlock *bb) } else if (i->isNop()) { bb->remove(i); - } else - if (needTexBar && isTextureOp(i->op)) { - Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE); - bar->fixed = 1; - bar->subOp = 0; - bb->insertAfter(i, bar); } else { if (i->op != OP_MOV && i->op != OP_PFETCH) replaceZero(i); @@ -673,6 +934,7 @@ NVC0LoweringPass::handleEXPORT(Instruction *i) if (i->src(0).isIndirect(0)) // TODO, ugly return false; i->op = OP_MOV; + i->subOp = NV50_IR_SUBOP_MOV_FINAL; i->src(0).set(i->src(1)); i->setSrc(1, NULL); i->setDef(0, new_LValue(func, FILE_GPR));