From afcd7b5d1614a8a758ccb4353a9c31a601c9b9b4 Mon Sep 17 00:00:00 2001 From: Christoph Bumiller Date: Sun, 29 Apr 2012 17:59:06 +0200 Subject: [PATCH] nvc0/ir: initial implementation of nve4 scheduling hints --- src/gallium/drivers/nv50/codegen/nv50_ir.h | 2 + .../drivers/nv50/codegen/nv50_ir_print.cpp | 20 + .../drivers/nv50/codegen/nv50_ir_target.cpp | 81 +++ .../drivers/nv50/codegen/nv50_ir_target.h | 38 +- .../nv50/codegen/nv50_ir_target_nv50.cpp | 4 +- .../nvc0/codegen/nv50_ir_emit_nvc0.cpp | 530 +++++++++++++++++- .../nvc0/codegen/nv50_ir_target_nvc0.cpp | 77 ++- .../nvc0/codegen/nv50_ir_target_nvc0.h | 1 + 8 files changed, 738 insertions(+), 15 deletions(-) diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir.h b/src/gallium/drivers/nv50/codegen/nv50_ir.h index da9042066ad..e544d071b52 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir.h +++ b/src/gallium/drivers/nv50/codegen/nv50_ir.h @@ -681,6 +681,8 @@ public: uint8_t subOp; // quadop, 1 for mul-high, etc. + uint8_t sched; // scheduling data (NOTE: maybe move to separate storage) + unsigned encSize : 4; // encoding size in bytes unsigned saturate : 1; // to [0.0f, 1.0f] unsigned join : 1; // converge control flow (use OP_JOIN until end) diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp index 4652bb95f69..9d92b7bc24b 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_print.cpp @@ -53,6 +53,26 @@ static const char *colour[8] = #endif }; +static const char *OpClassStr[OPCLASS_OTHER + 1] = +{ + "MOVE", + "LOAD", + "STORE", + "ARITH", + "SHIFT", + "SFU", + "LOGIC", + "COMPARE", + "CONVERT", + "ATOMIC", + "TEXTURE", + "SURFACE", + "FLOW", + "(INVALID)", + "PSEUDO", + "OTHER" +}; + const char *operationStr[OP_LAST + 1] = { "nop", diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_target.cpp index e3eae69554c..f718912fb39 100644 
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_target.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_target.cpp @@ -52,6 +52,65 @@ const uint8_t Target::operationSrcNr[OP_LAST + 1] = 0 }; +const OpClass Target::operationClass[OP_LAST + 1] = +{ + // NOP; PHI; UNION, SPLIT, MERGE, CONSTRAINT + OPCLASS_OTHER, + OPCLASS_PSEUDO, + OPCLASS_PSEUDO, OPCLASS_PSEUDO, OPCLASS_PSEUDO, OPCLASS_PSEUDO, + // MOV; LOAD; STORE + OPCLASS_MOVE, + OPCLASS_LOAD, + OPCLASS_STORE, + // ADD, SUB, MUL; DIV, MOD; MAD, FMA, SAD + OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH, + OPCLASS_ARITH, OPCLASS_ARITH, + OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH, + // ABS, NEG; NOT, AND, OR, XOR; SHL, SHR + OPCLASS_CONVERT, OPCLASS_CONVERT, + OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC, + OPCLASS_SHIFT, OPCLASS_SHIFT, + // MAX, MIN + OPCLASS_COMPARE, OPCLASS_COMPARE, + // SAT, CEIL, FLOOR, TRUNC; CVT + OPCLASS_CONVERT, OPCLASS_CONVERT, OPCLASS_CONVERT, OPCLASS_CONVERT, + OPCLASS_CONVERT, + // SET(AND,OR,XOR); SELP, SLCT + OPCLASS_COMPARE, OPCLASS_COMPARE, OPCLASS_COMPARE, OPCLASS_COMPARE, + OPCLASS_COMPARE, OPCLASS_COMPARE, + // RCP, RSQ, LG2, SIN, COS; EX2, EXP, LOG, PRESIN, PREEX2; SQRT, POW + OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, + OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, + OPCLASS_SFU, OPCLASS_SFU, + // BRA, CALL, RET; CONT, BREAK, PRE(RET,CONT,BREAK); BRKPT, JOINAT, JOIN + OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW, + OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW, + OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW, + // DISCARD, EXIT + OPCLASS_FLOW, OPCLASS_FLOW, + // MEMBAR + OPCLASS_OTHER, + // VFETCH, PFETCH, EXPORT + OPCLASS_LOAD, OPCLASS_OTHER, OPCLASS_STORE, + // LINTERP, PINTERP + OPCLASS_SFU, OPCLASS_SFU, + // EMIT, RESTART + OPCLASS_OTHER, OPCLASS_OTHER, + // TEX, TXB, TXL, TXF; TXQ, TXD, TXG, TEXCSAA + OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE, + OPCLASS_TEXTURE, OPCLASS_TEXTURE, 
OPCLASS_TEXTURE, OPCLASS_TEXTURE, + // SULD, SUST + OPCLASS_SURFACE, OPCLASS_SURFACE, + // DFDX, DFDY, RDSV, WRSV; PIXLD, QUADOP, QUADON, QUADPOP + OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, + OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, + // POPCNT, INSBF, EXTBF + OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, + // TEXBAR + OPCLASS_OTHER, + OPCLASS_PSEUDO // LAST +}; + extern Target *getTargetNVC0(unsigned int chipset); extern Target *getTargetNV50(unsigned int chipset); @@ -104,6 +163,11 @@ CodeEmitter::printBinary() const INFO("\n"); } +static inline uint32_t sizeToBundlesNVE4(uint32_t size) +{ + return (size + 55) / 56; +} + void CodeEmitter::prepareEmission(Program *prog) { @@ -112,6 +176,23 @@ CodeEmitter::prepareEmission(Program *prog) Function *func = reinterpret_cast<Function *>(fi.get()); func->binPos = prog->binSize; prepareEmission(func); + + // adjust sizes & positions for scheduling info: + if (prog->getTarget()->hasSWSched) { + BasicBlock *bb = NULL; + for (int i = 0; i < func->bbCount; ++i) { + bb = func->bbArray[i]; + const uint32_t oldPos = bb->binPos; + const uint32_t oldEnd = bb->binPos + bb->binSize; + uint32_t adjPos = oldPos + sizeToBundlesNVE4(oldPos) * 8; + uint32_t adjEnd = oldEnd + sizeToBundlesNVE4(oldEnd) * 8; + bb->binPos = adjPos; + bb->binSize = adjEnd - adjPos; + } + if (bb) + func->binSize = bb->binPos + bb->binSize; + } + prog->binSize += func->binSize; } } diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_target.h b/src/gallium/drivers/nv50/codegen/nv50_ir_target.h index 88996ebbde3..c60ee0216f7 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_target.h +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_target.h @@ -93,9 +93,31 @@ protected: RelocInfo *relocInfo; }; + +enum OpClass +{ + OPCLASS_MOVE = 0, + OPCLASS_LOAD = 1, + OPCLASS_STORE = 2, + OPCLASS_ARITH = 3, + OPCLASS_SHIFT = 4, + OPCLASS_SFU = 5, + OPCLASS_LOGIC = 6, + OPCLASS_COMPARE = 7, + OPCLASS_CONVERT = 8, + OPCLASS_ATOMIC = 9, +
OPCLASS_TEXTURE = 10, + OPCLASS_SURFACE = 11, + OPCLASS_FLOW = 12, + OPCLASS_PSEUDO = 14, + OPCLASS_OTHER = 15 +}; + class Target { public: + Target(bool j, bool s) : joinAnterior(j), hasSWSched(s) { } + static Target *create(uint32_t chipset); static void destroy(Target *); @@ -153,6 +175,9 @@ public: virtual bool mayPredicate(const Instruction *, const Value *) const = 0; + // whether @insn can be issued together with @next (order matters) + virtual bool canDualIssue(const Instruction *insn, + const Instruction *next) const { return false; } virtual int getLatency(const Instruction *) const { return 1; } virtual int getThroughput(const Instruction *) const { return 1; } @@ -162,9 +187,20 @@ public: virtual uint32_t getSVAddress(DataFile, const Symbol *) const = 0; public: - bool joinAnterior; // true if join is executed before the op + const bool joinAnterior; // true if join is executed before the op + const bool hasSWSched; // true if code should provide scheduling data static const uint8_t operationSrcNr[OP_LAST + 1]; + static const OpClass operationClass[OP_LAST + 1]; + + static inline uint8_t getOpSrcNr(operation op) + { + return operationSrcNr[op]; + } + static inline OpClass getOpClass(operation op) + { + return operationClass[op]; + } protected: uint32_t chipset; diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.cpp index a64f7f72255..5e541e514cb 100644 --- a/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.cpp +++ b/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.cpp @@ -29,7 +29,7 @@ Target *getTargetNV50(unsigned int chipset) return new TargetNV50(chipset); } -TargetNV50::TargetNV50(unsigned int card) +TargetNV50::TargetNV50(unsigned int card) : Target(true, false) { chipset = card; @@ -132,8 +132,6 @@ void TargetNV50::initOpInfo() OP_CALL, OP_PREBREAK, OP_PRERET, OP_QUADON, OP_QUADPOP, OP_JOINAT }; - joinAnterior = true; - for (i = 0; i < DATA_FILE_COUNT; ++i) 
nativeFileMap[i] = (DataFile)i; nativeFileMap[FILE_PREDICATE] = FILE_FLAGS; diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp index 48d99bfe803..c91c56c9c10 100644 --- a/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp +++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp @@ -33,6 +33,7 @@ public: virtual bool emitInstruction(Instruction *); virtual uint32_t getMinEncodingSize(const Instruction *) const; + virtual void prepareEmission(Function *); inline void setProgramType(Program::Type pType) { progType = pType; } @@ -41,6 +42,8 @@ private: Program::Type progType; + const bool writeIssueDelays; + private: void emitForm_A(const Instruction *, uint64_t); void emitForm_B(const Instruction *, uint64_t); @@ -1505,15 +1508,40 @@ CodeEmitterNVC0::emitMOV(const Instruction *i) bool CodeEmitterNVC0::emitInstruction(Instruction *insn) { + unsigned int size = insn->encSize; + + if (writeIssueDelays && !(codeSize & 0x3f)) + size += 8; + if (!insn->encSize) { ERROR("skipping unencodable instruction: "); insn->print(); return false; } else - if (codeSize + insn->encSize > codeSizeLimit) { + if (codeSize + size > codeSizeLimit) { ERROR("code emitter output buffer too small\n"); return false; } + if (writeIssueDelays) { + if (!(codeSize & 0x3f)) { + code[0] = 0x00000007; // cf issue delay "instruction" + code[1] = 0x20000000; + code += 2; + codeSize += 8; + } + const unsigned int id = (codeSize & 0x3f) / 8 - 1; + uint32_t *data = code - (id * 2 + 2); + if (id <= 2) { + data[0] |= insn->sched << (id * 8 + 4); + } else + if (id == 3) { + data[0] |= insn->sched << 28; + data[1] |= insn->sched >> 4; + } else { + data[1] |= insn->sched << ((id - 4) * 8 + 4); + } + } + // assert that instructions with multiple defs don't corrupt registers for (int d = 0; insn->defExists(d); ++d) assert(insn->asTex() || insn->def(d).rep()->reg.data.id >= 0); @@ -1707,7 +1735,7 @@ 
CodeEmitterNVC0::getMinEncodingSize(const Instruction *i) const { const Target::OpInfo &info = targ->getOpInfo(i); - if (info.minEncSize == 8 || 1) + if (writeIssueDelays || info.minEncSize == 8 || 1) return 8; if (i->ftz || i->saturate || i->join) @@ -1761,7 +1789,503 @@ CodeEmitterNVC0::getMinEncodingSize(const Instruction *i) const return 4; } -CodeEmitterNVC0::CodeEmitterNVC0(const TargetNVC0 *target) : CodeEmitter(target) +// Simplified, erring on safe side. +class SchedDataCalculator : public Pass +{ +public: + SchedDataCalculator(const Target *targ) : targ(targ) { } + +private: + struct RegScores + { + struct Resource { + int st[DATA_FILE_COUNT]; // LD to LD delay 3 + int ld[DATA_FILE_COUNT]; // ST to ST delay 3 + int tex; // TEX to non-TEX delay 17 (0x11) + int sfu; // SFU to SFU delay 3 (except PRE-ops) + int imul; // integer MUL to MUL delay 3 + } res; + struct ScoreData { + int r[64]; + int p[8]; + int c; + } rd, wr; + int base; + + void rebase(const int base) + { + const int delta = this->base - base; + if (!delta) + return; + this->base = 0; + + for (int i = 0; i < 64; ++i) { + rd.r[i] += delta; + wr.r[i] += delta; + } + for (int i = 0; i < 8; ++i) { + rd.p[i] += delta; + wr.p[i] += delta; + } + rd.c += delta; + wr.c += delta; + + for (unsigned int f = 0; f < DATA_FILE_COUNT; ++f) { + res.ld[f] += delta; + res.st[f] += delta; + } + res.sfu += delta; + res.imul += delta; + res.tex += delta; + } + void wipe() + { + memset(&rd, 0, sizeof(rd)); + memset(&wr, 0, sizeof(wr)); + memset(&res, 0, sizeof(res)); + } + int getLatest(const ScoreData& d) const + { + int max = 0; + for (int i = 0; i < 64; ++i) + if (d.r[i] > max) + max = d.r[i]; + for (int i = 0; i < 8; ++i) + if (d.p[i] > max) + max = d.p[i]; + if (d.c > max) + max = d.c; + return max; + } + inline int getLatestRd() const + { + return getLatest(rd); + } + inline int getLatestWr() const + { + return getLatest(wr); + } + inline int getLatest() const + { + const int a = getLatestRd(); + const int b = 
getLatestWr(); + + int max = MAX2(a, b); + for (unsigned int f = 0; f < DATA_FILE_COUNT; ++f) { + max = MAX2(res.ld[f], max); + max = MAX2(res.st[f], max); + } + max = MAX2(res.sfu, max); + max = MAX2(res.imul, max); + max = MAX2(res.tex, max); + return max; + } + void setMax(const RegScores *that) + { + for (int i = 0; i < 64; ++i) { + rd.r[i] = MAX2(rd.r[i], that->rd.r[i]); + wr.r[i] = MAX2(wr.r[i], that->wr.r[i]); + } + for (int i = 0; i < 8; ++i) { + rd.p[i] = MAX2(rd.p[i], that->rd.p[i]); + wr.p[i] = MAX2(wr.p[i], that->wr.p[i]); + } + rd.c = MAX2(rd.c, that->rd.c); + wr.c = MAX2(wr.c, that->wr.c); + + for (unsigned int f = 0; f < DATA_FILE_COUNT; ++f) { + res.ld[f] = MAX2(res.ld[f], that->res.ld[f]); + res.st[f] = MAX2(res.st[f], that->res.st[f]); + } + res.sfu = MAX2(res.sfu, that->res.sfu); + res.imul = MAX2(res.imul, that->res.imul); + res.tex = MAX2(res.tex, that->res.tex); + } + void print(int cycle) + { + for (int i = 0; i < 64; ++i) { + if (rd.r[i] > cycle) + INFO("rd $r%i @ %i\n", i, rd.r[i]); + if (wr.r[i] > cycle) + INFO("wr $r%i @ %i\n", i, wr.r[i]); + } + for (int i = 0; i < 8; ++i) { + if (rd.p[i] > cycle) + INFO("rd $p%i @ %i\n", i, rd.p[i]); + if (wr.p[i] > cycle) + INFO("wr $p%i @ %i\n", i, wr.p[i]); + } + if (rd.c > cycle) + INFO("rd $c @ %i\n", rd.c); + if (wr.c > cycle) + INFO("wr $c @ %i\n", wr.c); + if (res.sfu > cycle) + INFO("sfu @ %i\n", res.sfu); + if (res.imul > cycle) + INFO("imul @ %i\n", res.imul); + if (res.tex > cycle) + INFO("tex @ %i\n", res.tex); + } + }; + + RegScores *score; // for current BB + std::vector<RegScores> scoreBoards; + int cycle; + int prevData; + operation prevOp; + + const Target *targ; + + bool visit(Function *); + bool visit(BasicBlock *); + + void commitInsn(const Instruction *, int cycle); + int calcDelay(const Instruction *, int cycle) const; + void setDelay(Instruction *, int delay, Instruction *next); + + void recordRd(const Value *, const int ready); + void recordWr(const Value *, const int ready); + void
checkRd(const Value *, int cycle, int& delay) const; + void checkWr(const Value *, int cycle, int& delay) const; + + int getCycles(const Instruction *, int origDelay) const; +}; + +void +SchedDataCalculator::setDelay(Instruction *insn, int delay, Instruction *next) +{ + if (insn->op == OP_EXIT) + delay = MAX2(delay, 14); + + if (insn->op == OP_TEXBAR) { + // TODO: except if results not used before EXIT + insn->sched = 0xc2; + } else + if (insn->op == OP_JOIN || insn->join) { + insn->sched = 0x00; + } else + if (delay >= 0 || prevData == 0x04 || + !next || !targ->canDualIssue(insn, next)) { + insn->sched = static_cast<uint8_t>(MAX2(delay, 0)); + if (prevOp == OP_EXPORT) + insn->sched |= 0x40; + else + insn->sched |= 0x20; + } else { + insn->sched = 0x04; // dual-issue + } + + if (prevData != 0x04 || prevOp != OP_EXPORT) + if (insn->sched != 0x04 || insn->op == OP_EXPORT) + prevOp = insn->op; + + prevData = insn->sched; +} + +int +SchedDataCalculator::getCycles(const Instruction *insn, int origDelay) const +{ + if (insn->sched & 0x80) { + int c = (insn->sched & 0x0f) * 2 + 1; + if (insn->op == OP_TEXBAR && origDelay > 0) + c += origDelay; + return c; + } + if (insn->sched & 0x60) + return (insn->sched & 0x1f) + 1; + return (insn->sched == 0x04) ?
0 : 32; +} + +bool +SchedDataCalculator::visit(Function *func) +{ + scoreBoards.resize(func->cfg.getSize()); + for (size_t i = 0; i < scoreBoards.size(); ++i) + scoreBoards[i].wipe(); + return true; +} + +bool +SchedDataCalculator::visit(BasicBlock *bb) +{ + Instruction *insn; + Instruction *next = NULL; + + int cycle = 0; + + prevData = 0x00; + prevOp = OP_NOP; + score = &scoreBoards.at(bb->getId()); + + for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) { + BasicBlock *in = BasicBlock::get(ei.getNode()); + if (in->getExit()) { + if (prevData != 0x04) + prevData = in->getExit()->sched; + prevOp = in->getExit()->op; + } + if (ei.getType() != Graph::Edge::BACK) + score->setMax(&scoreBoards.at(in->getId())); + // back branches will wait until all target dependencies are satisfied + } + if (bb->cfg.incidentCount() > 1) + prevOp = OP_NOP; + +#ifdef NVC0_DEBUG_SCHED_DATA + INFO("=== BB:%i initial scores\n", bb->getId()); + score->print(cycle); +#endif + + for (insn = bb->getEntry(); insn && insn->next; insn = insn->next) { + next = insn->next; + + commitInsn(insn, cycle); + int delay = calcDelay(next, cycle); + setDelay(insn, delay, next); + cycle += getCycles(insn, delay); + +#ifdef NVC0_DEBUG_SCHED_DATA + INFO("cycle %i, sched %02x\n", cycle, insn->sched); + insn->print(); + next->print(); +#endif + } + if (!insn) + return true; + commitInsn(insn, cycle); + + int bbDelay = -1; + + for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) { + BasicBlock *out = BasicBlock::get(ei.getNode()); + + if (ei.getType() != Graph::Edge::BACK) { + // only test the first instruction of the outgoing block + next = out->getEntry(); + if (next) + bbDelay = MAX2(bbDelay, calcDelay(next, cycle)); + } else { + // wait until all dependencies are satisfied + const int regsFree = score->getLatest(); + next = out->getFirst(); + for (int c = cycle; next && c < regsFree; next = next->next) { + bbDelay = MAX2(bbDelay, calcDelay(next, c)); + c += 
getCycles(next, bbDelay); + } + next = NULL; + } + } + if (bb->cfg.outgoingCount() != 1) + next = NULL; + setDelay(insn, bbDelay, next); + cycle += getCycles(insn, bbDelay); + + score->rebase(cycle); // common base for initializing out blocks' scores + return true; +} + +#define NVE4_MAX_ISSUE_DELAY 0x1f +int +SchedDataCalculator::calcDelay(const Instruction *insn, int cycle) const +{ + int delay = 0, ready = cycle; + + for (int s = 0; insn->srcExists(s); ++s) + checkRd(insn->getSrc(s), cycle, delay); + // WAR & WAW don't seem to matter + // for (int s = 0; insn->srcExists(s); ++s) + // recordRd(insn->getSrc(s), cycle); + + switch (Target::getOpClass(insn->op)) { + case OPCLASS_SFU: + ready = score->res.sfu; + break; + case OPCLASS_ARITH: + if (insn->op == OP_MUL && !isFloatType(insn->dType)) + ready = score->res.imul; + break; + case OPCLASS_TEXTURE: + ready = score->res.tex; + break; + case OPCLASS_LOAD: + ready = score->res.ld[insn->src(0).getFile()]; + break; + case OPCLASS_STORE: + ready = score->res.st[insn->src(0).getFile()]; + break; + default: + break; + } + if (Target::getOpClass(insn->op) != OPCLASS_TEXTURE) + ready = MAX2(ready, score->res.tex); + + delay = MAX2(delay, ready - cycle); + + // if can issue next cycle, delay is 0, not 1 + return MIN2(delay - 1, NVE4_MAX_ISSUE_DELAY); +} + +void +SchedDataCalculator::commitInsn(const Instruction *insn, int cycle) +{ + const int ready = cycle + targ->getLatency(insn); + + for (int d = 0; insn->defExists(d); ++d) + recordWr(insn->getDef(d), ready); + // WAR & WAW don't seem to matter + // for (int s = 0; insn->srcExists(s); ++s) + // recordRd(insn->getSrc(s), cycle); + + switch (Target::getOpClass(insn->op)) { + case OPCLASS_SFU: + score->res.sfu = cycle + 4; + break; + case OPCLASS_ARITH: + if (insn->op == OP_MUL && !isFloatType(insn->dType)) + score->res.imul = cycle + 4; + break; + case OPCLASS_TEXTURE: + score->res.tex = cycle + 18; + break; + case OPCLASS_LOAD: + if (insn->src(0).getFile() == 
FILE_MEMORY_CONST) + break; + score->res.ld[insn->src(0).getFile()] = cycle + 4; + score->res.st[insn->src(0).getFile()] = ready; + break; + case OPCLASS_STORE: + score->res.st[insn->src(0).getFile()] = cycle + 4; + score->res.ld[insn->src(0).getFile()] = ready; + break; + case OPCLASS_OTHER: + if (insn->op == OP_TEXBAR) + score->res.tex = cycle; + break; + default: + break; + } + +#ifdef NVC0_DEBUG_SCHED_DATA + score->print(cycle); +#endif +} + +void +SchedDataCalculator::checkRd(const Value *v, int cycle, int& delay) const +{ + int ready = cycle; + int a, b; + + switch (v->reg.file) { + case FILE_GPR: + a = v->reg.data.id; + b = a + v->reg.size / 4; + for (int r = a; r < b; ++r) + ready = MAX2(ready, score->rd.r[r]); + break; + case FILE_PREDICATE: + ready = MAX2(ready, score->rd.p[v->reg.data.id]); + break; + case FILE_FLAGS: + ready = MAX2(ready, score->rd.c); + break; + case FILE_SHADER_INPUT: + case FILE_SHADER_OUTPUT: // yes, TCPs can read outputs + case FILE_MEMORY_LOCAL: + case FILE_MEMORY_CONST: + case FILE_MEMORY_SHARED: + case FILE_MEMORY_GLOBAL: + case FILE_SYSTEM_VALUE: + // TODO: any restrictions here ? 
+ break; + case FILE_IMMEDIATE: + break; + default: + assert(0); + break; + } + if (cycle < ready) + delay = MAX2(delay, ready - cycle); +} + +void +SchedDataCalculator::checkWr(const Value *v, int cycle, int& delay) const +{ + int ready = cycle; + int a, b; + + switch (v->reg.file) { + case FILE_GPR: + a = v->reg.data.id; + b = a + v->reg.size / 4; + for (int r = a; r < b; ++r) + ready = MAX2(ready, score->wr.r[r]); + break; + case FILE_PREDICATE: + ready = MAX2(ready, score->wr.p[v->reg.data.id]); + break; + default: + assert(v->reg.file == FILE_FLAGS); + ready = MAX2(ready, score->wr.c); + break; + } + if (cycle < ready) + delay = MAX2(delay, ready - cycle); +} + +void +SchedDataCalculator::recordWr(const Value *v, const int ready) +{ + int a = v->reg.data.id; + + if (v->reg.file == FILE_GPR) { + int b = a + v->reg.size / 4; + for (int r = a; r < b; ++r) + score->rd.r[r] = ready; + } else + // $c, $pX: shorter issue-to-read delay (at least as exec pred and carry) + if (v->reg.file == FILE_PREDICATE) { + score->rd.p[a] = ready + 4; + } else { + assert(v->reg.file == FILE_FLAGS); + score->rd.c = ready + 4; + } +} + +void +SchedDataCalculator::recordRd(const Value *v, const int ready) +{ + int a = v->reg.data.id; + + if (v->reg.file == FILE_GPR) { + int b = a + v->reg.size / 4; + for (int r = a; r < b; ++r) + score->wr.r[r] = ready; + } else + if (v->reg.file == FILE_PREDICATE) { + score->wr.p[a] = ready; + } else + if (v->reg.file == FILE_FLAGS) { + score->wr.c = ready; + } +} + +void +CodeEmitterNVC0::prepareEmission(Function *func) +{ + const Target *targ = func->getProgram()->getTarget(); + + CodeEmitter::prepareEmission(func); + + if (targ->hasSWSched) { + SchedDataCalculator sched(targ); + sched.run(func, true, true); + } +} + +CodeEmitterNVC0::CodeEmitterNVC0(const TargetNVC0 *target) + : CodeEmitter(target), + writeIssueDelays(target->hasSWSched) { code = NULL; codeSize = codeSizeLimit = 0; diff --git 
a/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp index 93af23faac0..ffa40dd93fc 100644 --- a/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp +++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp @@ -29,7 +29,7 @@ Target *getTargetNVC0(unsigned int chipset) return new TargetNVC0(chipset); } -TargetNVC0::TargetNVC0(unsigned int card) +TargetNVC0::TargetNVC0(unsigned int card) : Target(false, card >= 0xe4) { chipset = card; initOpInfo(); @@ -274,8 +274,6 @@ void TargetNVC0::initOpInfo() OP_QUADON, OP_QUADPOP, OP_TEXBAR }; - joinAnterior = false; - for (i = 0; i < DATA_FILE_COUNT; ++i) nativeFileMap[i] = (DataFile)i; nativeFileMap[FILE_ADDRESS] = FILE_GPR; @@ -534,14 +532,39 @@ TargetNVC0::isPostMultiplySupported(operation op, float f, int& e) const } // TODO: better values +// this could be more precise, e.g. depending on the issue-to-read/write delay +// of the depending instruction, but it's good enough int TargetNVC0::getLatency(const Instruction *i) const { - if (i->op == OP_LOAD) { - if (i->cache == CACHE_CV) - return 700; - return 48; + if (chipset >= 0xe4) { + if (i->dType == TYPE_F64 || i->sType == TYPE_F64) + return 20; + switch (i->op) { + case OP_LINTERP: + case OP_PINTERP: + return 15; + case OP_LOAD: + if (i->src(0).getFile() == FILE_MEMORY_CONST) + return 9; + // fall through + case OP_VFETCH: + return 24; + default: + if (Target::getOpClass(i->op) == OPCLASS_TEXTURE) + return 17; + if (i->op == OP_MUL && i->dType != TYPE_F32) + return 15; + return 9; + } + } else { + if (i->op == OP_LOAD) { + if (i->cache == CACHE_CV) + return 700; + return 48; + } + return 24; } - return 24; + return 32; } // These are "inverse" throughput values, i.e. 
the number of cycles required @@ -613,4 +636,42 @@ int TargetNVC0::getThroughput(const Instruction *i) const } } +bool TargetNVC0::canDualIssue(const Instruction *a, const Instruction *b) const +{ + const OpClass clA = operationClass[a->op]; + const OpClass clB = operationClass[b->op]; + + if (getChipset() >= 0xe4) { + // not texturing + // not if the 2nd instruction isn't necessarily executed + if (clA == OPCLASS_TEXTURE || clA == OPCLASS_FLOW) + return false; + // anything with MOV + if (a->op == OP_MOV || b->op == OP_MOV) + return true; + if (clA == clB) { + // only F32 arith or integer additions + if (clA != OPCLASS_ARITH) + return false; + return (a->dType == TYPE_F32 || a->op == OP_ADD || + b->dType == TYPE_F32 || b->op == OP_ADD); + } + // nothing with TEXBAR + if (a->op == OP_TEXBAR || b->op == OP_TEXBAR) + return false; + // no loads and stores accessing the same space + if ((clA == OPCLASS_LOAD && clB == OPCLASS_STORE) || + (clB == OPCLASS_LOAD && clA == OPCLASS_STORE)) + if (a->src(0).getFile() == b->src(0).getFile()) + return false; + // no > 32-bit ops + if (typeSizeof(a->dType) > 4 || typeSizeof(b->dType) > 4 || + typeSizeof(a->sType) > 4 || typeSizeof(b->sType) > 4) + return false; + return true; + } else { + return false; // info not needed (yet) + } +} + } // namespace nv50_ir diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.h b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.h index e4efe476de0..d859388dfdf 100644 --- a/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.h +++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.h @@ -51,6 +51,7 @@ public: virtual bool isPostMultiplySupported(operation, float, int& e) const; virtual bool mayPredicate(const Instruction *, const Value *) const; + virtual bool canDualIssue(const Instruction *, const Instruction *) const; virtual int getLatency(const Instruction *) const; virtual int getThroughput(const Instruction *) const; -- 2.30.2