From d1965af22045d2a62b1cd1bc473b836413d79b46 Mon Sep 17 00:00:00 2001 From: Gabe Black Date: Fri, 4 Jan 2013 19:00:44 -0600 Subject: [PATCH] X86: Move address based decode caching in front of the predecoder. The predecoder in x86 does a lot of work, most of which can be skipped if the decoder cache is put in front of it. Committed by: Nilay Vaish --- src/arch/x86/decoder.cc | 157 +++++++++++++++++++++++++++++++--------- src/arch/x86/decoder.hh | 137 +++++++++++++++++++++++++++-------- src/arch/x86/isa.cc | 21 ++++-- src/arch/x86/isa.hh | 3 +- 4 files changed, 246 insertions(+), 72 deletions(-) diff --git a/src/arch/x86/decoder.cc b/src/arch/x86/decoder.cc index 5fb2dcaf4..9dcb02902 100644 --- a/src/arch/x86/decoder.cc +++ b/src/arch/x86/decoder.cc @@ -38,10 +38,15 @@ namespace X86ISA { -void Decoder::doReset() + +Decoder::State +Decoder::doResetState() { origPC = basePC + offset; DPRINTF(Decoder, "Setting origPC to %#x\n", origPC); + instBytes = &decodePages->lookup(origPC); + chunkIdx = 0; + emi.rex = 0; emi.legacy = 0; emi.opcode.num = 0; @@ -55,12 +60,17 @@ void Decoder::doReset() emi.modRM = 0; emi.sib = 0; - m5Reg = tc->readMiscRegNoEffect(MISCREG_M5_REG); - emi.mode.mode = m5Reg.mode; - emi.mode.submode = m5Reg.submode; + + if (instBytes->si) { + return FromCacheState; + } else { + instBytes->chunks.clear(); + return PrefixState; + } } -void Decoder::process() +void +Decoder::process() { //This function drives the decoder state machine. @@ -70,15 +80,18 @@ void Decoder::process() assert(!outOfBytes); assert(!instDone); + if (state == ResetState) + state = doResetState(); + if (state == FromCacheState) { + state = doFromCacheState(); + } else { + instBytes->chunks.push_back(fetchChunk); + } + //While there's still something to do... 
- while(!instDone && !outOfBytes) - { + while (!instDone && !outOfBytes) { uint8_t nextByte = getNextByte(); - switch(state) - { - case ResetState: - doReset(); - state = PrefixState; + switch (state) { case PrefixState: state = doPrefixState(nextByte); break; @@ -105,9 +118,42 @@ void Decoder::process() } } +Decoder::State +Decoder::doFromCacheState() +{ + DPRINTF(Decoder, "Looking at cache state.\n"); + if ((fetchChunk & instBytes->masks[chunkIdx]) != + instBytes->chunks[chunkIdx]) { + DPRINTF(Decoder, "Decode cache miss.\n"); + // The cached chunks didn't match what was fetched. Fall back to the + // predecoder. + instBytes->chunks[chunkIdx] = fetchChunk; + instBytes->chunks.resize(chunkIdx + 1); + instBytes->si = NULL; + chunkIdx = 0; + fetchChunk = instBytes->chunks[0]; + offset = origPC % sizeof(MachInst); + basePC = origPC - offset; + return PrefixState; + } else if (chunkIdx == instBytes->chunks.size() - 1) { + // We matched the cache, so use its value. + instDone = true; + offset = instBytes->lastOffset; + if (offset == sizeof(MachInst)) + outOfBytes = true; + return ResetState; + } else { + // We matched so far, but need to check more chunks. + chunkIdx++; + outOfBytes = true; + return FromCacheState; + } +} + //Either get a prefix and record it in the ExtMachInst, or send the //state machine on to get the opcode(s). -Decoder::State Decoder::doPrefixState(uint8_t nextByte) +Decoder::State +Decoder::doPrefixState(uint8_t nextByte) { uint8_t prefix = Prefixes[nextByte]; State nextState = PrefixState; @@ -164,7 +210,8 @@ Decoder::State Decoder::doPrefixState(uint8_t nextByte) //Load all the opcodes (currently up to 2) and then figure out //what immediate and/or ModRM is needed. 
-Decoder::State Decoder::doOpcodeState(uint8_t nextByte) +Decoder::State +Decoder::doOpcodeState(uint8_t nextByte) { State nextState = ErrorState; emi.opcode.num++; @@ -194,9 +241,9 @@ Decoder::State Decoder::doOpcodeState(uint8_t nextByte) if (emi.rex.w) logOpSize = 3; // 64 bit operand size else if (emi.legacy.op) - logOpSize = m5Reg.altOp; + logOpSize = altOp; else - logOpSize = m5Reg.defOp; + logOpSize = defOp; //Set the actual op size emi.opSize = 1 << logOpSize; @@ -205,16 +252,16 @@ Decoder::State Decoder::doOpcodeState(uint8_t nextByte) //a fixed value at the decoder level. int logAddrSize; if(emi.legacy.addr) - logAddrSize = m5Reg.altAddr; + logAddrSize = altAddr; else - logAddrSize = m5Reg.defAddr; + logAddrSize = defAddr; //Set the actual address size emi.addrSize = 1 << logAddrSize; //Figure out the effective stack width. This can be overriden to //a fixed value at the decoder level. - emi.stackSize = 1 << m5Reg.stack; + emi.stackSize = 1 << stack; //Figure out how big of an immediate we'll retreive based //on the opcode. @@ -242,13 +289,14 @@ Decoder::State Decoder::doOpcodeState(uint8_t nextByte) //Get the ModRM byte and determine what displacement, if any, there is. //Also determine whether or not to get the SIB byte, displacement, or //immediate next. -Decoder::State Decoder::doModRMState(uint8_t nextByte) +Decoder::State +Decoder::doModRMState(uint8_t nextByte) { State nextState = ErrorState; ModRM modRM; modRM = nextByte; DPRINTF(Decoder, "Found modrm byte %#x.\n", nextByte); - if (m5Reg.defOp == 1) { + if (defOp == 1) { //figure out 16 bit displacement size if ((modRM.mod == 0 && modRM.rm == 6) || modRM.mod == 2) displacementSize = 2; @@ -297,7 +345,8 @@ Decoder::State Decoder::doModRMState(uint8_t nextByte) //Get the SIB byte. We don't do anything with it at this point, other //than storing it in the ExtMachInst. Determine if we need to get a //displacement or immediate next. 
-Decoder::State Decoder::doSIBState(uint8_t nextByte) +Decoder::State +Decoder::doSIBState(uint8_t nextByte) { State nextState = ErrorState; emi.sib = nextByte; @@ -318,7 +367,8 @@ Decoder::State Decoder::doSIBState(uint8_t nextByte) //Gather up the displacement, or at least as much of it //as we can get. -Decoder::State Decoder::doDisplacementState() +Decoder::State +Decoder::doDisplacementState() { State nextState = ErrorState; @@ -365,7 +415,8 @@ Decoder::State Decoder::doDisplacementState() //Gather up the immediate, or at least as much of it //as we can get -Decoder::State Decoder::doImmediateState() +Decoder::State +Decoder::doImmediateState() { State nextState = ErrorState; @@ -408,24 +459,62 @@ Decoder::State Decoder::doImmediateState() return nextState; } -DecodeCache::InstMap Decoder::instMap; -DecodeCache::AddrMap<StaticInstPtr> Decoder::decodePages; +Decoder::InstBytes Decoder::dummy; +Decoder::InstCacheMap Decoder::instCacheMap; StaticInstPtr Decoder::decode(ExtMachInst mach_inst, Addr addr) { - StaticInstPtr &si = decodePages.lookup(addr); - if (si && (si->machInst == mach_inst)) - return si; + DecodeCache::InstMap::iterator iter = instMap->find(mach_inst); + if (iter != instMap->end()) + return iter->second; + + StaticInstPtr si = decodeInst(mach_inst); + (*instMap)[mach_inst] = si; + return si; +} + +StaticInstPtr +Decoder::decode(PCState &nextPC) +{ + if (!instDone) + return NULL; + instDone = false; + updateNPC(nextPC); - DecodeCache::InstMap::iterator iter = instMap.find(mach_inst); - if (iter != instMap.end()) { - si = iter->second; + StaticInstPtr &si = instBytes->si; + if (si) return si; + + // We didn't match in the AddrMap, but we still populated an entry. Fix + // up its byte masks. 
+ const int chunkSize = sizeof(MachInst); + + instBytes->lastOffset = offset; + + Addr firstBasePC = basePC - (instBytes->chunks.size() - 1) * chunkSize; + Addr firstOffset = origPC - firstBasePC; + Addr totalSize = instBytes->lastOffset - firstOffset + + (instBytes->chunks.size() - 1) * chunkSize; + int start = firstOffset; + instBytes->masks.clear(); + + while (totalSize) { + int end = start + totalSize; + end = (chunkSize < end) ? chunkSize : end; + int size = end - start; + int idx = instBytes->masks.size(); + + MachInst maskVal = mask(size * 8) << (start * 8); + assert(maskVal); + + instBytes->masks.push_back(maskVal); + instBytes->chunks[idx] &= instBytes->masks[idx]; + totalSize -= size; + start = 0; } - si = decodeInst(mach_inst); - instMap[mach_inst] = si; + si = decode(emi, origPC); return si; } diff --git a/src/arch/x86/decoder.hh b/src/arch/x86/decoder.hh index 24194d839..796f9eef9 100644 --- a/src/arch/x86/decoder.hh +++ b/src/arch/x86/decoder.hh @@ -32,6 +32,7 @@ #define __ARCH_X86_DECODER_HH__ #include <cassert> +#include <vector> #include "arch/x86/regs/misc.hh" #include "arch/x86/types.hh" @@ -58,9 +59,24 @@ class Decoder static const uint8_t SizeTypeToSize[3][10]; protected: + struct InstBytes + { + StaticInstPtr si; + std::vector<MachInst> chunks; + std::vector<MachInst> masks; + int lastOffset; + + InstBytes() : lastOffset(0) + {} + }; + + static InstBytes dummy; + ThreadContext * tc; //The bytes to be predecoded MachInst fetchChunk; + InstBytes *instBytes; + int chunkIdx; //The pc of the start of fetchChunk Addr basePC; //The pc the current instruction started at @@ -69,9 +85,16 @@ class Decoder int offset; //The extended machine instruction being generated ExtMachInst emi; - HandyM5Reg m5Reg; - - inline uint8_t getNextByte() + //Predecoding state + X86Mode mode; + X86SubMode submode; + uint8_t altOp; + uint8_t defOp; + uint8_t altAddr; + uint8_t defAddr; + uint8_t stack; + + uint8_t getNextByte() { return ((uint8_t *)&fetchChunk)[offset]; } @@ -99,23 +122,34 @@ class Decoder 
consumeBytes(toGet); } - inline void consumeByte() + void updateOffsetState() { - offset++; assert(offset <= sizeof(MachInst)); - if(offset == sizeof(MachInst)) - outOfBytes = true; + if (offset == sizeof(MachInst)) { + DPRINTF(Decoder, "At the end of a chunk, idx = %d, chunks = %d.\n", + chunkIdx, instBytes->chunks.size()); + chunkIdx++; + if (chunkIdx == instBytes->chunks.size()) { + outOfBytes = true; + } else { + offset = 0; + fetchChunk = instBytes->chunks[chunkIdx]; + basePC += sizeof(MachInst); + } + } } - inline void consumeBytes(int numBytes) + void consumeByte() { - offset += numBytes; - assert(offset <= sizeof(MachInst)); - if(offset == sizeof(MachInst)) - outOfBytes = true; + offset++; + updateOffsetState(); } - void doReset(); + void consumeBytes(int numBytes) + { + offset += numBytes; + updateOffsetState(); + } //State machine state protected: @@ -133,6 +167,7 @@ class Decoder enum State { ResetState, + FromCacheState, PrefixState, OpcodeState, ModRMState, @@ -146,6 +181,8 @@ class Decoder State state; //Functions to handle each of the states + State doResetState(); + State doFromCacheState(); State doPrefixState(uint8_t); State doOpcodeState(uint8_t); State doModRMState(uint8_t); @@ -153,6 +190,20 @@ class Decoder State doDisplacementState(); State doImmediateState(); + protected: + /// Caching for decoded instruction objects. 
+ + typedef MiscReg CacheKey; + + typedef DecodeCache::AddrMap<Decoder::InstBytes> DecodePages; + DecodePages *decodePages; + typedef m5::hash_map<CacheKey, DecodePages *> AddrCacheMap; + AddrCacheMap addrCacheMap; + + DecodeCache::InstMap *instMap; + typedef m5::hash_map<CacheKey, DecodeCache::InstMap *> InstCacheMap; + static InstCacheMap instCacheMap; + public: Decoder(ThreadContext * _tc) : tc(_tc), basePC(0), origPC(0), offset(0), @@ -160,9 +211,47 @@ class Decoder state(ResetState) { memset(&emi, 0, sizeof(emi)); - emi.mode.mode = LongMode; - emi.mode.submode = SixtyFourBitMode; - m5Reg = 0; + mode = LongMode; + submode = SixtyFourBitMode; + emi.mode.mode = mode; + emi.mode.submode = submode; + altOp = 0; + defOp = 0; + altAddr = 0; + defAddr = 0; + stack = 0; + instBytes = &dummy; + decodePages = NULL; + instMap = NULL; + } + + void setM5Reg(HandyM5Reg m5Reg) + { + mode = (X86Mode)(uint64_t)m5Reg.mode; + submode = (X86SubMode)(uint64_t)m5Reg.submode; + emi.mode.mode = mode; + emi.mode.submode = submode; + altOp = m5Reg.altOp; + defOp = m5Reg.defOp; + altAddr = m5Reg.altAddr; + defAddr = m5Reg.defAddr; + stack = m5Reg.stack; + + AddrCacheMap::iterator amIter = addrCacheMap.find(m5Reg); + if (amIter != addrCacheMap.end()) { + decodePages = amIter->second; + } else { + decodePages = new DecodePages; + addrCacheMap[m5Reg] = decodePages; + } + + InstCacheMap::iterator imIter = instCacheMap.find(m5Reg); + if (imIter != instCacheMap.end()) { + instMap = imIter->second; + } else { + instMap = new DecodeCache::InstMap; + instCacheMap[m5Reg] = instMap; + } } void reset() @@ -218,11 +307,6 @@ class Decoder } } - protected: - /// Caching for decoded instruction objects. - static DecodeCache::InstMap instMap; - static DecodeCache::AddrMap<StaticInstPtr> decodePages; - public: StaticInstPtr decodeInst(ExtMachInst mach_inst); @@ -230,16 +314,7 @@ class Decoder /// @param mach_inst The binary instruction to decode. /// @retval A pointer to the corresponding StaticInst object. 
StaticInstPtr decode(ExtMachInst mach_inst, Addr addr); - - StaticInstPtr - decode(X86ISA::PCState &nextPC) - { - if (!instDone) - return NULL; - instDone = false; - updateNPC(nextPC); - return decode(emi, origPC); - } + StaticInstPtr decode(X86ISA::PCState &nextPC); }; } // namespace X86ISA diff --git a/src/arch/x86/isa.cc b/src/arch/x86/isa.cc index 5305b1058..1a9b39840 100644 --- a/src/arch/x86/isa.cc +++ b/src/arch/x86/isa.cc @@ -28,6 +28,7 @@ * Authors: Gabe Black */ +#include "arch/x86/decoder.hh" #include "arch/x86/isa.hh" #include "arch/x86/tlb.hh" #include "cpu/base.hh" @@ -39,7 +40,8 @@ namespace X86ISA void ISA::updateHandyM5Reg(Efer efer, CR0 cr0, - SegAttr csAttr, SegAttr ssAttr, RFLAGS rflags) + SegAttr csAttr, SegAttr ssAttr, RFLAGS rflags, + ThreadContext *tc) { HandyM5Reg m5reg = 0; if (efer.lma) { @@ -94,6 +96,8 @@ ISA::updateHandyM5Reg(Efer efer, CR0 cr0, } regVal[MISCREG_M5_REG] = m5reg; + if (tc) + tc->getDecoderPtr()->setM5Reg(m5reg); } void @@ -184,7 +188,8 @@ ISA::setMiscReg(int miscReg, MiscReg val, ThreadContext * tc) newCR0, regVal[MISCREG_CS_ATTR], regVal[MISCREG_SS_ATTR], - regVal[MISCREG_RFLAGS]); + regVal[MISCREG_RFLAGS], + tc); } break; case MISCREG_CR2: @@ -225,7 +230,8 @@ ISA::setMiscReg(int miscReg, MiscReg val, ThreadContext * tc) regVal[MISCREG_CR0], newCSAttr, regVal[MISCREG_SS_ATTR], - regVal[MISCREG_RFLAGS]); + regVal[MISCREG_RFLAGS], + tc); } break; case MISCREG_SS_ATTR: @@ -233,7 +239,8 @@ ISA::setMiscReg(int miscReg, MiscReg val, ThreadContext * tc) regVal[MISCREG_CR0], regVal[MISCREG_CS_ATTR], val, - regVal[MISCREG_RFLAGS]); + regVal[MISCREG_RFLAGS], + tc); break; // These segments always actually use their bases, or in other words // their effective bases must stay equal to their actual bases. 
@@ -340,7 +347,8 @@ ISA::setMiscReg(int miscReg, MiscReg val, ThreadContext * tc) regVal[MISCREG_CR0], regVal[MISCREG_CS_ATTR], regVal[MISCREG_SS_ATTR], - regVal[MISCREG_RFLAGS]); + regVal[MISCREG_RFLAGS], + tc); return; default: break; @@ -363,7 +371,8 @@ ISA::unserialize(EventManager *em, Checkpoint * cp, regVal[MISCREG_CR0], regVal[MISCREG_CS_ATTR], regVal[MISCREG_SS_ATTR], - regVal[MISCREG_RFLAGS]); + regVal[MISCREG_RFLAGS], + NULL); } } diff --git a/src/arch/x86/isa.hh b/src/arch/x86/isa.hh index 463a249a4..7b0c7b61a 100644 --- a/src/arch/x86/isa.hh +++ b/src/arch/x86/isa.hh @@ -50,7 +50,8 @@ namespace X86ISA protected: MiscReg regVal[NUM_MISCREGS]; void updateHandyM5Reg(Efer efer, CR0 cr0, - SegAttr csAttr, SegAttr ssAttr, RFLAGS rflags); + SegAttr csAttr, SegAttr ssAttr, RFLAGS rflags, + ThreadContext *tc); public: void clear(); -- 2.30.2