From 8a53da22c2f07aed924a45ab296f7468d842d7f6 Mon Sep 17 00:00:00 2001 From: Anthony Gutierrez Date: Fri, 15 Nov 2013 13:21:15 -0500 Subject: [PATCH] cpu: allow the fetch buffer to be smaller than a cache line the current implementation of the fetch buffer in the o3 cpu is only allowed to be the size of a cache line. some architectures, e.g., ARM, have fetch buffers smaller than a cache line, see slide 22 at: http://www.arm.com/files/pdf/at-exploring_the_design_of_the_cortex-a15.pdf this patch allows the fetch buffer to be set to values smaller than a cache line. --- configs/common/O3_ARM_v7a.py | 1 + src/SConscript | 4 +- src/cpu/o3/O3CPU.py | 1 + src/cpu/o3/fetch.hh | 31 +++++++----- src/cpu/o3/fetch_impl.hh | 92 ++++++++++++++++++++---------------- 5 files changed, 73 insertions(+), 56 deletions(-) diff --git a/configs/common/O3_ARM_v7a.py b/configs/common/O3_ARM_v7a.py index 10d466419..aedafaf4d 100644 --- a/configs/common/O3_ARM_v7a.py +++ b/configs/common/O3_ARM_v7a.py @@ -119,6 +119,7 @@ class O3_ARM_v7a_3(DerivO3CPU): commitToRenameDelay = 1 commitToIEWDelay = 1 fetchWidth = 3 + fetchBufferSize = 16 fetchToDecodeDelay = 3 decodeWidth = 3 decodeToRenameDelay = 2 diff --git a/src/SConscript b/src/SConscript index 133541795..398c342ec 100755 --- a/src/SConscript +++ b/src/SConscript @@ -148,7 +148,7 @@ class SourceFile(object): def __ge__(self, other): return self.filename >= other.filename def __eq__(self, other): return self.filename == other.filename def __ne__(self, other): return self.filename != other.filename - + class Source(SourceFile): '''Add a c/c++ source file to the build''' def __init__(self, source, Werror=True, swig=False, **guards): @@ -164,7 +164,7 @@ class PySource(SourceFile): modules = {} tnodes = {} symnames = {} - + def __init__(self, package, source, **guards): '''specify the python package, the source file, and any guards''' super(PySource, self).__init__(source, **guards) diff --git a/src/cpu/o3/O3CPU.py b/src/cpu/o3/O3CPU.py index 
044ee9d59..a6094e47c 100644 --- a/src/cpu/o3/O3CPU.py +++ b/src/cpu/o3/O3CPU.py @@ -60,6 +60,7 @@ class DerivO3CPU(BaseCPU): "delay") commitToFetchDelay = Param.Cycles(1, "Commit to fetch delay") fetchWidth = Param.Unsigned(8, "Fetch width") + fetchBufferSize = Param.Unsigned(64, "Fetch buffer size in bytes") renameToDecodeDelay = Param.Cycles(1, "Rename to decode delay") iewToDecodeDelay = Param.Cycles(1, "Issue/Execute/Writeback to decode " diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index 35f58ff74..6ef604af3 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -274,9 +274,9 @@ class DefaultFetch bool lookupAndUpdateNextPC(DynInstPtr &inst, TheISA::PCState &pc); /** - * Fetches the cache line that contains fetch_PC. Returns any + * Fetches the cache line that contains the fetch PC. Returns any * fault that happened. Puts the data into the class variable - * cacheData. + * fetchBuffer, which may not hold the entire fetched cache line. * @param vaddr The memory address that is being fetched from. * @param ret_fault The fault reference that will be set to the result of * the icache access. @@ -339,10 +339,10 @@ class DefaultFetch */ void fetch(bool &status_change); - /** Align a PC to the start of an I-cache block. */ - Addr icacheBlockAlignPC(Addr addr) + /** Align a PC to the start of a fetch buffer block. */ + Addr fetchBufferAlignPC(Addr addr) { - return (addr & ~(cacheBlkMask)); + return (addr & ~(fetchBufferMask)); } /** The decoder. */ @@ -463,17 +463,22 @@ class DefaultFetch /** Cache block size. */ unsigned int cacheBlkSize; - /** Mask to get a cache block's address. */ - Addr cacheBlkMask; + /** The size of the fetch buffer in bytes. The fetch buffer + * itself may be smaller than a cache line. + */ + unsigned fetchBufferSize; + + /** Mask to align a fetch address to a fetch buffer boundary. */ + Addr fetchBufferMask; - /** The cache line being fetched. 
*/ - uint8_t *cacheData[Impl::MaxThreads]; + /** The fetch data that is being fetched and buffered. */ + uint8_t *fetchBuffer[Impl::MaxThreads]; - /** The PC of the cacheline that has been loaded. */ - Addr cacheDataPC[Impl::MaxThreads]; + /** The PC of the first instruction loaded into the fetch buffer. */ + Addr fetchBufferPC[Impl::MaxThreads]; - /** Whether or not the cache data is valid. */ - bool cacheDataValid[Impl::MaxThreads]; + /** Whether or not the fetch buffer data is valid. */ + bool fetchBufferValid[Impl::MaxThreads]; /** Size of instructions. */ int instSize; diff --git a/src/cpu/o3/fetch_impl.hh b/src/cpu/o3/fetch_impl.hh index d97c5ba36..b35dd80f3 100644 --- a/src/cpu/o3/fetch_impl.hh +++ b/src/cpu/o3/fetch_impl.hh @@ -85,7 +85,8 @@ DefaultFetch<Impl>::DefaultFetch(O3CPU *_cpu, DerivO3CPUParams *params) retryPkt(NULL), retryTid(InvalidThreadID), cacheBlkSize(cpu->cacheLineSize()), - cacheBlkMask(cacheBlkSize - 1), + fetchBufferSize(params->fetchBufferSize), + fetchBufferMask(fetchBufferSize - 1), numThreads(params->numThreads), numFetchingThreads(params->smtNumFetchingThreads), finishTranslationEvent(this) @@ -98,6 +99,12 @@ DefaultFetch<Impl>::DefaultFetch(O3CPU *_cpu, DerivO3CPUParams *params) fatal("fetchWidth (%d) is larger than compiled limit (%d),\n" "\tincrease MaxWidth in src/cpu/o3/impl.hh\n", fetchWidth, static_cast<int>(Impl::MaxWidth)); + if (fetchBufferSize > cacheBlkSize) + fatal("fetch buffer size (%u bytes) is greater than the cache " + "block size (%u bytes)\n", fetchBufferSize, cacheBlkSize); + if (cacheBlkSize % fetchBufferSize) + fatal("cache block (%u bytes) is not a multiple of the " + "fetch buffer (%u bytes)\n", cacheBlkSize, fetchBufferSize); std::string policy = params->smtFetchPolicy; @@ -131,16 +138,19 @@ DefaultFetch<Impl>::DefaultFetch(O3CPU *_cpu, DerivO3CPUParams *params) instSize = sizeof(TheISA::MachInst); for (int i = 0; i < Impl::MaxThreads; i++) { - decoder[i] = new TheISA::Decoder; + decoder[i] = NULL; + fetchBuffer[i] = NULL; + 
fetchBufferPC[i] = 0; + fetchBufferValid[i] = false; } branchPred = params->branchPred; for (ThreadID tid = 0; tid < numThreads; tid++) { - // Create space to store a cache line. - cacheData[tid] = new uint8_t[cacheBlkSize]; - cacheDataPC[tid] = 0; - cacheDataValid[tid] = false; + decoder[tid] = new TheISA::Decoder; + // Create space to buffer the cache line data, + // which may not hold the entire cache line. + fetchBuffer[tid] = new uint8_t[fetchBufferSize]; } } @@ -327,7 +337,7 @@ DefaultFetch<Impl>::resetStage() priorityList.clear(); // Setup PC and nextPC with initial state. - for (ThreadID tid = 0; tid < numThreads; tid++) { + for (ThreadID tid = 0; tid < numThreads; ++tid) { fetchStatus[tid] = Running; pc[tid] = cpu->pcState(tid); fetchOffset[tid] = 0; @@ -342,16 +352,14 @@ DefaultFetch<Impl>::resetStage() stalls[tid].commit = false; stalls[tid].drain = false; + fetchBufferPC[tid] = 0; + fetchBufferValid[tid] = false; + priorityList.push_back(tid); } wroteToTimeBuffer = false; _status = Inactive; - - for (ThreadID tid = 0; tid < numThreads; tid++) { - cacheDataPC[tid] = 0; - cacheDataValid[tid] = false; - } } template <class Impl> @@ -373,8 +381,8 @@ DefaultFetch<Impl>::processCacheCompletion(PacketPtr pkt) return; } - memcpy(cacheData[tid], pkt->getPtr<uint8_t>(), cacheBlkSize); - cacheDataValid[tid] = true; + memcpy(fetchBuffer[tid], pkt->getPtr<uint8_t>(), fetchBufferSize); + fetchBufferValid[tid] = true; // Wake up the CPU (if it went to sleep and was waiting on // this completion event). @@ -573,18 +581,19 @@ DefaultFetch<Impl>::fetchCacheLine(Addr vaddr, ThreadID tid, Addr pc) return false; } - // Align the fetch address so it's at the start of a cache block. - Addr block_PC = icacheBlockAlignPC(vaddr); + // Align the fetch address to the start of a fetch buffer segment. 
+ Addr fetchBufferBlockPC = fetchBufferAlignPC(vaddr); DPRINTF(Fetch, "[tid:%i] Fetching cache line %#x for addr %#x\n", - tid, block_PC, vaddr); + tid, fetchBufferBlockPC, vaddr); // Setup the memReq to do a read of the first instruction's address. // Set the appropriate read size and flags as well. // Build request here. RequestPtr mem_req = - new Request(tid, block_PC, cacheBlkSize, Request::INST_FETCH, - cpu->instMasterId(), pc, cpu->thread[tid]->contextId(), tid); + new Request(tid, fetchBufferBlockPC, fetchBufferSize, + Request::INST_FETCH, cpu->instMasterId(), pc, + cpu->thread[tid]->contextId(), tid); memReq[tid] = mem_req; @@ -601,7 +610,7 @@ void DefaultFetch<Impl>::finishTranslation(Fault fault, RequestPtr mem_req) { ThreadID tid = mem_req->threadId(); - Addr block_PC = mem_req->getVaddr(); + Addr fetchBufferBlockPC = mem_req->getVaddr(); assert(!cpu->switchedOut()); @@ -634,10 +643,10 @@ DefaultFetch<Impl>::finishTranslation(Fault fault, RequestPtr mem_req) // Build packet here. PacketPtr data_pkt = new Packet(mem_req, MemCmd::ReadReq); - data_pkt->dataDynamicArray(new uint8_t[cacheBlkSize]); + data_pkt->dataDynamicArray(new uint8_t[fetchBufferSize]); - cacheDataPC[tid] = block_PC; - cacheDataValid[tid] = false; + fetchBufferPC[tid] = fetchBufferBlockPC; + fetchBufferValid[tid] = false; DPRINTF(Fetch, "Fetch: Doing instruction read.\n"); fetchedCacheLines++; @@ -1154,13 +1163,13 @@ DefaultFetch<Impl>::fetch(bool &status_change) fetchStatus[tid] = Running; status_change = true; } else if (fetchStatus[tid] == Running) { - // Align the fetch PC so its at the start of a cache block. - Addr block_PC = icacheBlockAlignPC(fetchAddr); + // Align the fetch PC so it's at the start of a fetch buffer segment. + Addr fetchBufferBlockPC = fetchBufferAlignPC(fetchAddr); // If buffer is no longer valid or fetchAddr has moved to point // to the next cache block, AND we have no remaining ucode // from a macro-op, then start fetch from icache. 
- if (!(cacheDataValid[tid] && block_PC == cacheDataPC[tid]) + if (!(fetchBufferValid[tid] && fetchBufferBlockPC == fetchBufferPC[tid]) && !inRom && !macroop[tid]) { DPRINTF(Fetch, "[tid:%i]: Attempting to translate and read " "instruction, starting at PC %s.\n", tid, thisPC); @@ -1211,10 +1220,10 @@ DefaultFetch<Impl>::fetch(bool &status_change) bool predictedBranch = false; TheISA::MachInst *cacheInsts = - reinterpret_cast<TheISA::MachInst *>(cacheData[tid]); + reinterpret_cast<TheISA::MachInst *>(fetchBuffer[tid]); - const unsigned numInsts = cacheBlkSize / instSize; - unsigned blkOffset = (fetchAddr - cacheDataPC[tid]) / instSize; + const unsigned numInsts = fetchBufferSize / instSize; + unsigned blkOffset = (fetchAddr - fetchBufferPC[tid]) / instSize; // Loop through instruction memory from the cache. // Keep issuing while fetchWidth is available and branch is not @@ -1227,12 +1236,13 @@ DefaultFetch<Impl>::fetch(bool &status_change) bool needMem = !inRom && !curMacroop && !decoder[tid]->instReady(); fetchAddr = (thisPC.instAddr() + pcOffset) & BaseCPU::PCMask; - Addr block_PC = icacheBlockAlignPC(fetchAddr); + Addr fetchBufferBlockPC = fetchBufferAlignPC(fetchAddr); if (needMem) { // If buffer is no longer valid or fetchAddr has moved to point // to the next cache block then start fetch from icache. 
- if (!cacheDataValid[tid] || block_PC != cacheDataPC[tid]) + if (!fetchBufferValid[tid] || + fetchBufferBlockPC != fetchBufferPC[tid]) break; if (blkOffset >= numInsts) { @@ -1328,7 +1338,7 @@ DefaultFetch<Impl>::fetch(bool &status_change) if (newMacro) { fetchAddr = thisPC.instAddr() & BaseCPU::PCMask; - blkOffset = (fetchAddr - cacheDataPC[tid]) / instSize; + blkOffset = (fetchAddr - fetchBufferPC[tid]) / instSize; pcOffset = 0; curMacroop = NULL; } @@ -1350,9 +1360,9 @@ DefaultFetch<Impl>::fetch(bool &status_change) } else if (numInst >= fetchWidth) { DPRINTF(Fetch, "[tid:%i]: Done fetching, reached fetch bandwidth " "for this cycle.\n", tid); - } else if (blkOffset >= cacheBlkSize) { - DPRINTF(Fetch, "[tid:%i]: Done fetching, reached the end of cache " - "block.\n", tid); + } else if (blkOffset >= fetchBufferSize) { + DPRINTF(Fetch, "[tid:%i]: Done fetching, reached the end of the " + "fetch buffer.\n", tid); } macroop[tid] = curMacroop; @@ -1364,11 +1374,11 @@ DefaultFetch<Impl>::fetch(bool &status_change) pc[tid] = thisPC; - // pipeline a fetch if we're crossing a cache boundary and not in + // pipeline a fetch if we're crossing a fetch buffer boundary and not in // a state that would preclude fetching fetchAddr = (thisPC.instAddr() + pcOffset) & BaseCPU::PCMask; - Addr block_PC = icacheBlockAlignPC(fetchAddr); - issuePipelinedIfetch[tid] = block_PC != cacheDataPC[tid] && + Addr fetchBufferBlockPC = fetchBufferAlignPC(fetchAddr); + issuePipelinedIfetch[tid] = fetchBufferBlockPC != fetchBufferPC[tid] && fetchStatus[tid] != IcacheWaitResponse && fetchStatus[tid] != ItlbWait && fetchStatus[tid] != IcacheWaitRetry && @@ -1575,11 +1585,11 @@ DefaultFetch<Impl>::pipelineIcacheAccesses(ThreadID tid) Addr pcOffset = fetchOffset[tid]; Addr fetchAddr = (thisPC.instAddr() + pcOffset) & BaseCPU::PCMask; - // Align the fetch PC so its at the start of a cache block. - Addr block_PC = icacheBlockAlignPC(fetchAddr); + // Align the fetch PC so it's at the start of a fetch buffer segment. 
+ Addr fetchBufferBlockPC = fetchBufferAlignPC(fetchAddr); // Unless buffer already got the block, fetch it from icache. - if (!(cacheDataValid[tid] && block_PC == cacheDataPC[tid])) { + if (!(fetchBufferValid[tid] && fetchBufferBlockPC == fetchBufferPC[tid])) { DPRINTF(Fetch, "[tid:%i]: Issuing a pipelined I-cache access, " "starting at PC %s.\n", tid, thisPC); -- 2.30.2