From e5415671bd8871b99e754b5db75714a6492c0917 Mon Sep 17 00:00:00 2001 From: Tuan Ta Date: Mon, 2 Apr 2018 15:20:02 -0400 Subject: [PATCH] cpu: fixed how O3 CPU executes an exit system call When a thread executed an exit syscall in SE mode, the thread context was removed immediately in the same cycle, which left in-flight squash operations and the trap event incomplete. The problem surfaced when a new thread was later assigned to the CPU. The new thread started with some incomplete transactions of the previous thread (e.g., squashing). This problem could cause an incorrect execution flow for the new thread (i.e., the pc was not reset properly at the exit point), deadlock (i.e., some stage-to-stage signals were not reset) and an incorrect rename map between logical and physical registers. This patch adds a new state called 'Halting' to the thread context and defers removing the thread context from a CPU until the trap event initiated by the exit syscall has been processed. This patch also makes sure that the removal of a thread context happens only after all in-flight transactions of the to-be-removed thread in the pipeline have completed. 
Change-Id: If7ef1462fb8864e22b45371ee7ae67e2a5ad38b8 Reviewed-on: https://gem5-review.googlesource.com/c/8184 Reviewed-by: Giacomo Gabrielli Maintainer: Jason Lowe-Power --- src/cpu/o3/commit.hh | 3 + src/cpu/o3/commit_impl.hh | 23 ++++++ src/cpu/o3/cpu.cc | 132 ++++++++++++++++++++---------- src/cpu/o3/cpu.hh | 25 ++++++ src/cpu/o3/decode.hh | 4 + src/cpu/o3/decode_impl.hh | 8 ++ src/cpu/o3/fetch.hh | 3 + src/cpu/o3/fetch_impl.hh | 20 +++++ src/cpu/o3/iew.hh | 3 + src/cpu/o3/iew_impl.hh | 13 +++ src/cpu/o3/rename.hh | 3 + src/cpu/o3/rename_impl.hh | 22 +++++ src/cpu/o3/thread_context_impl.hh | 13 ++- src/cpu/thread_context.hh | 4 + src/sim/eventq.hh | 3 + 15 files changed, 232 insertions(+), 47 deletions(-) diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh index 2d7a23e5d..4e32f865d 100644 --- a/src/cpu/o3/commit.hh +++ b/src/cpu/o3/commit.hh @@ -187,6 +187,9 @@ class DefaultCommit /** Initializes stage by sending back the number of free entries. */ void startupStage(); + /** Clear all thread-specific states */ + void clearStates(ThreadID tid); + /** Initializes the draining of commit. 
*/ void drain(); diff --git a/src/cpu/o3/commit_impl.hh b/src/cpu/o3/commit_impl.hh index 879fad292..2891ce331 100644 --- a/src/cpu/o3/commit_impl.hh +++ b/src/cpu/o3/commit_impl.hh @@ -364,6 +364,22 @@ DefaultCommit::startupStage() cpu->activityThisCycle(); } +template +void +DefaultCommit::clearStates(ThreadID tid) +{ + commitStatus[tid] = Idle; + changedROBNumEntries[tid] = false; + checkEmptyROB[tid] = false; + trapInFlight[tid] = false; + committedStores[tid] = false; + trapSquash[tid] = false; + tcSquash[tid] = false; + pc[tid].set(0); + lastCommitedSeqNum[tid] = 0; + squashAfterInst[tid] = NULL; +} + template void DefaultCommit::drain() @@ -813,6 +829,13 @@ DefaultCommit::commit() if (trapSquash[tid]) { assert(!tcSquash[tid]); squashFromTrap(tid); + + // If the thread is trying to exit (i.e., an exit syscall was + // executed), this trapSquash was originated by the exit + // syscall earlier. In this case, schedule an exit event in + // the next cycle to fully terminate this thread + if (cpu->isThreadExiting(tid)) + cpu->scheduleThreadExitEvent(tid); } else if (tcSquash[tid]) { assert(commitStatus[tid] != TrapPending); squashFromTC(tid); diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index f5aa9f712..e50741ec0 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -143,6 +143,8 @@ FullO3CPU::FullO3CPU(DerivO3CPUParams *params) dtb(params->dtb), tickEvent([this]{ tick(); }, "FullO3CPU tick", false, Event::CPU_Tick_Pri), + threadExitEvent([this]{ exitThreads(); }, "FullO3CPU exit threads", + false, Event::CPU_Exit_Pri), #ifndef NDEBUG instcount(0), #endif @@ -810,7 +812,7 @@ void FullO3CPU::haltContext(ThreadID tid) { //For now, this is the same as deallocate - DPRINTF(O3CPU,"[tid:%i]: Halt Context called. Deallocating", tid); + DPRINTF(O3CPU,"[tid:%i]: Halt Context called. 
Deallocating\n", tid); assert(!switchedOut()); deactivateThread(tid); @@ -886,51 +888,20 @@ FullO3CPU::removeThread(ThreadID tid) // here to alleviate the case for double-freeing registers // in SMT workloads. - // Unbind Int Regs from Rename Map - for (RegId reg_id(IntRegClass, 0); reg_id.index() < TheISA::NumIntRegs; - reg_id.index()++) { - PhysRegIdPtr phys_reg = renameMap[tid].lookup(reg_id); - scoreboard.unsetReg(phys_reg); - freeList.addReg(phys_reg); - } - - // Unbind Float Regs from Rename Map - for (RegId reg_id(FloatRegClass, 0); reg_id.index() < TheISA::NumFloatRegs; - reg_id.index()++) { - PhysRegIdPtr phys_reg = renameMap[tid].lookup(reg_id); - scoreboard.unsetReg(phys_reg); - freeList.addReg(phys_reg); - } - - // Unbind Float Regs from Rename Map - for (unsigned preg = 0; preg < TheISA::NumVecPredRegs; preg++) { - PhysRegIdPtr phys_reg = renameMap[tid].lookup( - RegId(VecPredRegClass, preg)); - scoreboard.unsetReg(phys_reg); - freeList.addReg(phys_reg); - } - - // Unbind condition-code Regs from Rename Map - for (RegId reg_id(CCRegClass, 0); reg_id.index() < TheISA::NumCCRegs; - reg_id.index()++) { - PhysRegIdPtr phys_reg = renameMap[tid].lookup(reg_id); - scoreboard.unsetReg(phys_reg); - freeList.addReg(phys_reg); - } - - // Squash Throughout Pipeline - DynInstPtr inst = commit.rob->readHeadInst(tid); - InstSeqNum squash_seq_num = inst->seqNum; - fetch.squash(0, squash_seq_num, inst, tid); - decode.squash(tid); - rename.squash(squash_seq_num, tid); - iew.squash(tid); - iew.ldstQueue.squash(squash_seq_num, tid); - commit.rob->squash(squash_seq_num, tid); - - + // clear all thread-specific states in each stage of the pipeline + // since this thread is going to be completely removed from the CPU + commit.clearStates(tid); + fetch.clearStates(tid); + decode.clearStates(tid); + rename.clearStates(tid); + iew.clearStates(tid); + + // at this step, all instructions in the pipeline should be already + // either committed successfully or squashed. 
All thread-specific + // queues in the pipeline must be empty. assert(iew.instQueue.getCount(tid) == 0); assert(iew.ldstQueue.getCount(tid) == 0); + assert(commit.rob->isEmpty(tid)); // Reset ROB/IQ/LSQ Entries @@ -1884,5 +1855,78 @@ FullO3CPU::updateThreadPriority() } } +template +void +FullO3CPU::addThreadToExitingList(ThreadID tid) +{ + DPRINTF(O3CPU, "Thread %d is inserted to exitingThreads list\n", tid); + + // make sure the thread is Active + assert(std::find(activeThreads.begin(), activeThreads.end(), tid) + != activeThreads.end()); + + // make sure the thread has not been added to the list yet + assert(exitingThreads.count(tid) == 0); + + // add the thread to exitingThreads list to mark that this thread is + // trying to exit. The boolean value in the pair denotes if a thread is + // ready to exit. The thread is not ready to exit until the corresponding + // exit trap event is processed in the future. Until then, it'll be still + // an active thread that is trying to exit. + exitingThreads.emplace(std::make_pair(tid, false)); +} + +template +bool +FullO3CPU::isThreadExiting(ThreadID tid) const +{ + return exitingThreads.count(tid) == 1; +} + +template +void +FullO3CPU::scheduleThreadExitEvent(ThreadID tid) +{ + assert(exitingThreads.count(tid) == 1); + + // exit trap event has been processed. Now, the thread is ready to exit + // and be removed from the CPU. + exitingThreads[tid] = true; + + // we schedule a threadExitEvent in the next cycle to properly clean + // up the thread's states in the pipeline. threadExitEvent has lower + // priority than tickEvent, so the cleanup will happen at the very end + // of the next cycle after all pipeline stages complete their operations. + // We want all stages to complete squashing instructions before doing + // the cleanup. 
+ if (!threadExitEvent.scheduled()) { + schedule(threadExitEvent, nextCycle()); + } +} + +template +void +FullO3CPU::exitThreads() +{ + // there must be at least one thread trying to exit + assert(exitingThreads.size() > 0); + + // terminate all threads that are ready to exit + auto it = exitingThreads.begin(); + while (it != exitingThreads.end()) { + ThreadID thread_id = it->first; + bool readyToExit = it->second; + + if (readyToExit) { + DPRINTF(O3CPU, "Exiting thread %d\n", thread_id); + haltContext(thread_id); + tcBase(thread_id)->setStatus(ThreadContext::Halted); + it = exitingThreads.erase(it); + } else { + it++; + } + } +} + // Forward declaration of FullO3CPU. template class FullO3CPU; diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh index aabac5fea..ec6be657a 100644 --- a/src/cpu/o3/cpu.hh +++ b/src/cpu/o3/cpu.hh @@ -205,6 +205,9 @@ class FullO3CPU : public BaseO3CPU /** The tick event used for scheduling CPU ticks. */ EventFunctionWrapper tickEvent; + /** The exit event used for terminating all ready-to-exit threads */ + EventFunctionWrapper threadExitEvent; + /** Schedule tick event, regardless of its current state. */ void scheduleTickEvent(Cycles delay) { @@ -331,6 +334,21 @@ class FullO3CPU : public BaseO3CPU void serializeThread(CheckpointOut &cp, ThreadID tid) const override; void unserializeThread(CheckpointIn &cp, ThreadID tid) override; + /** Insert tid to the list of threads trying to exit */ + void addThreadToExitingList(ThreadID tid); + + /** Is the thread trying to exit? */ + bool isThreadExiting(ThreadID tid) const; + + /** + * If a thread is trying to exit and its corresponding trap event + * has been completed, schedule an event to terminate the thread. + */ + void scheduleThreadExitEvent(ThreadID tid); + + /** Terminate all threads that are ready to exit */ + void exitThreads(); + public: /** Executes a syscall. * @todo: Determine if this needs to be virtual. 
@@ -648,6 +666,13 @@ class FullO3CPU : public BaseO3CPU /** Active Threads List */ std::list activeThreads; + /** + * This is a list of threads that are trying to exit. Each thread id + * is mapped to a boolean value denoting whether the thread is ready + * to exit. + */ + std::unordered_map exitingThreads; + /** Integer Register Scoreboard */ Scoreboard scoreboard; diff --git a/src/cpu/o3/decode.hh b/src/cpu/o3/decode.hh index a1e29a6b5..4cd318404 100644 --- a/src/cpu/o3/decode.hh +++ b/src/cpu/o3/decode.hh @@ -102,6 +102,10 @@ class DefaultDecode DefaultDecode(O3CPU *_cpu, DerivO3CPUParams *params); void startupStage(); + + /** Clear all thread-specific states */ + void clearStates(ThreadID tid); + void resetStage(); /** Returns the name of decode. */ diff --git a/src/cpu/o3/decode_impl.hh b/src/cpu/o3/decode_impl.hh index 63b180ef9..27b3c30a1 100644 --- a/src/cpu/o3/decode_impl.hh +++ b/src/cpu/o3/decode_impl.hh @@ -91,6 +91,14 @@ DefaultDecode::startupStage() resetStage(); } +template +void +DefaultDecode::clearStates(ThreadID tid) +{ + decodeStatus[tid] = Idle; + stalls[tid].rename = false; +} + template void DefaultDecode::resetStage() diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh index ee1932bec..3cf0773fd 100644 --- a/src/cpu/o3/fetch.hh +++ b/src/cpu/o3/fetch.hh @@ -216,6 +216,9 @@ class DefaultFetch /** Initialize stage. */ void startupStage(); + /** Clear all thread-specific states*/ + void clearStates(ThreadID tid); + /** Handles retrying the fetch access. 
*/ void recvReqRetry(); diff --git a/src/cpu/o3/fetch_impl.hh b/src/cpu/o3/fetch_impl.hh index 73c1ed156..8afe62335 100644 --- a/src/cpu/o3/fetch_impl.hh +++ b/src/cpu/o3/fetch_impl.hh @@ -326,6 +326,26 @@ DefaultFetch::startupStage() switchToActive(); } +template +void +DefaultFetch::clearStates(ThreadID tid) +{ + fetchStatus[tid] = Running; + pc[tid] = cpu->pcState(tid); + fetchOffset[tid] = 0; + macroop[tid] = NULL; + delayedCommit[tid] = false; + memReq[tid] = NULL; + stalls[tid].decode = false; + stalls[tid].drain = false; + fetchBufferPC[tid] = 0; + fetchBufferValid[tid] = false; + fetchQueue[tid].clear(); + + // TODO not sure what to do with priorityList for now + // priorityList.push_back(tid); +} + template void DefaultFetch::resetStage() diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh index b15521f5d..363c645c3 100644 --- a/src/cpu/o3/iew.hh +++ b/src/cpu/o3/iew.hh @@ -147,6 +147,9 @@ class DefaultIEW /** Initializes stage; sends back the number of free IQ and LSQ entries. */ void startupStage(); + /** Clear all thread-specific states */ + void clearStates(ThreadID tid); + /** Sets main time buffer used for backwards communication. 
*/ void setTimeBuffer(TimeBuffer *tb_ptr); diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh index 3d5d84886..251389631 100644 --- a/src/cpu/o3/iew_impl.hh +++ b/src/cpu/o3/iew_impl.hh @@ -323,6 +323,19 @@ DefaultIEW::startupStage() cpu->activateStage(O3CPU::IEWIdx); } +template +void +DefaultIEW::clearStates(ThreadID tid) +{ + toRename->iewInfo[tid].usedIQ = true; + toRename->iewInfo[tid].freeIQEntries = + instQueue.numFreeEntries(tid); + + toRename->iewInfo[tid].usedLSQ = true; + toRename->iewInfo[tid].freeLQEntries = ldstQueue.numFreeLoadEntries(tid); + toRename->iewInfo[tid].freeSQEntries = ldstQueue.numFreeStoreEntries(tid); +} + template void DefaultIEW::setTimeBuffer(TimeBuffer *tb_ptr) diff --git a/src/cpu/o3/rename.hh b/src/cpu/o3/rename.hh index a091c0908..572790991 100644 --- a/src/cpu/o3/rename.hh +++ b/src/cpu/o3/rename.hh @@ -169,6 +169,9 @@ class DefaultRename /** Initializes variables for the stage. */ void startupStage(); + /** Clear all thread-specific states */ + void clearStates(ThreadID tid); + /** Sets pointer to list of active threads. 
*/ void setActiveThreads(std::list *at_ptr); diff --git a/src/cpu/o3/rename_impl.hh b/src/cpu/o3/rename_impl.hh index b63163f04..fd9b09e20 100644 --- a/src/cpu/o3/rename_impl.hh +++ b/src/cpu/o3/rename_impl.hh @@ -254,6 +254,28 @@ DefaultRename::startupStage() resetStage(); } +template +void +DefaultRename::clearStates(ThreadID tid) +{ + renameStatus[tid] = Idle; + + freeEntries[tid].iqEntries = iew_ptr->instQueue.numFreeEntries(tid); + freeEntries[tid].lqEntries = iew_ptr->ldstQueue.numFreeLoadEntries(tid); + freeEntries[tid].sqEntries = iew_ptr->ldstQueue.numFreeStoreEntries(tid); + freeEntries[tid].robEntries = commit_ptr->numROBFreeEntries(tid); + emptyROB[tid] = true; + + stalls[tid].iew = false; + serializeInst[tid] = NULL; + + instsInProgress[tid] = 0; + loadsInProgress[tid] = 0; + storesInProgress[tid] = 0; + + serializeOnNextInst[tid] = false; +} + template void DefaultRename::resetStage() diff --git a/src/cpu/o3/thread_context_impl.hh b/src/cpu/o3/thread_context_impl.hh index 473e2e28e..57e0a45f5 100644 --- a/src/cpu/o3/thread_context_impl.hh +++ b/src/cpu/o3/thread_context_impl.hh @@ -128,11 +128,18 @@ O3ThreadContext::halt() { DPRINTF(O3CPU, "Calling halt on Thread Context %d\n", threadId()); - if (thread->status() == ThreadContext::Halted) + if (thread->status() == ThreadContext::Halting || + thread->status() == ThreadContext::Halted) return; - thread->setStatus(ThreadContext::Halted); - cpu->haltContext(thread->threadId()); + // the thread is not going to halt/terminate immediately in this cycle. + // The thread will be removed after an exit trap is processed + // (e.g., after trapLatency cycles). Until then, the thread's status + // will be Halting. + thread->setStatus(ThreadContext::Halting); + + // add this thread to the exiting list to mark that it is trying to exit. 
+ cpu->addThreadToExitingList(thread->threadId()); } template diff --git a/src/cpu/thread_context.hh b/src/cpu/thread_context.hh index a570b9a00..6b9ff1a12 100644 --- a/src/cpu/thread_context.hh +++ b/src/cpu/thread_context.hh @@ -111,6 +111,10 @@ class ThreadContext /// synchronization, etc. Suspended, + /// Trying to exit and waiting for an event to completely exit. + /// Entered when target executes an exit syscall. + Halting, + /// Permanently shut down. Entered when target executes /// m5exit pseudo-instruction. When all contexts enter /// this state, the simulation will terminate. diff --git a/src/sim/eventq.hh b/src/sim/eventq.hh index 6e8e63338..895f69424 100644 --- a/src/sim/eventq.hh +++ b/src/sim/eventq.hh @@ -161,6 +161,9 @@ class EventBase /// (such as writebacks). static const Priority CPU_Tick_Pri = 50; + /// If we want to exit a thread in a CPU, it comes after CPU_Tick_Pri + static const Priority CPU_Exit_Pri = 64; + /// Statistics events (dump, reset, etc.) come after /// everything else, but before exit. static const Priority Stat_Event_Pri = 90; -- 2.30.2