/*
- * Copyright (c) 2015 ARM Limited
+ * Copyright (c) 2015-2016 ARM Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
}
}
+void
+DistIface::Sync::abort()
+{
+ std::unique_lock<std::mutex> sync_lock(lock);
+ waitNum = 0;
+ isAbort = true;
+ sync_lock.unlock();
+ cv.notify_one();
+}
+
DistIface::SyncSwitch::SyncSwitch(int num_nodes)
{
numNodes = num_nodes;
doStopSync = false;
nextAt = std::numeric_limits<Tick>::max();
nextRepeat = std::numeric_limits<Tick>::max();
+ isAbort = false;
}
DistIface::SyncNode::SyncNode()
doStopSync = false;
nextAt = std::numeric_limits<Tick>::max();
nextRepeat = std::numeric_limits<Tick>::max();
+ isAbort = false;
}
-void
+bool
DistIface::SyncNode::run(bool same_tick)
{
std::unique_lock<std::mutex> sync_lock(lock);
Header header;
assert(waitNum == 0);
+ assert(!isAbort);
waitNum = DistIface::recvThreadsNum;
// initiate the global synchronisation
header.msgType = MsgType::cmdSyncReq;
// now wait until all receiver threads complete the synchronisation
auto lf = [this]{ return waitNum == 0; };
cv.wait(sync_lock, lf);
- // global synchronisation is done
- assert(!same_tick || (nextAt == curTick()));
+ // global synchronisation is done.
+ assert(isAbort || !same_tick || (nextAt == curTick()));
+ return !isAbort;
}
-void
+bool
DistIface::SyncSwitch::run(bool same_tick)
{
std::unique_lock<std::mutex> sync_lock(lock);
cv.wait(sync_lock, lf);
}
assert(waitNum == 0);
+ if (isAbort) // sync aborted
+ return false;
assert(!same_tick || (nextAt == curTick()));
waitNum = numNodes;
// Complete the global synchronisation
header.needStopSync = ReqType::none;
}
DistIface::master->sendCmd(header);
+ return true;
}
-void
+bool
DistIface::SyncSwitch::progress(Tick send_tick,
Tick sync_repeat,
ReqType need_ckpt,
ReqType need_stop_sync)
{
std::unique_lock<std::mutex> sync_lock(lock);
+ if (isAbort) // sync aborted
+ return false;
assert(waitNum > 0);
if (send_tick > nextAt)
sync_lock.unlock();
cv.notify_one();
}
+ // The receive thread must keep alive in the switch until the node
+ // closes the connection. Thus, we always return true here.
+ return true;
}
-void
+bool
DistIface::SyncNode::progress(Tick max_send_tick,
Tick next_repeat,
ReqType do_ckpt,
ReqType do_stop_sync)
{
std::unique_lock<std::mutex> sync_lock(lock);
+ if (isAbort) // sync aborted
+ return false;
assert(waitNum > 0);
nextAt = max_send_tick;
sync_lock.unlock();
cv.notify_one();
}
+ // The receive thread must finish when simulation is about to exit
+ return !doExit;
}
void
repeat = DistIface::sync->nextRepeat;
// Do a global barrier to agree on a common repeat value (the smallest
// one from all participating nodes.
- DistIface::sync->run(false);
+ if (!DistIface::sync->run(false))
+ panic("DistIface::SyncEvent::start() aborted\n");
assert(!DistIface::sync->doCkpt);
assert(!DistIface::sync->doExit);
EventQueue::ScopedRelease sr(curEventQueue());
// we do a global sync here that is supposed to happen at the same
// tick in all gem5 peers
- DistIface::sync->run(true);
+ if (!DistIface::sync->run(true))
+ return; // global sync aborted
// global sync completed
}
if (DistIface::sync->doCkpt)
exitSimLoop("checkpoint");
- if (DistIface::sync->doExit)
+ if (DistIface::sync->doExit) {
exitSimLoop("exit request from gem5 peers");
+ return;
+ }
if (DistIface::sync->doStopSync) {
DistIface::sync->doStopSync = false;
inform("synchronization disabled at %lu\n", curTick());
DistIface::~DistIface()
{
assert(recvThread);
+ recvThread->join();
delete recvThread;
- if (this == master) {
+ if (distIfaceNum-- == 0) {
assert(syncEvent);
delete syncEvent;
assert(sync);
delete sync;
- master = nullptr;
}
+ if (this == master)
+ master = nullptr;
}
void
// because one of them called m5 exit. So we stop here.
// Grab the eventq lock to stop the simulation thread
curEventQueue()->lock();
- exitSimLoop("Message server closed connection, simulator "
- "is exiting");
+ exitSimLoop("connection to gem5 peer got closed");
curEventQueue()->unlock();
+ // The simulation thread may be blocked in processing an on-going
+ // global synchronisation. Abort the sync to give the simulation
+ // thread a chance to make progress and process the exit event.
+ sync->abort();
+ // Finish receiver thread
break;
}
header.sendDelay);
} else {
// everything else must be synchronisation related command
- sync->progress(header.sendTick,
- header.syncRepeat,
- header.needCkpt,
- header.needExit,
- header.needStopSync);
+ if (!sync->progress(header.sendTick,
+ header.syncRepeat,
+ header.needCkpt,
+ header.needExit,
+ header.needStopSync))
+ // Finish receiver thread if simulation is about to exit
+ break;
}
}
}
/*
- * Copyright (c) 2015 ARM Limited
+ * Copyright (c) 2015-2016 ARM Limited
* All rights reserved
*
* The license below extends only to copyright in the software and shall
* Tick for the next periodic sync (if the event is not scheduled yet)
*/
Tick nextAt;
+ /**
+ * Flag is set if the sync is aborted (e.g. due to connection lost)
+ */
+ bool isAbort;
friend class SyncEvent;
void init(Tick start, Tick repeat);
/**
* Core method to perform a full dist sync.
+ *
+ * @return true if the sync completes, false if it gets aborted
*/
- virtual void run(bool same_tick) = 0;
+ virtual bool run(bool same_tick) = 0;
/**
* Callback when the receiver thread gets a sync ack message.
+ *
+ * @return false if the receiver thread needs to stop (e.g.
+ * simulation is to exit)
*/
- virtual void progress(Tick send_tick,
+ virtual bool progress(Tick send_tick,
Tick next_repeat,
ReqType do_ckpt,
ReqType do_exit,
ReqType do_stop_sync) = 0;
+ /**
+ * Abort processing an on-going sync event (in case of an error, e.g.
+ * lost connection to a peer gem5)
+ */
+ void abort();
virtual void requestCkpt(ReqType req) = 0;
virtual void requestExit(ReqType req) = 0;
SyncNode();
~SyncNode() {}
- void run(bool same_tick) override;
- void progress(Tick max_req_tick,
+ bool run(bool same_tick) override;
+ bool progress(Tick max_req_tick,
Tick next_repeat,
ReqType do_ckpt,
ReqType do_exit,
SyncSwitch(int num_nodes);
~SyncSwitch() {}
- void run(bool same_tick) override;
- void progress(Tick max_req_tick,
+ bool run(bool same_tick) override;
+ bool progress(Tick max_req_tick,
Tick next_repeat,
ReqType do_ckpt,
ReqType do_exit,