ruby: Fix checkpointing and restore
author Timothy Jones <timothy.jones@cl.cam.ac.uk>
Tue, 4 Aug 2015 04:08:40 +0000 (23:08 -0500)
committer Timothy Jones <timothy.jones@cl.cam.ac.uk>
Tue, 4 Aug 2015 04:08:40 +0000 (23:08 -0500)
There are 2 problems with the existing checkpoint and restore code in ruby.
The first is that when the event queue is altered by ruby during serialization,
some events that are currently scheduled cannot be found (e.g. the event to
stop simulation that always lives on the queue), causing a panic.
The second is that ruby is sometimes serialized after the memory system,
meaning that the dirty data in its cache is flushed back to memory too late
and so isn't included in the checkpoint.

These are fixed by implementing memory writeback in ruby, using the same
technique of hijacking the event queue, but first descheduling all events that
are currently on it.  They are saved, along with their scheduled time, so that
the event queue can be faithfully reconstructed after writeback has finished.
Events with the AutoDelete flag set will delete themselves when they
are descheduled, causing an error when attempting to schedule them again.
This is fixed by simply not recording them when taking them off the queue.

Writeback is still implemented using flushing, so the cache recorder object,
which is created to generate the trace and manage flushing, is kept
around and used during serialization to write the trace to disk.

Committed by: Nilay Vaish <nilay@cs.wisc.edu>

src/mem/ruby/system/CacheRecorder.cc
src/mem/ruby/system/System.cc
src/mem/ruby/system/System.hh
src/sim/eventq.hh

index ab7d1cc91729ce5483c031e330854ee5379aa208..8e8757967747dc330079084960508f53090cf500 100644 (file)
@@ -95,6 +95,8 @@ CacheRecorder::enqueueNextFlushRequest()
         m_sequencer_ptr->makeRequest(pkt);
 
         DPRINTF(RubyCacheTrace, "Flushing %s\n", *rec);
+    } else {
+        DPRINTF(RubyCacheTrace, "Flushed all %d records\n", m_records_flushed);
     }
 }
 
@@ -137,6 +139,8 @@ CacheRecorder::enqueueNextFetchRequest()
 
         m_bytes_read += (sizeof(TraceRecord) + m_block_size_bytes);
         m_records_read++;
+    } else {
+        DPRINTF(RubyCacheTrace, "Fetched all %d records\n", m_records_read);
     }
 }
 
index 815b89ee536ca8d0a9781620b5cb3edaf00ac2d4..98cf50e9c5d837bb4a099f96c6e93c4b1eb7faa1 100644 (file)
@@ -30,6 +30,7 @@
 #include <zlib.h>
 
 #include <cstdio>
+#include <list>
 
 #include "base/intmath.hh"
 #include "base/statistics.hh"
@@ -56,7 +57,8 @@ unsigned RubySystem::m_systems_to_warmup = 0;
 bool RubySystem::m_cooldown_enabled = false;
 
 RubySystem::RubySystem(const Params *p)
-    : ClockedObject(p), m_access_backing_store(p->access_backing_store)
+    : ClockedObject(p), m_access_backing_store(p->access_backing_store),
+      m_cache_recorder(NULL)
 {
     m_random_seed = p->random_seed;
     srandom(m_random_seed);
@@ -98,6 +100,111 @@ RubySystem::~RubySystem()
     delete m_profiler;
 }
 
+void
+RubySystem::makeCacheRecorder(uint8_t *uncompressed_trace,
+                              uint64 cache_trace_size,
+                              uint64 block_size_bytes)
+{
+    vector<Sequencer*> sequencer_map;
+    Sequencer* sequencer_ptr = NULL;
+
+    for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) {
+        sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getSequencer());
+        if (sequencer_ptr == NULL) {
+            sequencer_ptr = sequencer_map[cntrl];
+        }
+    }
+
+    assert(sequencer_ptr != NULL);
+
+    for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) {
+        if (sequencer_map[cntrl] == NULL) {
+            sequencer_map[cntrl] = sequencer_ptr;
+        }
+    }
+
+    // Remove the old CacheRecorder if it's still hanging about.
+    if (m_cache_recorder != NULL) {
+        delete m_cache_recorder;
+    }
+
+    // Create the CacheRecorder and record the cache trace
+    m_cache_recorder = new CacheRecorder(uncompressed_trace, cache_trace_size,
+                                         sequencer_map, block_size_bytes);
+}
+
+void
+RubySystem::memWriteback()
+{
+    m_cooldown_enabled = true;
+
+    // Make the trace so we know what to write back.
+    DPRINTF(RubyCacheTrace, "Recording Cache Trace\n");
+    makeCacheRecorder(NULL, 0, getBlockSizeBytes());
+    for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) {
+        m_abs_cntrl_vec[cntrl]->recordCacheTrace(cntrl, m_cache_recorder);
+    }
+    DPRINTF(RubyCacheTrace, "Cache Trace Complete\n");
+
+    // save the current tick value
+    Tick curtick_original = curTick();
+    DPRINTF(RubyCacheTrace, "Recording current tick %ld\n", curtick_original);
+
+    // Deschedule all prior events on the event queue, but record the tick they
+    // were scheduled at so they can be restored correctly later.
+    list<pair<Event*, Tick> > original_events;
+    while (!eventq->empty()) {
+        Event *curr_head = eventq->getHead();
+        if (curr_head->isAutoDelete()) {
+            DPRINTF(RubyCacheTrace, "Event %s auto-deletes when descheduled,"
+                    " not recording\n", curr_head->name());
+        } else {
+            original_events.push_back(make_pair(curr_head, curr_head->when()));
+        }
+        eventq->deschedule(curr_head);
+    }
+
+    // Schedule an event to start cache cooldown
+    DPRINTF(RubyCacheTrace, "Starting cache flush\n");
+    enqueueRubyEvent(curTick());
+    simulate();
+    DPRINTF(RubyCacheTrace, "Cache flush complete\n");
+
+    // Deschedule any events left on the event queue.
+    while (!eventq->empty()) {
+        eventq->deschedule(eventq->getHead());
+    }
+
+    // Restore curTick
+    setCurTick(curtick_original);
+
+    // Restore all events that were originally on the event queue.  This is
+    // done after setting curTick back to its original value so that events do
+    // not seem to be scheduled in the past.
+    while (!original_events.empty()) {
+        pair<Event*, Tick> event = original_events.back();
+        eventq->schedule(event.first, event.second);
+        original_events.pop_back();
+    }
+
+    // No longer flushing back to memory.
+    m_cooldown_enabled = false;
+
+    // There are several issues with continuing simulation after calling
+    // memWriteback() at the moment, that stem from taking events off the
+    // queue, simulating again, and then putting them back on, whilst
+    // pretending that no time has passed.  One is that some events will have
+    // been deleted, so can't be put back.  Another is that any object
+    // recording the tick something happens may end up storing a tick in the
+    // future.  A simple warning here alerts the user that things may not work
+    // as expected.
+    warn_once("Ruby memory writeback is experimental.  Continuing simulation "
+              "afterwards may not always work as intended.");
+
+    // Keep the cache recorder around so that we can dump the trace if a
+    // checkpoint is immediately taken.
+}
+
 void
 RubySystem::writeCompressedTrace(uint8_t *raw_data, string filename,
                                  uint64 uncompressed_trace_size)
@@ -130,59 +237,19 @@ RubySystem::writeCompressedTrace(uint8_t *raw_data, string filename,
 void
 RubySystem::serializeOld(CheckpointOut &cp)
 {
-    m_cooldown_enabled = true;
-    vector<Sequencer*> sequencer_map;
-    Sequencer* sequencer_ptr = NULL;
-
-    for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) {
-        sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getSequencer());
-        if (sequencer_ptr == NULL) {
-            sequencer_ptr = sequencer_map[cntrl];
-        }
-    }
-
-    assert(sequencer_ptr != NULL);
-
-    for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) {
-        if (sequencer_map[cntrl] == NULL) {
-            sequencer_map[cntrl] = sequencer_ptr;
-        }
-    }
-
     // Store the cache-block size, so we are able to restore on systems with a
     // different cache-block size. CacheRecorder depends on the correct
     // cache-block size upon unserializing.
     uint64 block_size_bytes = getBlockSizeBytes();
     SERIALIZE_SCALAR(block_size_bytes);
 
-    DPRINTF(RubyCacheTrace, "Recording Cache Trace\n");
-    // Create the CacheRecorder and record the cache trace
-    m_cache_recorder = new CacheRecorder(NULL, 0, sequencer_map,
-                                         block_size_bytes);
-
-    for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) {
-        m_abs_cntrl_vec[cntrl]->recordCacheTrace(cntrl, m_cache_recorder);
+    // Check that there's a valid trace to use.  If not, then memory won't be
+    // up-to-date and the simulation will probably fail when restoring from the
+    // checkpoint.
+    if (m_cache_recorder == NULL) {
+        fatal("Call memWriteback() before serialize() to create ruby trace");
     }
 
-    DPRINTF(RubyCacheTrace, "Cache Trace Complete\n");
-    // save the current tick value
-    Tick curtick_original = curTick();
-    // save the event queue head
-    Event* eventq_head = eventq->replaceHead(NULL);
-    DPRINTF(RubyCacheTrace, "Recording current tick %ld and event queue\n",
-            curtick_original);
-
-    // Schedule an event to start cache cooldown
-    DPRINTF(RubyCacheTrace, "Starting cache flush\n");
-    enqueueRubyEvent(curTick());
-    simulate();
-    DPRINTF(RubyCacheTrace, "Cache flush complete\n");
-
-    // Restore eventq head
-    eventq_head = eventq->replaceHead(eventq_head);
-    // Restore curTick
-    setCurTick(curtick_original);
-
     // Aggregate the trace entries together into a single array
     uint8_t *raw_data = new uint8_t[4096];
     uint64 cache_trace_size = m_cache_recorder->aggregateRecords(&raw_data,
@@ -193,7 +260,9 @@ RubySystem::serializeOld(CheckpointOut &cp)
     SERIALIZE_SCALAR(cache_trace_file);
     SERIALIZE_SCALAR(cache_trace_size);
 
-    m_cooldown_enabled = false;
+    // Now finished with the cache recorder.
+    delete m_cache_recorder;
+    m_cache_recorder = NULL;
 }
 
 void
@@ -250,23 +319,8 @@ RubySystem::unserialize(CheckpointIn &cp)
     m_warmup_enabled = true;
     m_systems_to_warmup++;
 
-    vector<Sequencer*> sequencer_map;
-    Sequencer* t = NULL;
-    for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) {
-        sequencer_map.push_back(m_abs_cntrl_vec[cntrl]->getSequencer());
-        if (t == NULL) t = sequencer_map[cntrl];
-    }
-
-    assert(t != NULL);
-
-    for (int cntrl = 0; cntrl < m_abs_cntrl_vec.size(); cntrl++) {
-        if (sequencer_map[cntrl] == NULL) {
-            sequencer_map[cntrl] = t;
-        }
-    }
-
-    m_cache_recorder = new CacheRecorder(uncompressed_trace, cache_trace_size,
-                                         sequencer_map, block_size_bytes);
+    // Create the cache recorder that will hang around until startup.
+    makeCacheRecorder(uncompressed_trace, cache_trace_size, block_size_bytes);
 }
 
 void
@@ -290,6 +344,7 @@ RubySystem::startup()
     // state was checkpointed.
 
     if (m_warmup_enabled) {
+        DPRINTF(RubyCacheTrace, "Starting ruby cache warmup\n");
         // save the current tick value
         Tick curtick_original = curTick();
         // save the event queue head
index fdb5fc88164cc9547aa57cd271924e207fdc9558..787e4f4ae5eef8ca3a76fade2e0b09db626c0aa1 100644 (file)
@@ -94,6 +94,7 @@ class RubySystem : public ClockedObject
     void collateStats() { m_profiler->collateStats(); }
     void resetStats();
 
+    void memWriteback();
     void serializeOld(CheckpointOut &cp) M5_ATTR_OVERRIDE;
     void unserialize(CheckpointIn &cp) M5_ATTR_OVERRIDE;
     void process();
@@ -116,6 +117,10 @@ class RubySystem : public ClockedObject
     RubySystem(const RubySystem& obj);
     RubySystem& operator=(const RubySystem& obj);
 
+    void makeCacheRecorder(uint8_t *uncompressed_trace,
+                           uint64 cache_trace_size,
+                           uint64 block_size_bytes);
+
     void readCompressedTrace(std::string filename,
                              uint8_t *&raw_data,
                              uint64& uncompressed_trace_size);
index cfb79614faaa8023fe70b432606d454650e8a5b0..92cd241a63ee76c4e99474891c93d06670e526eb 100644 (file)
@@ -564,6 +564,7 @@ class EventQueue : public Serializable
     Tick nextTick() const { return head->when(); }
     void setCurTick(Tick newVal) { _curTick = newVal; }
     Tick getCurTick() { return _curTick; }
+    Event *getHead() const { return head; }
 
     Event *serviceOne();