cpu: Add TraceCPU to playback elastic traces

author Radhika Jagtap <radhika.jagtap@ARM.com>

Mon, 7 Dec 2015 22:42:15 +0000 (16:42 -0600)

committer Radhika Jagtap <radhika.jagtap@ARM.com>

Mon, 7 Dec 2015 22:42:15 +0000 (16:42 -0600)
author Radhika Jagtap <radhika.jagtap@ARM.com>
Mon, 7 Dec 2015 22:42:15 +0000 (16:42 -0600)
committer Radhika Jagtap <radhika.jagtap@ARM.com>
Mon, 7 Dec 2015 22:42:15 +0000 (16:42 -0600)
diff --git a/src/cpu/trace/SConscript b/src/cpu/trace/SConscript

new file mode 100644 (file)

index 0000000..aa450b1
--- /dev/null
+++ b/src/cpu/trace/SConscript
@@ -0,0 +1,12 @@
+# SCons build script for the trace-replay CPU model (TraceCPU).
+Import('*')
+
+# Nothing below is registered when the target ISA is 'null'.
+if env['TARGET_ISA'] == 'null':
+    Return()
+
+# Only build TraceCPU if we have support for protobuf as TraceCPU relies on it
+if env['HAVE_PROTOBUF']:
+    SimObject('TraceCPU.py')
+    Source('trace_cpu.cc')
+
+# The two debug flags are declared whether or not protobuf is available.
+DebugFlag('TraceCPUData')
+DebugFlag('TraceCPUInst')
diff --git a/src/cpu/trace/TraceCPU.py b/src/cpu/trace/TraceCPU.py

new file mode 100644 (file)

index 0000000..e1c02ae
--- /dev/null
+++ b/src/cpu/trace/TraceCPU.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2013 - 2015 ARM Limited
+# All rights reserved.
+#
+# The license below extends only to copyright in the software and shall
+# not be construed as granting a license to any other intellectual
+# property including but not limited to intellectual property relating
+# to a hardware implementation of the functionality of the software
+# licensed hereunder.  You may use the software subject to the license
+# terms below provided that you ensure that this notice is replicated
+# unmodified and in its entirety in all distributions of the software,
+# modified or unmodified, in source code or in binary form.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met: redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer;
+# redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution;
+# neither the name of the copyright holders nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# Authors: Radhika Jagtap
+#          Andreas Hansson
+#          Thomas Grass
+
+from m5.params import *
+from BaseCPU import BaseCPU
+
+class TraceCPU(BaseCPU):
+    """CPU model that plays back elastic traces.
+
+    The traces are recorded in an earlier simulation that used DerivO3CPU or
+    one of its derived classes. The model interfaces with L1 caches.
+    """
+    type = 'TraceCPU'
+    cxx_header = "cpu/trace/trace_cpu.hh"
+
+    # Input trace files and sizes of the modelled hardware structures.
+    instTraceFile = Param.String("", "Instruction trace file")
+    dataTraceFile = Param.String("", "Data dependency trace file")
+    sizeStoreBuffer = Param.Unsigned(
+        16, "Number of entries in the store buffer")
+    sizeLoadBuffer = Param.Unsigned(
+        16, "Number of entries in the load buffer")
+    sizeROB = Param.Unsigned(
+        40, "Number of entries in the re-order buffer")
+
+    @classmethod
+    def memory_mode(cls):
+        # The memory mode reported for this model is 'timing'.
+        return 'timing'
+
+    @classmethod
+    def require_caches(cls):
+        # This model reports that it needs caches.
+        return True
+
+    @classmethod
+    def support_take_over(cls):
+        # This model reports that it supports taking over from another CPU.
+        return True
+
+    def addPMU(self, pmu=None):
+        # Intentionally a no-op: the pmu argument is ignored.
+        return None
+
diff --git a/src/cpu/trace/trace_cpu.cc b/src/cpu/trace/trace_cpu.cc

new file mode 100644 (file)

index 0000000..2e989f6
--- /dev/null
+++ b/src/cpu/trace/trace_cpu.cc
@@ -0,0 +1,1454 @@
+/*
+ * Copyright (c) 2013 - 2015 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder.  You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Radhika Jagtap
+ *          Andreas Hansson
+ *          Thomas Grass
+ */
+
+#include "cpu/trace/trace_cpu.hh"
+
+#include "sim/sim_exit.hh"
+
+// Declare and initialize the static counter for number of trace CPUs. It is
+// incremented once per constructed TraceCPU and later handed to the
+// CountedExitEvent that init() creates.
+int TraceCPU::numTraceCPUs = 0;
+
+/**
+ * Construct a TraceCPU from its configuration parameters.
+ *
+ * Creates the instruction and data cache ports, requests separate master IDs
+ * for the instruction and data sides, and builds the two trace generators
+ * from the configured trace file names. It also bumps the static count of
+ * TraceCPU instances.
+ *
+ * @param params Parameter object supplying the trace file names and the
+ *               sizes of the ROB, store buffer and load buffer.
+ */
+TraceCPU::TraceCPU(TraceCPUParams *params)
+    :   BaseCPU(params),
+        icachePort(this),
+        dcachePort(this),
+        instMasterID(params->system->getMasterId(name() + ".inst")),
+        dataMasterID(params->system->getMasterId(name() + ".data")),
+        instTraceFile(params->instTraceFile),
+        dataTraceFile(params->dataTraceFile),
+        icacheGen(*this, ".iside", icachePort, instMasterID, instTraceFile),
+        dcacheGen(*this, ".dside", dcachePort, dataMasterID, dataTraceFile,
+                    params->sizeROB, params->sizeStoreBuffer,
+                    params->sizeLoadBuffer),
+        icacheNextEvent(this),
+        dcacheNextEvent(this),
+        oneTraceComplete(false),
+        firstFetchTick(0),
+        execCompleteEvent(nullptr)
+{
+    // Increment static counter for number of Trace CPUs.
+    ++TraceCPU::numTraceCPUs;
+
+    // Check that the python parameters for sizes of ROB, store buffer and load
+    // buffer do not overflow the corresponding C++ variables. Each message
+    // names and prints the parameter that is actually being checked.
+    fatal_if(params->sizeROB > UINT16_MAX, "ROB size set to %d exceeds the "
+                "max. value of %d.\n", params->sizeROB, UINT16_MAX);
+    fatal_if(params->sizeStoreBuffer > UINT16_MAX, "Store buffer size set to "
+                "%d exceeds the max. value of %d.\n",
+                params->sizeStoreBuffer, UINT16_MAX);
+    fatal_if(params->sizeLoadBuffer > UINT16_MAX, "Load buffer size set to"
+                " %d exceeds the max. value of %d.\n",
+                params->sizeLoadBuffer, UINT16_MAX);
+}
+
+// Destructor with an empty body: nothing is released explicitly here.
+// NOTE(review): execCompleteEvent is allocated with new in init() and is not
+// deleted here -- confirm who owns that event.
+TraceCPU::~TraceCPU()
+{
+
+}
+
+/**
+ * Factory hook on the parameter object: build the TraceCPU it describes.
+ *
+ * @return A newly heap-allocated TraceCPU constructed from these parameters.
+ */
+TraceCPU*
+TraceCPUParams::create()
+{
+    TraceCPU *cpu = new TraceCPU(this);
+    return cpu;
+}
+
+/**
+ * Take over the cache connections of another CPU.
+ *
+ * For the instruction side and then the data side, the peer slave port of
+ * the old CPU is looked up, the old CPU's port is unbound, and this CPU's
+ * port is bound to that peer instead.
+ *
+ * @param oldCPU CPU whose instruction and data ports are currently connected.
+ */
+void
+TraceCPU::takeOverFrom(BaseCPU *oldCPU)
+{
+    // Instruction side: our port must be free, the old one must be in use.
+    auto &new_inst_port = getInstPort();
+    auto &old_inst_port = oldCPU->getInstPort();
+    assert(!new_inst_port.isConnected());
+    assert(old_inst_port.isConnected());
+    BaseSlavePort &icache_peer = old_inst_port.getSlavePort();
+    old_inst_port.unbind();
+    new_inst_port.bind(icache_peer);
+
+    // Data side: same hand-over for the data port.
+    auto &new_data_port = getDataPort();
+    auto &old_data_port = oldCPU->getDataPort();
+    assert(!new_data_port.isConnected());
+    assert(old_data_port.isConnected());
+    BaseSlavePort &dcache_peer = old_data_port.getSlavePort();
+    old_data_port.unbind();
+    new_data_port.bind(dcache_peer);
+}
+
+/**
+ * Initialise the CPU and kick off trace playback.
+ *
+ * After the base class initialisation, both generators are initialised and
+ * the instruction and data events are scheduled at the ticks they return.
+ * The exit event shared via the static TraceCPU counter is created, and the
+ * tick of the first fetch request is remembered.
+ */
+void
+TraceCPU::init()
+{
+    DPRINTF(TraceCPUInst,
+            "Instruction fetch request trace file is \"%s\".\n",
+            instTraceFile);
+    DPRINTF(TraceCPUData,
+            "Data memory request trace file is \"%s\".\n", dataTraceFile);
+
+    BaseCPU::init();
+
+    // The instruction generator reports the send tick of its first read
+    // request; the instruction event fires at that tick.
+    const Tick icache_start = icacheGen.init();
+    schedule(icacheNextEvent, icache_start);
+
+    // Likewise the data generator reports when its first request is due.
+    const Tick dcache_start = dcacheGen.init();
+    schedule(dcacheNextEvent, dcache_start);
+
+    // By now the static count of Trace CPUs is final, so it can be handed
+    // to the exit event.
+    execCompleteEvent = new CountedExitEvent("end of all traces reached.",
+                                             numTraceCPUs);
+
+    // Kept so that it can be dumped as tickOffset at the end.
+    firstFetchTick = icache_start;
+}
+
+/**
+ * Event handler for the instruction side.
+ *
+ * Asks the instruction generator to send its next (or retry) packet. On
+ * success the next instruction event is scheduled; otherwise, if the
+ * instruction trace is complete, the end-of-execution check is run.
+ */
+void
+TraceCPU::schedIcacheNext()
+{
+    DPRINTF(TraceCPUInst, "IcacheGen event.\n");
+
+    // Send the current packet, or the pending retry packet if there is one.
+    const bool sent = icacheGen.tryNext();
+
+    if (!sent) {
+        // Nothing is scheduled here. If the trace is not complete the send
+        // failed and the next event is scheduled on the retry path. If it is
+        // complete, this may be the second of the two traces to finish.
+        if (icacheGen.isTraceComplete())
+            checkAndSchedExitEvent();
+        return;
+    }
+
+    // Packet went out: line up the next instruction event.
+    DPRINTF(TraceCPUInst, "Scheduling next icacheGen event "
+            "at %d.\n", curTick() + icacheGen.tickDelta());
+    schedule(icacheNextEvent, curTick() + icacheGen.tickDelta());
+    ++numSchedIcacheEvent;
+}
+
+/**
+ * Event handler for the data side: run the data generator once and, if it
+ * reports that execution is complete, run the end-of-execution check.
+ */
+void
+TraceCPU::schedDcacheNext()
+{
+    DPRINTF(TraceCPUData, "DcacheGen event.\n");
+
+    dcacheGen.execute();
+
+    if (!dcacheGen.isExecComplete())
+        return;
+
+    checkAndSchedExitEvent();
+}
+
+/**
+ * Called each time one of the two traces finishes.
+ *
+ * The first call only records that one trace is done. The second call
+ * records the end-of-simulation stats and schedules the exit event at the
+ * current tick.
+ */
+void
+TraceCPU::checkAndSchedExitEvent()
+{
+    if (!oneTraceComplete) {
+        // First trace to finish: remember it and wait for the other one.
+        oneTraceComplete = true;
+        return;
+    }
+
+    // Both the instruction and the data access traces have been played back.
+    inform("%s: Execution complete.\n", name());
+
+    // Stats that can only be computed at the end of the simulation.
+    tickOffset = firstFetchTick;
+    numCycles = (clockEdge() - firstFetchTick) / clockPeriod();
+    numOps = dcacheGen.getMicroOpCount();
+
+    schedule(*execCompleteEvent, curTick());
+}
+
+/**
+ * Register the statistics of the TraceCPU itself and then those of its
+ * instruction and data generators. Every stat name is prefixed with the
+ * name of this object followed by a dot.
+ */
+void
+TraceCPU::regStats()
+{
+    BaseCPU::regStats();
+
+    const auto prefix = name() + ".";
+
+    numSchedDcacheEvent
+        .name(prefix + "numSchedDcacheEvent")
+        .desc("Number of events scheduled to trigger data request generator");
+
+    numSchedIcacheEvent
+        .name(prefix + "numSchedIcacheEvent")
+        .desc("Number of events scheduled to trigger instruction request "
+              "generator");
+
+    numOps
+        .name(prefix + "numOps")
+        .desc("Number of micro-ops simulated by the Trace CPU");
+
+    cpi
+        .name(prefix + "cpi")
+        .desc("Cycles per micro-op used as a proxy for CPI")
+        .precision(6);
+    // cpi is defined as the ratio of the two stats below.
+    cpi = numCycles/numOps;
+
+    tickOffset
+        .name(prefix + "tickOffset")
+        .desc("The first execution tick for the root node of elastic traces");
+
+    icacheGen.regStats();
+    dcacheGen.regStats();
+}
+
+/**
+ * Register the statistics kept by the elastic data generator. Every stat
+ * name is prefixed with the generator name followed by a dot.
+ */
+void
+TraceCPU::ElasticDataGen::regStats()
+{
+    using namespace Stats;
+
+    const auto prefix = name() + ".";
+
+    maxDependents
+        .name(prefix + "maxDependents")
+        .desc("Max number of dependents observed on a node");
+
+    maxReadyListSize
+        .name(prefix + "maxReadyListSize")
+        .desc("Max size of the ready list observed");
+
+    numSendAttempted
+        .name(prefix + "numSendAttempted")
+        .desc("Number of first attempts to send a request");
+
+    numSendSucceeded
+        .name(prefix + "numSendSucceeded")
+        .desc("Number of successful first attempts");
+
+    numSendFailed
+        .name(prefix + "numSendFailed")
+        .desc("Number of failed first attempts");
+
+    numRetrySucceeded
+        .name(prefix + "numRetrySucceeded")
+        .desc("Number of successful retries");
+
+    numSplitReqs
+        .name(prefix + "numSplitReqs")
+        .desc("Number of split requests");
+
+    numSOLoads
+        .name(prefix + "numSOLoads")
+        .desc("Number of strictly ordered loads");
+
+    numSOStores
+        .name(prefix + "numSOStores")
+        .desc("Number of strictly ordered stores");
+
+    dataLastTick
+        .name(prefix + "dataLastTick")
+        .desc("Last tick simulated from the elastic data trace");
+}
+
+/**
+ * Prime the data generator by reading the first two windows of the trace.
+ *
+ * Panics if either window cannot be read in full, or if no node ended up in
+ * the ready list after both reads.
+ *
+ * @return The execute tick of the first node in the ready list, so that the
+ *         caller can schedule the first call to execute() at that tick.
+ */
+Tick
+TraceCPU::ElasticDataGen::init()
+{
+    DPRINTF(TraceCPUData, "Initializing data memory request generator "
+            "DcacheGen: elastic issue with retry.\n");
+
+    if (!readNextWindow())
+        panic("Trace has %d elements. It must have at least %d elements.\n",
+              depGraph.size(), 2 * windowSize);
+    DPRINTF(TraceCPUData, "After 1st read, depGraph size:%d.\n",
+            depGraph.size());
+
+    if (!readNextWindow())
+        panic("Trace has %d elements. It must have at least %d elements.\n",
+              depGraph.size(), 2 * windowSize);
+    DPRINTF(TraceCPUData, "After 2st read, depGraph size:%d.\n",
+            depGraph.size());
+
+    // Print readyList
+    if (DTRACE(TraceCPUData)) {
+        printReadyList();
+    }
+
+    // The first ready node is dereferenced below, so an empty ready list
+    // must be rejected explicitly instead of reading through end().
+    if (readyList.empty())
+        panic("No node is ready to execute after reading %d trace "
+              "elements.\n", depGraph.size());
+
+    auto free_itr = readyList.begin();
+    DPRINTF(TraceCPUData, "Execute tick of the first dependency free node %lli"
+            " is %d.\n", free_itr->seqNum, free_itr->execTick);
+    // Return the execute tick of the earliest ready node so that an event
+    // can be scheduled to call execute()
+    return (free_itr->execTick);
+}
+
+// Exit hook of the data generator: it only calls reset() on the input trace.
+// NOTE(review): what reset() does to the trace is defined outside this file
+// section -- confirm before relying on it.
+void
+TraceCPU::ElasticDataGen::exit()
+{
+    trace.reset();
+}
+
+/**
+ * Read up to windowSize records from the trace into the dependency graph.
+ *
+ * Each record becomes a new graph node. Its ROB and register dependencies
+ * are annotated onto the parents still present in the graph, and a node
+ * without remaining dependencies is passed to checkAndIssue() straight away.
+ *
+ * @return true if a full window of windowSize records was read; false if the
+ *         trace was already complete or ended before the window was full
+ *         (nodes read before the end remain in the graph).
+ */
+bool
+TraceCPU::ElasticDataGen::readNextWindow()
+{
+
+    // Read and add next window
+    DPRINTF(TraceCPUData, "Reading next window from file.\n");
+
+    if (traceComplete) {
+        // We are at the end of the file, thus we have no more records.
+        // Return false.
+        return false;
+    }
+
+    DPRINTF(TraceCPUData, "Start read: Size of depGraph is %d.\n",
+            depGraph.size());
+
+    uint32_t num_read = 0;
+    while (num_read != windowSize) {
+
+        // Create a new graph node
+        GraphNode* new_node = new GraphNode;
+
+        // Read the next line to get the next record. If that fails then end of
+        // trace has been reached and traceComplete needs to be set in addition
+        // to returning false.
+        if (!trace.read(new_node)) {
+            DPRINTF(TraceCPUData, "\tTrace complete!\n");
+            traceComplete = true;
+            // The node was never added to the graph, so nothing else will
+            // free it: release it here to avoid leaking it.
+            delete new_node;
+            return false;
+        }
+
+        // Annotate the ROB dependencies of the new node onto the parent nodes.
+        addDepsOnParent(new_node, new_node->robDep, new_node->numRobDep);
+        // Annotate the register dependencies of the new node onto the parent
+        // nodes.
+        addDepsOnParent(new_node, new_node->regDep, new_node->numRegDep);
+
+        num_read++;
+        // Add to map
+        depGraph[new_node->seqNum] = new_node;
+        if (new_node->numRobDep == 0 && new_node->numRegDep == 0) {
+            // Source dependencies are already complete, check if resources
+            // are available and issue. The execution time is approximated
+            // to current time plus the computational delay.
+            checkAndIssue(new_node);
+        }
+    }
+
+    DPRINTF(TraceCPUData, "End read: Size of depGraph is %d.\n",
+            depGraph.size());
+    return true;
+}
+
+/**
+ * Register a new node as a dependent of each of its parents still in the
+ * dependency graph, and drop dependencies on parents that are not.
+ *
+ * @param new_node  Node whose dependencies are being processed.
+ * @param dep_array Array of parent sequence numbers; entries whose parent is
+ *                  not in the graph are reset to zero.
+ * @param num_dep   Dependency count, decremented for every entry dropped.
+ */
+template<typename T> void
+TraceCPU::ElasticDataGen::addDepsOnParent(GraphNode *new_node,
+                                            T& dep_array, uint8_t& num_dep)
+{
+    for (auto& parent_seq : dep_array) {
+        // Dependencies are filled in from the first index onwards, so the
+        // first zero entry (the initialisation value) marks the end.
+        if (parent_seq == 0)
+            break;
+
+        auto graph_pos = depGraph.find(parent_seq);
+        if (graph_pos == depGraph.end()) {
+            // Parent is not in the graph: treat it as already executed and
+            // remove this dependency.
+            parent_seq = 0;
+            num_dep--;
+            continue;
+        }
+
+        // Parent is still pending: add the new node to its dependents and
+        // track the largest dependents list seen so far.
+        auto& siblings = graph_pos->second->dependents;
+        siblings.push_back(new_node);
+        maxDependents = std::max<double>(siblings.size(),
+                                         maxDependents.value());
+    }
+}
+
+/**
+ * Execute all ready nodes whose execute tick has been reached.
+ *
+ * The steps are: optionally read the next trace window, try to issue the
+ * nodes waiting in depFreeQueue, then walk the ready list from the front.
+ * Loads and stores are sent to the cache (or a pending retry packet is
+ * re-sent); on a failed send the loop stops and waits for a retry. For each
+ * executed node the dependents are woken up as far as the dependency model
+ * allows, and the node is removed from the ready list. Finally the next
+ * event is scheduled, or execution is marked complete when nothing is left.
+ */
+void
+TraceCPU::ElasticDataGen::execute()
+{
+    DPRINTF(TraceCPUData, "Execute start occupancy:\n");
+    DPRINTFR(TraceCPUData, "\tdepGraph = %d, readyList = %d, "
+            "depFreeQueue = %d ,", depGraph.size(), readyList.size(),
+            depFreeQueue.size());
+    hwResource.printOccupancy();
+
+    // Read next window to make sure that dependents of all dep-free nodes
+    // are in the depGraph
+    if (nextRead) {
+        readNextWindow();
+        nextRead = false;
+    }
+
+    // First attempt to issue the pending dependency-free nodes held
+    // in depFreeQueue. If resources have become available for a node,
+    // then issue it, i.e. add the node to readyList.
+    while (!depFreeQueue.empty()) {
+        if (checkAndIssue(depFreeQueue.front(), false)) {
+            DPRINTF(TraceCPUData, "Removing from depFreeQueue: seq. num "
+                "%lli.\n", (depFreeQueue.front())->seqNum);
+            depFreeQueue.pop();
+        } else {
+            break;
+        }
+    }
+    // Proceed to execute from readyList
+    auto graph_itr = depGraph.begin();
+    auto free_itr = readyList.begin();
+    // Iterate through readyList until the next free node has its execute
+    // tick later than curTick or the end of readyList is reached. The end
+    // check must come first so that the end iterator is never dereferenced
+    // when readyList is empty or has been drained.
+    while (free_itr != readyList.end() && free_itr->execTick <= curTick()) {
+
+        // Get pointer to the node to be executed
+        graph_itr = depGraph.find(free_itr->seqNum);
+        assert(graph_itr != depGraph.end());
+        GraphNode* node_ptr = graph_itr->second;
+
+        // If there is a retryPkt send that else execute the load
+        if (retryPkt) {
+            // The retryPkt must be the request that was created by the
+            // first node in the readyList.
+            if (retryPkt->req->getReqInstSeqNum() != node_ptr->seqNum) {
+                panic("Retry packet's seqence number does not match "
+                      "the first node in the readyList.\n");
+            }
+            if (port.sendTimingReq(retryPkt)) {
+                ++numRetrySucceeded;
+                retryPkt = nullptr;
+            }
+        } else if (node_ptr->isLoad || node_ptr->isStore) {
+            // If there is no retryPkt, attempt to send a memory request in
+            // case of a load or store node. If the send fails, executeMemReq()
+            // returns a packet pointer, which we save in retryPkt. In case of
+            // a comp node we don't do anything and simply continue as if the
+            // execution of the comp node succedded.
+            retryPkt = executeMemReq(node_ptr);
+        }
+        // If the retryPkt or a new load/store node failed, we exit from here
+        // as a retry from cache will bring the control to execute(). The
+        // first node in readyList then, will be the failed node.
+        if (retryPkt) {
+            break;
+        }
+
+        // Proceed to remove dependencies for the successfully executed node.
+        // If it is a load which is not strictly ordered and we sent a
+        // request for it successfully, we do not yet mark any register
+        // dependencies complete. But as per dependency modelling we need
+        // to mark ROB dependencies of load and non load/store nodes which
+        // are based on successful sending of the load as complete.
+        if (node_ptr->isLoad && !node_ptr->isStrictlyOrdered()) {
+            // If execute succeeded mark its dependents as complete
+            DPRINTF(TraceCPUData, "Node seq. num %lli sent. Waking up "
+                    "dependents..\n", node_ptr->seqNum);
+
+            auto child_itr = (node_ptr->dependents).begin();
+            while (child_itr != (node_ptr->dependents).end()) {
+                // ROB dependency of a store on a load must not be removed
+                // after load is sent but after response is received
+                if (!(*child_itr)->isStore &&
+                    (*child_itr)->removeRobDep(node_ptr->seqNum)) {
+
+                    // Check if the child node has become dependency free
+                    if ((*child_itr)->numRobDep == 0 &&
+                        (*child_itr)->numRegDep == 0) {
+
+                        // Source dependencies are complete, check if
+                        // resources are available and issue
+                        checkAndIssue(*child_itr);
+                    }
+                    // Remove this child for the sent load and point to new
+                    // location of the element following the erased element
+                    child_itr = node_ptr->dependents.erase(child_itr);
+                } else {
+                    // This child is not dependency-free, point to the next
+                    // child
+                    child_itr++;
+                }
+            }
+        } else {
+            // If it is a strictly ordered load mark its dependents as complete
+            // as we do not send a request for this case. If it is a store or a
+            // comp node we also mark all its dependents complete.
+            DPRINTF(TraceCPUData, "Node seq. num %lli done. Waking"
+                    " up dependents..\n", node_ptr->seqNum);
+
+            for (auto child : node_ptr->dependents) {
+                // If the child node is dependency free removeDepOnInst()
+                // returns true.
+                if (child->removeDepOnInst(node_ptr->seqNum)) {
+                    // Source dependencies are complete, check if resources
+                    // are available and issue
+                    checkAndIssue(child);
+                }
+            }
+        }
+
+        // After executing the node, remove from readyList and delete node.
+        readyList.erase(free_itr);
+        // If it is a cacheable load which was sent, don't delete
+        // just yet.  Delete it in completeMemAccess() after the
+        // response is received. If it is an strictly ordered
+        // load, it was not sent and all dependencies were simply
+        // marked complete. Thus it is safe to delete it. For
+        // stores and non load/store nodes all dependencies were
+        // marked complete so it is safe to delete it.
+        if (!node_ptr->isLoad || node_ptr->isStrictlyOrdered()) {
+            // Release all resources occupied by the completed node
+            hwResource.release(node_ptr);
+            // clear the dynamically allocated set of dependents
+            (node_ptr->dependents).clear();
+            // delete node
+            delete node_ptr;
+            // remove from graph
+            depGraph.erase(graph_itr);
+        }
+        // Point to first node to continue to next iteration of while loop
+        free_itr = readyList.begin();
+    } // end of while loop
+
+    // Print readyList, sizes of queues and resource status after updating
+    if (DTRACE(TraceCPUData)) {
+        printReadyList();
+        DPRINTF(TraceCPUData, "Execute end occupancy:\n");
+        DPRINTFR(TraceCPUData, "\tdepGraph = %d, readyList = %d, "
+                "depFreeQueue = %d ,", depGraph.size(), readyList.size(),
+                depFreeQueue.size());
+        hwResource.printOccupancy();
+    }
+
+    if (retryPkt) {
+        DPRINTF(TraceCPUData, "Not scheduling an event as expecting a retry"
+                "event from the cache for seq. num %lli.\n",
+                retryPkt->req->getReqInstSeqNum());
+        return;
+    }
+    // If the size of the dependency graph is less than the dependency window
+    // then read from the trace file to populate the graph next time we are in
+    // execute.
+    if (depGraph.size() < windowSize && !traceComplete)
+        nextRead = true;
+
+    // If cache is not blocked, schedule an event for the first execTick in
+    // readyList else retry from cache will schedule the event. If the ready
+    // list is empty then check if the next pending node has resources
+    // available to issue. If yes, then schedule an event for the next cycle.
+    if (!readyList.empty()) {
+        Tick next_event_tick = std::max(readyList.begin()->execTick,
+                                        curTick());
+        DPRINTF(TraceCPUData, "Attempting to schedule @%lli.\n",
+                next_event_tick);
+        owner.schedDcacheNextEvent(next_event_tick);
+    } else if (readyList.empty() && !depFreeQueue.empty() &&
+                hwResource.isAvailable(depFreeQueue.front())) {
+        DPRINTF(TraceCPUData, "Attempting to schedule @%lli.\n",
+                owner.clockEdge(Cycles(1)));
+        owner.schedDcacheNextEvent(owner.clockEdge(Cycles(1)));
+    }
+
+    // If trace is completely read, readyList is empty and depGraph is empty,
+    // set execComplete to true
+    if (depGraph.empty() && readyList.empty() && traceComplete &&
+        !hwResource.awaitingResponse()) {
+        DPRINTF(TraceCPUData, "\tExecution Complete!\n");
+        execComplete = true;
+        dataLastTick = curTick();
+    }
+}
+
+/**
+ * Build a read or write request for a load/store node and try to send it.
+ *
+ * Strictly ordered requests are counted and skipped. A request that crosses
+ * a cache line boundary is truncated to end at the line boundary.
+ *
+ * @param node_ptr Load or store node to execute; its size may be reduced.
+ * @return nullptr if the request was sent or skipped, otherwise the packet
+ *         that could not be sent so that the caller can retry it.
+ */
+PacketPtr
+TraceCPU::ElasticDataGen::executeMemReq(GraphNode* node_ptr)
+{
+    DPRINTF(TraceCPUData, "Executing memory request %lli (addr %d, pc %#x, "
+            "size %d, flags %d).\n", node_ptr->seqNum, node_ptr->addr,
+            node_ptr->pc, node_ptr->size, node_ptr->flags);
+
+    // Strictly ordered requests are never sent; report them as if the send
+    // had succeeded.
+    if (node_ptr->isStrictlyOrdered()) {
+        if (node_ptr->isLoad)
+            ++numSOLoads;
+        else
+            ++numSOStores;
+        DPRINTF(TraceCPUData, "Skipping strictly ordered request %lli.\n",
+                node_ptr->seqNum);
+        return nullptr;
+    }
+
+    // A request spanning two cache lines triggers an assert fail in the L1
+    // cache, so clip it at the end of its first line and ignore the rest.
+    // The stat shows how often this happens; the code could be revised to
+    // mimic a split into two requests if required.
+    const unsigned line_bytes = owner.cacheLineSize();
+    const Addr line_offset = (node_ptr->addr & (Addr)(line_bytes - 1));
+    if (line_offset + node_ptr->size > line_bytes) {
+        node_ptr->size = line_bytes - line_offset;
+        ++numSplitReqs;
+    }
+
+    // Request first, then the packet that carries it.
+    Request* mem_req = new Request(node_ptr->addr, node_ptr->size,
+                                   node_ptr->flags, masterID,
+                                   node_ptr->seqNum,
+                                   ContextID(0), ThreadID(0));
+    mem_req->setPC(node_ptr->pc);
+
+    PacketPtr pkt;
+    uint8_t* payload = new uint8_t[mem_req->getSize()];
+    if (node_ptr->isLoad) {
+        pkt = Packet::createRead(mem_req);
+    } else {
+        pkt = Packet::createWrite(mem_req);
+        // Stores carry a fixed fill pattern as data.
+        memset(payload, 0xA, mem_req->getSize());
+    }
+    pkt->dataDynamic(payload);
+
+    // Hand the packet to the port as a timing request.
+    const bool sent = port.sendTimingReq(pkt);
+    ++numSendAttempted;
+
+    if (sent) {
+        ++numSendSucceeded;
+        return nullptr;
+    }
+
+    // The cache did not accept it: keep the packet for the retry.
+    ++numSendFailed;
+    DPRINTF(TraceCPUData, "Send failed. Saving packet for retry.\n");
+    return pkt;
+}
+
+/**
+ * Try to issue a dependency-free node.
+ *
+ * If hardware resources are available the node is added to the ready list
+ * with an execute tick of the current clock edge plus its compute delay,
+ * and the resources are marked occupied. Otherwise, on the first attempt
+ * only, the node is queued in depFreeQueue.
+ *
+ * @param node_ptr Node with no outstanding ROB or register dependencies.
+ * @param first    True when this is the first issue attempt for the node.
+ * @return true if the node was added to the ready list, false otherwise.
+ */
+bool
+TraceCPU::ElasticDataGen::checkAndIssue(const GraphNode* node_ptr, bool first)
+{
+    // Only dependency-free nodes may be passed in.
+    assert(node_ptr->numRobDep == 0 && node_ptr->numRegDep == 0);
+
+    // Announce the node the first time it is seen here.
+    if (first) {
+        DPRINTFR(TraceCPUData, "\t\tseq. num %lli(%s) with rob num %lli is now"
+            " dependency free.\n", node_ptr->seqNum,
+            node_ptr->isLoad ? "L" : (node_ptr->isStore ? "S" : "C"),
+            node_ptr->robNum);
+    }
+
+    if (!hwResource.isAvailable(node_ptr)) {
+        if (first) {
+            // Dependencies are complete but resources are not available, so
+            // park the node until they are.
+            DPRINTFR(TraceCPUData, "\t\tResources unavailable for seq. num %lli."
+                " Adding to depFreeQueue.\n", node_ptr->seqNum);
+            depFreeQueue.push(node_ptr);
+        } else {
+            DPRINTFR(TraceCPUData, "\t\tResources unavailable for seq. num %lli. "
+                "Still pending issue.\n", node_ptr->seqNum);
+        }
+        return false;
+    }
+
+    // Resources are free: the node goes onto the ready list.
+    DPRINTFR(TraceCPUData, "\t\tResources available for seq. num %lli. Adding"
+        " to readyList, occupying resources.\n", node_ptr->seqNum);
+    // Execute tick is the current clock edge plus the node's compute delay.
+    addToSortedReadyList(node_ptr->seqNum,
+                         owner.clockEdge() + node_ptr->compDelay);
+    // Account for the resources taken up by this issued node.
+    hwResource.occupy(node_ptr);
+    return true;
+}
+
+void
+TraceCPU::ElasticDataGen::completeMemAccess(PacketPtr pkt)
+{
+    // Release the resources for this completed node.
+    if (pkt->isWrite()) {
+        // Consider store complete.
+        hwResource.releaseStoreBuffer();
+        // If it is a store response then do nothing since we do not model
+        // dependencies on store completion in the trace. But if we were
+        // blocking execution due to store buffer fullness, we need to schedule
+        // an event and attempt to progress.
+    } else {
+        // If it is a load response then release the dependents waiting on it.
+        // Get pointer to the completed load
+        auto graph_itr = depGraph.find(pkt->req->getReqInstSeqNum());
+        assert(graph_itr != depGraph.end());
+        GraphNode* node_ptr = graph_itr->second;
+
+        // Release resources occupied by the load
+        hwResource.release(node_ptr);
+
+        DPRINTF(TraceCPUData, "Load seq. num %lli response received. Waking up"
+                " dependents..\n", node_ptr->seqNum);
+
+        for (auto child : node_ptr->dependents) {
+            if (child->removeDepOnInst(node_ptr->seqNum)) {
+                checkAndIssue(child);
+            }
+        }
+
+        // clear the dynamically allocated set of dependents
+        (node_ptr->dependents).clear();
+        // delete node
+        delete node_ptr;
+        // remove from graph
+        depGraph.erase(graph_itr);
+    }
+
+    if (DTRACE(TraceCPUData)) {
+        printReadyList();
+    }
+
+    // If the size of the dependency graph is less than the dependency window
+    // then read from the trace file to populate the graph next time we are in
+    // execute.
+    if (depGraph.size() < windowSize && !traceComplete)
+        nextRead = true;
+
+    // If not waiting for retry, attempt to schedule next event
+    if (!retryPkt) {
+        // We might have new dep-free nodes in the list which will have execute
+        // tick greater than or equal to curTick. But a new dep-free node might
+        // have its execute tick earlier. Therefore, attempt to reschedule. It
+        // could happen that the readyList is empty and we got here via a
+        // last remaining response. So, either the trace is complete or there
+        // are pending nodes in the depFreeQueue. The checking is done in the
+        // execute() control flow, so schedule an event to go via that flow.
+        Tick next_event_tick = readyList.empty() ? owner.clockEdge(Cycles(1)) :
+            std::max(readyList.begin()->execTick, owner.clockEdge(Cycles(1)));
+        DPRINTF(TraceCPUData, "Attempting to schedule @%lli.\n",
+                next_event_tick);
+        owner.schedDcacheNextEvent(next_event_tick);
+    }
+}
+
+void
+TraceCPU::ElasticDataGen::addToSortedReadyList(NodeSeqNum seq_num,
+                                                    Tick exec_tick)
+{
+    ReadyNode ready_node;
+    ready_node.seqNum = seq_num;
+    ready_node.execTick = exec_tick;
+
+    // Iterator to readyList
+    auto itr = readyList.begin();
+
+    // If the readyList is empty, simply insert the new node at the beginning
+    // and return
+    if (itr == readyList.end()) {
+        readyList.insert(itr, ready_node);
+        maxReadyListSize = std::max<double>(readyList.size(),
+                                              maxReadyListSize.value());
+        return;
+    }
+
+    // If the new node has its execution tick equal to the first node in the
+    // list then go to the next node. If the first node in the list failed
+    // to execute, its position as the first is thus maintained.
+    if (retryPkt)
+        if (retryPkt->req->getReqInstSeqNum() == itr->seqNum)
+            itr++;
+
+    // Increment the iterator and compare the node pointed to by it to the new
+    // node till the position to insert the new node is found.
+    bool found = false;
+    while (!found && itr != readyList.end()) {
+        // If the execution tick of the new node is less than the node then
+        // this is the position to insert
+        if (exec_tick < itr->execTick)
+            found = true;
+        // If the execution tick of the new node is equal to the node then
+        // sort in ascending order of sequence numbers
+        else if (exec_tick == itr->execTick) {
+            // If the sequence number of the new node is less than the node
+            // then this is the position to insert
+            if (seq_num < itr->seqNum)
+                found = true;
+            // Else go to next node
+            else
+                itr++;
+        }
+        // If the execution tick of the new node is greater than the node then
+        // go to the next node
+        else
+            itr++;
+    }
+    readyList.insert(itr, ready_node);
+    // Update the stat for max size reached of the readyList
+    maxReadyListSize = std::max<double>(readyList.size(),
+                                          maxReadyListSize.value());
+}
+
+void
+TraceCPU::ElasticDataGen::printReadyList() {
+
+    auto itr = readyList.begin();
+    if (itr == readyList.end()) {
+        DPRINTF(TraceCPUData, "readyList is empty.\n");
+        return;
+    }
+    DPRINTF(TraceCPUData, "Printing readyList:\n");
+    while (itr != readyList.end()) {
+        auto graph_itr = depGraph.find(itr->seqNum);
+        GraphNode* node_ptr M5_VAR_USED = graph_itr->second;
+        DPRINTFR(TraceCPUData, "\t%lld(%s), %lld\n", itr->seqNum,
+            node_ptr->isLoad ? "L" : (node_ptr->isStore ? "S" : "C"),
+            itr->execTick);
+        itr++;
+    }
+}
+
+TraceCPU::ElasticDataGen::HardwareResource::HardwareResource(
+    uint16_t max_rob, uint16_t max_stores, uint16_t max_loads)
+  : sizeROB(max_rob), // the three sizes are copied from the arguments
+    sizeStoreBuffer(max_stores),
+    sizeLoadBuffer(max_loads),
+    oldestInFlightRobNum(UINT64_MAX), // max value while nothing is in flight
+    numInFlightLoads(0), // in-flight counters start at zero
+    numInFlightStores(0)
+{}
+
+void
+TraceCPU::ElasticDataGen::HardwareResource::occupy(const GraphNode* new_node)
+{
+    // Account for the ROB entry of the issued node by recording its rob
+    // number under its sequence number, then refresh oldestInFlightRobNum
+    // from the first entry of inFlightNodes.
+    inFlightNodes[new_node->seqNum] = new_node->robNum;
+    oldestInFlightRobNum = inFlightNodes.begin()->second;
+
+    // Only loads and stores take up a buffer entry, a node that is neither
+    // leaves both counters untouched.
+    if (new_node->isLoad) {
+        ++numInFlightLoads;
+    } else {
+        if (new_node->isStore) {
+            ++numInFlightStores;
+        }
+    }
+
+    printOccupancy();
+}
+
+void
+TraceCPU::ElasticDataGen::HardwareResource::release(const GraphNode* done_node)
+{
+    assert(!inFlightNodes.empty());
+    DPRINTFR(TraceCPUData, "\tClearing done seq. num %d from inFlightNodes..\n",
+        done_node->seqNum);
+
+    assert(inFlightNodes.find(done_node->seqNum) != inFlightNodes.end());
+    inFlightNodes.erase(done_node->seqNum);
+
+    if (inFlightNodes.empty()) {
+        // If we delete the only in-flight node and then the
+        // oldestInFlightRobNum is set to it's initialized (max) value.
+        oldestInFlightRobNum = UINT64_MAX;
+    } else {
+        // Set the oldest in-flight node rob number equal to the first node in
+        // the inFlightNodes since that will have the numerically least value.
+        oldestInFlightRobNum = inFlightNodes.begin()->second;
+    }
+
+    DPRINTFR(TraceCPUData, "\tCleared. inFlightNodes.size() = %d, "
+        "oldestInFlightRobNum = %d\n", inFlightNodes.size(),
+        oldestInFlightRobNum);
+
+    // A store is considered complete when a request is sent, thus ROB entry is
+    // freed. But it occupies an entry in the Store Buffer until its response
+    // is received. A load is considered complete when a response is received,
+    // thus both ROB and Load Buffer entries can be released.
+    if (done_node->isLoad) {
+        assert(numInFlightLoads != 0);
+        --numInFlightLoads;
+    }
+    // For normal writes, we send the requests out and clear a store buffer
+    // entry on response. For writes which are strictly ordered, for e.g.
+    // writes to device registers, we do that within release() which is called
+    // when node is executed and taken off from readyList.
+    if (done_node->isStore && done_node->isStrictlyOrdered()) {
+        releaseStoreBuffer();
+    }
+}
+
+void
+TraceCPU::ElasticDataGen::HardwareResource::releaseStoreBuffer()
+{
+    assert(numInFlightStores != 0); // a store must be counted as in flight
+    --numInFlightStores; // one store buffer entry is free again
+}
+
+bool
+TraceCPU::ElasticDataGen::HardwareResource::isAvailable(
+    const GraphNode* new_node) const
+{
+    uint16_t num_in_flight_nodes;
+    if (inFlightNodes.empty()) {
+        num_in_flight_nodes = 0;
+        DPRINTFR(TraceCPUData, "\t\tChecking resources to issue seq. num %lli:"
+            " #in-flight nodes = 0", new_node->seqNum);
+    } else if (new_node->robNum > oldestInFlightRobNum) {
+        // This is the intuitive case where new dep-free node is younger
+        // instruction than the oldest instruction in-flight. Thus we make sure
+        // in_flight_nodes does not overflow.
+        num_in_flight_nodes = new_node->robNum - oldestInFlightRobNum;
+        DPRINTFR(TraceCPUData, "\t\tChecking resources to issue seq. num %lli:"
+            " #in-flight nodes = %d - %d =  %d", new_node->seqNum,
+             new_node->robNum, oldestInFlightRobNum, num_in_flight_nodes);
+    } else {
+        // This is the case where an instruction older than the oldest in-
+        // flight instruction becomes dep-free. Thus we must have already
+        // accounted for the entry in ROB for this new dep-free node.
+        // Immediately after this check returns true, oldestInFlightRobNum will
+        // be updated in occupy(). We simply let this node issue now.
+        num_in_flight_nodes = 0;
+        DPRINTFR(TraceCPUData, "\t\tChecking resources to issue seq. num %lli:"
+            " new oldestInFlightRobNum = %d, #in-flight nodes ignored",
+            new_node->seqNum, new_node->robNum);
+    }
+    DPRINTFR(TraceCPUData, ", LQ = %d/%d, SQ  = %d/%d.\n",
+        numInFlightLoads, sizeLoadBuffer,
+        numInFlightStores, sizeStoreBuffer);
+    // Check if resources are available to issue the specific node
+    if (num_in_flight_nodes >= sizeROB) {
+        return false;
+    }
+    if (new_node->isLoad && numInFlightLoads >= sizeLoadBuffer) {
+        return false;
+    }
+    if (new_node->isStore && numInFlightStores >= sizeStoreBuffer) {
+        return false;
+    }
+    return true;
+}
+
+bool
+TraceCPU::ElasticDataGen::HardwareResource::awaitingResponse() const {
+    // True while at least one load or store is still counted as in flight
+    return (numInFlightStores != 0 || numInFlightLoads != 0);
+}
+
+void
+TraceCPU::ElasticDataGen::HardwareResource::printOccupancy() {
+    DPRINTFR(TraceCPUData, "oldestInFlightRobNum = %d, "
+            "LQ = %d/%d, SQ  = %d/%d.\n",
+            oldestInFlightRobNum,
+            numInFlightLoads, sizeLoadBuffer, // loads in flight / buffer size
+            numInFlightStores, sizeStoreBuffer); // same pair for stores
+}
+
+void
+TraceCPU::FixedRetryGen::regStats()
+{
+    using namespace Stats; // each stat below gets a name and a description
+
+    numSendAttempted
+    .name(name() + ".numSendAttempted")
+    .desc("Number of first attempts to send a request")
+    ;
+
+    numSendSucceeded
+    .name(name() + ".numSendSucceeded")
+    .desc("Number of successful first attempts")
+    ;
+
+    numSendFailed
+    .name(name() + ".numSendFailed")
+    .desc("Number of failed first attempts")
+    ;
+
+    numRetrySucceeded
+    .name(name() + ".numRetrySucceeded")
+    .desc("Number of successful retries")
+    ;
+
+    instLastTick
+    .name(name() + ".instLastTick")
+    .desc("Last tick simulated from the fixed inst trace")
+    ;
+}
+
+Tick
+TraceCPU::FixedRetryGen::init()
+{
+    // Read the first element of the trace and return its tick. A trace from
+    // which not even the first element can be read is treated as fatal.
+    DPRINTF(TraceCPUInst, "Initializing instruction fetch request generator"
+            " IcacheGen: fixed issue with retry.\n");
+
+    if (!nextExecute()) {
+        panic("Read of first message in the trace failed.\n");
+        return MaxTick;
+    }
+
+    DPRINTF(TraceCPUInst, "\tFirst tick = %d.\n", currElement.tick);
+    return currElement.tick;
+}
+
+bool
+TraceCPU::FixedRetryGen::tryNext()
+{
+    // If there is a retry packet, try to send it
+    if (retryPkt) {
+
+        DPRINTF(TraceCPUInst, "Trying to send retry packet.\n");
+
+        if (!port.sendTimingReq(retryPkt)) {
+            // Still blocked! This should never occur.
+            DPRINTF(TraceCPUInst, "Retry packet sending failed.\n");
+            return false;
+        }
+        ++numRetrySucceeded;
+    } else {
+
+        DPRINTF(TraceCPUInst, "Trying to send packet for currElement.\n");
+
+        // try sending current element
+        assert(currElement.isValid());
+
+        ++numSendAttempted;
+
+        if (!send(currElement.addr, currElement.blocksize,
+                    currElement.cmd, currElement.flags, currElement.pc)) {
+            DPRINTF(TraceCPUInst, "currElement sending failed.\n");
+            ++numSendFailed;
+            // return false to indicate not to schedule next event
+            return false;
+        } else {
+            ++numSendSucceeded;
+        }
+    }
+    // If packet was sent successfully, either retryPkt or currElement, return
+    // true to indicate to schedule event at current Tick plus delta. If packet
+    // was sent successfully and there is no next packet to send, return false.
+    DPRINTF(TraceCPUInst, "Packet sent successfully, trying to read next "
+        "element.\n");
+    retryPkt = nullptr;
+    // Read next element into currElement, currElement gets cleared so save the
+    // tick to calculate delta
+    Tick last_tick = currElement.tick;
+    if (nextExecute()) {
+        assert(currElement.tick >= last_tick);
+        delta = currElement.tick - last_tick;
+    }
+    return !traceComplete;
+}
+
+void
+TraceCPU::FixedRetryGen::exit()
+{
+    trace.reset(); // delegates to reset() of the trace member
+}
+
+bool
+TraceCPU::FixedRetryGen::nextExecute()
+{
+    if (traceComplete)
+        // We are at the end of the file, thus we have no more messages.
+        // Return false.
+        return false;
+
+
+    //Reset the currElement to the default values
+    currElement.clear();
+
+    // Read the next line to get the next message. If that fails then end of
+    // trace has been reached and traceComplete needs to be set in addition
+    // to returning false. If successful then next message is in currElement.
+    if (!trace.read(&currElement)) {
+        traceComplete = true;
+        instLastTick = curTick();
+        return false;
+    }
+
+    DPRINTF(TraceCPUInst, "inst fetch: %c addr %d pc %#x size %d tick %d\n",
+            currElement.cmd.isRead() ? 'r' : 'w',
+            currElement.addr,
+            currElement.pc,
+            currElement.blocksize,
+            currElement.tick);
+
+    return true;
+}
+
+bool
+TraceCPU::FixedRetryGen::send(Addr addr, unsigned size, const MemCmd& cmd,
+              Request::FlagsType flags, Addr pc)
+{
+
+    // Create new request
+    Request* req = new Request(addr, size, flags, masterID);
+    req->setPC(pc);
+
+    // If this is not done it triggers assert in L1 cache for invalid contextId
+    req->setThreadContext(ContextID(0), ThreadID(0));
+
+    // Embed it in a packet
+    PacketPtr pkt = new Packet(req, cmd);
+
+    uint8_t* pkt_data = new uint8_t[req->getSize()];
+    pkt->dataDynamic(pkt_data);
+
+    if (cmd.isWrite()) {
+        memset(pkt_data, 0xA, req->getSize());
+    }
+
+    // Call MasterPort method to send a timing request for this packet
+    bool success = port.sendTimingReq(pkt);
+    if (!success) {
+        // If it fails, save the packet to retry when a retry is signalled by
+        // the cache
+        retryPkt = pkt;
+    }
+    return success;
+}
+
+void
+TraceCPU::icacheRetryRecvd()
+{
+    // Schedule icacheNextEvent at the current tick, so the retry is handled
+    // by the event's control flow in the same tick as it is received
+    DPRINTF(TraceCPUInst, "Icache retry received. Scheduling next IcacheGen"
+            " event @%lli.\n", curTick());
+    schedule(icacheNextEvent, curTick());
+}
+
+void
+TraceCPU::dcacheRetryRecvd()
+{
+    // Schedule dcacheNextEvent at the current tick, so the retry is handled
+    // by the execute flow in the same tick as it is received
+    DPRINTF(TraceCPUData, "Dcache retry received. Scheduling next DcacheGen"
+            " event @%lli.\n", curTick());
+    schedule(dcacheNextEvent, curTick());
+}
+
+void
+TraceCPU::schedDcacheNextEvent(Tick when)
+{
+    if (!dcacheNextEvent.scheduled()) {
+        DPRINTF(TraceCPUData, "Scheduling next DcacheGen event at %lli.\n",
+                when);
+        schedule(dcacheNextEvent, when);
+        ++numSchedDcacheEvent;
+    } else if (when < dcacheNextEvent.when()) {
+        DPRINTF(TraceCPUData, "Re-scheduling next dcache event from %lli"
+                " to %lli.\n", dcacheNextEvent.when(), when);
+        reschedule(dcacheNextEvent, when);
+    }
+
+}
+
+bool
+TraceCPU::IcachePort::recvTimingResp(PacketPtr pkt)
+{
+    // Responses on the instruction fetch side are ignored: the request and
+    // the packet are deleted to free their memory and true is returned
+    delete pkt->req;
+    delete pkt;
+
+    return true;
+}
+
+void
+TraceCPU::IcachePort::recvReqRetry()
+{
+    owner->icacheRetryRecvd(); // pass the retry on to the owner
+}
+
+void
+TraceCPU::dcacheRecvTimingResp(PacketPtr pkt)
+{
+    DPRINTF(TraceCPUData, "Received timing response from Dcache.\n");
+    dcacheGen.completeMemAccess(pkt); // the data generator handles it
+}
+
+bool
+TraceCPU::DcachePort::recvTimingResp(PacketPtr pkt)
+{
+    // Pass the response to the owner first; responses to data memory
+    // requests are handled inside the elastic data generator
+    owner->dcacheRecvTimingResp(pkt);
+    // Only after the response has been processed are the request and the
+    // packet deleted to free memory
+    delete pkt->req;
+    delete pkt;
+
+    return true;
+}
+
+void
+TraceCPU::DcachePort::recvReqRetry()
+{
+    owner->dcacheRetryRecvd(); // pass the retry on to the owner
+}
+
+TraceCPU::ElasticDataGen::InputStream::InputStream(const std::string& filename)
+    : trace(filename),
+      microOpCount(0)
+{
+    // Create a protobuf message for the header and read it from the stream
+    ProtoMessage::InstDepRecordHeader header_msg;
+    if (!trace.read(header_msg)) {
+        panic("Failed to read packet header from %s\n", filename);
+
+        if (header_msg.tick_freq() != SimClock::Frequency) {
+            panic("Trace %s was recorded with a different tick frequency %d\n",
+                  header_msg.tick_freq());
+        }
+    } else {
+        // Assign window size equal to the field in the trace that was recorded
+        // when the data dependency trace was captured in the o3cpu model
+        windowSize = header_msg.window_size();
+    }
+}
+
+void
+TraceCPU::ElasticDataGen::InputStream::reset()
+{
+    trace.reset(); // delegates to reset() of the trace member
+}
+
+bool
+TraceCPU::ElasticDataGen::InputStream::read(GraphNode* element)
+{
+    ProtoMessage::InstDepRecord pkt_msg;
+    if (trace.read(pkt_msg)) {
+        // Required fields
+        element->seqNum = pkt_msg.seq_num();
+        element->isLoad = pkt_msg.load();
+        element->isStore = pkt_msg.store();
+        element->compDelay = pkt_msg.comp_delay();
+
+        // Repeated field robDepList
+        element->clearRobDep();
+        assert((pkt_msg.rob_dep()).size() <= element->maxRobDep);
+        for (int i = 0; i < (pkt_msg.rob_dep()).size(); i++) {
+            element->robDep[element->numRobDep] = pkt_msg.rob_dep(i);
+            element->numRobDep += 1;
+        }
+
+        // Repeated field
+        element->clearRegDep();
+        assert((pkt_msg.reg_dep()).size() <= TheISA::MaxInstSrcRegs);
+        for (int i = 0; i < (pkt_msg.reg_dep()).size(); i++) {
+            // There is a possibility that an instruction has both, a register
+            // and order dependency on an instruction. In such a case, the
+            // register dependency is omitted
+            bool duplicate = false;
+            for (int j = 0; j < element->numRobDep; j++) {
+                duplicate |= (pkt_msg.reg_dep(i) == element->robDep[j]);
+            }
+            if (!duplicate) {
+                element->regDep[element->numRegDep] = pkt_msg.reg_dep(i);
+                element->numRegDep += 1;
+            }
+        }
+
+        // Optional fields
+        if (pkt_msg.has_addr())
+            element->addr = pkt_msg.addr();
+        else
+            element->addr = 0;
+
+        if (pkt_msg.has_size())
+            element->size = pkt_msg.size();
+        else
+            element->size = 0;
+
+        if (pkt_msg.has_flags())
+            element->flags = pkt_msg.flags();
+        else
+            element->flags = 0;
+
+        if (pkt_msg.has_pc())
+            element->pc = pkt_msg.pc();
+        else
+            element->pc = 0;
+
+        // ROB occupancy number
+        ++microOpCount;
+        if (pkt_msg.has_weight()) {
+            microOpCount += pkt_msg.weight();
+        }
+        element->robNum = microOpCount;
+        return true;
+    }
+
+    // We have reached the end of the file
+    return false;
+}
+
+bool
+TraceCPU::ElasticDataGen::GraphNode::removeRegDep(NodeSeqNum reg_dep)
+{
+    for (auto& own_reg_dep : regDep) {
+        if (own_reg_dep == reg_dep) {
+            // If register dependency is found, make it zero and return true
+            own_reg_dep = 0;
+            --numRegDep;
+            assert(numRegDep >= 0);
+            DPRINTFR(TraceCPUData, "\tFor %lli: Marking register dependency %lli "
+                    "done.\n", seqNum, reg_dep);
+            return true;
+        }
+    }
+
+    // Return false if the dependency is not found
+    return false;
+}
+
+bool
+TraceCPU::ElasticDataGen::GraphNode::removeRobDep(NodeSeqNum rob_dep)
+{
+    for (auto& own_rob_dep : robDep) {
+        if (own_rob_dep == rob_dep) {
+            // If the rob dependency is found, make it zero and return true
+            own_rob_dep = 0;
+            --numRobDep;
+            assert(numRobDep >= 0);
+            DPRINTFR(TraceCPUData, "\tFor %lli: Marking ROB dependency %lli "
+                "done.\n", seqNum, rob_dep);
+            return true;
+        }
+    }
+    return false;
+}
+
+void
+TraceCPU::ElasticDataGen::GraphNode::clearRegDep() {
+    for (auto& own_reg_dep : regDep) { // zero every register dependency slot
+        own_reg_dep = 0;
+    }
+    numRegDep = 0; // and reset the count to match
+}
+
+void
+TraceCPU::ElasticDataGen::GraphNode::clearRobDep() {
+    for (auto& own_rob_dep : robDep) { // zero every rob dependency slot
+        own_rob_dep = 0;
+    }
+    numRobDep = 0; // and reset the count to match
+}
+
+bool
+TraceCPU::ElasticDataGen::GraphNode::removeDepOnInst(NodeSeqNum done_seq_num)
+{
+    // Remove the dependency of this node on the given completed
+    // instruction. Returns true if no dependency is left afterwards.
+    const bool was_rob_dep = removeRobDep(done_seq_num);
+    if (!was_rob_dep) {
+        // Not an rob dependency, so it has to be a register dependency. A
+        // dependency found in neither list violates an assumption, which
+        // the assert catches.
+        bool regdep_found M5_VAR_USED = removeRegDep(done_seq_num);
+        assert(regdep_found);
+    }
+    // The node is dependency free when both counts have dropped to zero
+    return !(numRobDep != 0 || numRegDep != 0);
+}
+
+void
+TraceCPU::ElasticDataGen::GraphNode::writeElementAsTrace() const
+{
+    DPRINTFR(TraceCPUData, "%lli", seqNum);
+    DPRINTFR(TraceCPUData, ",%s", (isLoad ? "True" : "False"));
+    DPRINTFR(TraceCPUData, ",%s", (isStore ? "True" : "False"));
+    if (isLoad || isStore) {
+        DPRINTFR(TraceCPUData, ",%i", addr);
+        DPRINTFR(TraceCPUData, ",%i", size);
+        DPRINTFR(TraceCPUData, ",%i", flags);
+    }
+    DPRINTFR(TraceCPUData, ",%lli", compDelay);
+    int i = 0;
+    DPRINTFR(TraceCPUData, "robDep:");
+    while (robDep[i] != 0) {
+        DPRINTFR(TraceCPUData, ",%lli", robDep[i]);
+        i++;
+    }
+    i = 0;
+    DPRINTFR(TraceCPUData, "regDep:");
+    while (regDep[i] != 0) {
+        DPRINTFR(TraceCPUData, ",%lli", regDep[i]);
+        i++;
+    }
+    auto child_itr = dependents.begin();
+    DPRINTFR(TraceCPUData, "dependents:");
+    while (child_itr != dependents.end()) {
+        DPRINTFR(TraceCPUData, ":%lli", (*child_itr)->seqNum);
+        child_itr++;
+    }
+
+    DPRINTFR(TraceCPUData, "\n");
+}
+
+TraceCPU::FixedRetryGen::InputStream::InputStream(const std::string& filename)
+    : trace(filename)
+{
+    // Create a protobuf message for the header and read it from the stream
+    ProtoMessage::PacketHeader header_msg;
+    if (!trace.read(header_msg)) {
+        panic("Failed to read packet header from %s\n", filename);
+
+        if (header_msg.tick_freq() != SimClock::Frequency) {
+            panic("Trace %s was recorded with a different tick frequency %d\n",
+                  header_msg.tick_freq());
+        }
+    }
+}
+
+void
+TraceCPU::FixedRetryGen::InputStream::reset()
+{
+    trace.reset(); // delegates to reset() of the trace member
+}
+
+bool
+TraceCPU::FixedRetryGen::InputStream::read(TraceElement* element)
+{
+    ProtoMessage::Packet pkt_msg;
+    if (trace.read(pkt_msg)) {
+        element->cmd = pkt_msg.cmd();
+        element->addr = pkt_msg.addr();
+        element->blocksize = pkt_msg.size();
+        element->tick = pkt_msg.tick();
+        element->flags = pkt_msg.has_flags() ? pkt_msg.flags() : 0;
+        element->pc = pkt_msg.has_pc() ? pkt_msg.pc() : 0;
+        return true;
+    }
+
+    // We have reached the end of the file
+    return false;
+}
diff --git a/src/cpu/trace/trace_cpu.hh b/src/cpu/trace/trace_cpu.hh

new file mode 100644 (file)

index 0000000..3a869eb
--- /dev/null
+++ b/src/cpu/trace/trace_cpu.hh
@@ -0,0 +1,1101 @@
+/*
+ * Copyright (c) 2013 - 2015 ARM Limited
+ * All rights reserved
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder.  You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Authors: Radhika Jagtap
+ *          Andreas Hansson
+ *          Thomas Grass
+ */
+
+#ifndef __CPU_TRACE_TRACE_CPU_HH__
+#define __CPU_TRACE_TRACE_CPU_HH__
+
+#include <array>
+#include <cstdint>
+#include <queue>
+#include <set>
+#include <unordered_map>
+
+#include "arch/registers.hh"
+#include "base/statistics.hh"
+#include "cpu/base.hh"
+#include "debug/TraceCPUData.hh"
+#include "debug/TraceCPUInst.hh"
+#include "params/TraceCPU.hh"
+#include "proto/inst_dep_record.pb.h"
+#include "proto/packet.pb.h"
+#include "proto/protoio.hh"
+#include "sim/sim_events.hh"
+
+/**
+ * The trace cpu replays traces generated using the elastic trace probe
+ * attached to the O3 CPU model. The elastic trace is an execution trace with
+ * register data dependencies and ordering dependencies annotated to it. The
+ * trace cpu also replays a fixed timestamp fetch trace that is also generated
+ * by the elastic trace probe. This trace cpu model aims at achieving faster
+ * simulation compared to the detailed cpu model and good correlation when the
+ * same trace is used for playback on different memory sub-systems.
+ *
+ * The TraceCPU inherits from BaseCPU so some virtual methods need to be
+ * defined. It has two port subclasses inherited from MasterPort for
+ * instruction and data ports. It issues the memory requests deducing the
+ * timing from the trace and without performing real execution of micro-ops. As
+ * soon as the last dependency for an instruction is complete, its
+ * computational delay, also provided in the input trace is added. The
+ * dependency-free nodes are maintained in a list, called 'ReadyList', ordered
+ * by ready time. Instructions which depend on load stall until the responses
+ * for read requests are received thus achieving elastic replay. If the
+ * dependency is not found when adding a new node, it is assumed complete.
+ * Thus, if this node is found to be completely dependency-free its issue time
+ * is calculated and it is added to the ready list immediately. This is
+ * encapsulated in the subclass ElasticDataGen.
+ *
+ * If ready nodes are issued in an unconstrained way there can be more nodes
+ * outstanding which results in divergence in timing compared to the O3CPU.
+ * Therefore, the Trace CPU also models hardware resources. A sub-class to
+ * model hardware resources contains the maximum sizes of load buffer, store
+ * buffer and ROB. If resources are not available, the node is not issued. Such
+ * nodes that are pending issue are held in the 'depFreeQueue' structure.
+ *
+ * Modeling the ROB size in the Trace CPU as a resource limitation is arguably
+ * the most important parameter of all resources. The ROB occupancy is
+ * estimated using the newly added field 'robNum'. We need to use ROB number as
+ * sequence number is at times much higher due to squashing and trace replay is
+ * focused on correct path modeling.
+ *
+ * A map called 'inFlightNodes' tracks not only the nodes that are in the
+ * readyList but also load nodes that have been executed (and thus removed
+ * from the readyList) but are not yet complete. The readyList determines
+ * which node to execute next and when, while inFlightNodes is used for
+ * resource modelling. The oldest ROB number is updated when any node occupies
+ * the ROB or when an entry in the ROB is released. The ROB occupancy is equal
+ * to the difference between the ROB number of the newly dependency-free node
+ * and the oldest ROB number in flight.
+ *
+ * If no node depends on a non load/store node then there is no reason to
+ * track it in the dependency graph. We filter out such nodes but count them
+ * and add a weight field to the subsequent node that we do include in the
+ * trace. The weight field is used to model ROB occupancy during replay.
+ *
+ * The depFreeQueue is chosen to be FIFO so that child nodes which are in
+ * program order get pushed into it in that order and thus issued in program
+ * order, like in the O3CPU. This is also why 'dependents' is a sequential
+ * container, i.e. a std::vector rather than a std::set. We only check the
+ * head of the depFreeQueue as nodes are issued in order and blocking on the
+ * head models that better than looping over the entire queue. An alternative
+ * choice would be to inspect the top N pending nodes where N is the
+ * issue-width. This is left for future work as the timing correlation looks
+ * good as it is.
+ *
+ * At the start of an execution event, first we attempt to issue such pending
+ * nodes by checking if appropriate resources have become available. If yes, we
+ * compute the execute tick with respect to the time then. Then we proceed to
+ * complete nodes from the readyList.
+ *
+ * When a read response is received, sometimes a dependency on it that was
+ * supposed to be released when it was issued is still not released. This
+ * occurs because the dependent gets added to the graph after the read was
+ * sent. So the check is made less strict and the dependency is marked complete
+ * on read response instead of insisting that it should have been removed on
+ * read sent.
+ *
+ * There is a check for requests spanning two cache lines as this condition
+ * triggers an assert fail in the L1 cache. If it does then truncate the size
+ * to access only until the end of that line and ignore the remainder.
+ * Strictly-ordered requests are skipped and the dependencies on such requests
+ * are handled by simply marking them complete immediately.
+ *
+ * The simulated seconds can be calculated as the difference between the
+ * final_tick stat and the tickOffset stat. A CountedExitEvent that contains a
+ * static int belonging to the Trace CPU class as a down counter is used to
+ * implement multi Trace CPU simulation exit.
+ */
+
+class TraceCPU : public BaseCPU
+{
+
+  public:
+    TraceCPU(TraceCPUParams *params);
+    ~TraceCPU();
+
+    void init();
+
+    /**
+     * This is a pure virtual function in BaseCPU. As we don't know how many
+     * insts are in the trace but only know how many micro-ops there are, we
+     * cannot count this stat.
+     *
+     * @return 0
+     */
+    Counter totalInsts() const
+    {
+        return 0;
+    }
+
+    /**
+     * Return totalOps as the number of committed micro-ops plus the
+     * speculatively issued loads that are modelled in the TraceCPU replay.
+     * The count is queried from dcacheGen via getMicroOpCount().
+     *
+     * @return number of micro-ops i.e. nodes in the elastic data generator
+     */
+    Counter totalOps() const
+    {
+        return dcacheGen.getMicroOpCount();
+    }
+
+    /**
+     * Pure virtual function in BaseCPU, so it has to be defined here. It is
+     * intentionally a no-op.
+     *
+     * @param tid thread id, unused
+     */
+    void wakeup(ThreadID tid = 0) { }
+
+    /*
+     * When resuming from checkpoint in FS mode, the TraceCPU takes over from
+     * the old cpu. This function overrides the takeOverFrom() function in the
+     * BaseCPU. It unbinds the ports of the old CPU and binds the ports of the
+     * TraceCPU.
+     */
+    void takeOverFrom(BaseCPU *oldCPU);
+
+    /**
+     * When instruction cache port receives a retry, schedule event
+     * icacheNextEvent.
+     */
+    void icacheRetryRecvd();
+
+    /**
+     * When data cache port receives a retry, schedule event
+     * dcacheNextEvent.
+     */
+    void dcacheRetryRecvd();
+
+    /**
+     * When data cache port receives a response, this calls the dcache
+     * generator method handle to complete the load writeback.
+     *
+     * @param pkt Pointer to packet received
+     */
+    void dcacheRecvTimingResp(PacketPtr pkt);
+
+    /**
+     * Schedule event dcacheNextEvent at the given tick
+     *
+     * @param when Tick at which to schedule event
+     */
+    void schedDcacheNextEvent(Tick when);
+
+  protected:
+
+    /**
+     * IcachePort class that interfaces with L1 Instruction Cache.
+     */
+    class IcachePort : public MasterPort
+    {
+      public:
+        /** Default constructor. */
+        IcachePort(TraceCPU* _cpu)
+            : MasterPort(_cpu->name() + ".icache_port", _cpu),
+                         owner(_cpu)
+        { }
+
+      public:
+        /**
+         * Receive the timing reponse and simply delete the packet since
+         * instruction fetch requests are issued as per the timing in the trace
+         * and responses are ignored.
+         *
+         * @param pkt Pointer to packet received
+         * @return true
+         */
+        bool recvTimingResp(PacketPtr pkt);
+
+        /**
+         * Required functionally but do nothing.
+         *
+         * @param pkt Pointer to packet received
+         */
+        void recvTimingSnoopReq(PacketPtr pkt) { }
+
+        /**
+         * Handle a retry signalled by the cache if instruction read failed in
+         * the first attempt.
+         */
+        void recvReqRetry();
+
+      private:
+        TraceCPU* owner;
+    };
+
+    /**
+     * DcachePort class that interfaces with L1 Data Cache.
+     */
+    class DcachePort : public MasterPort
+    {
+
+      public:
+        /** Default constructor. */
+        DcachePort(TraceCPU* _cpu)
+            : MasterPort(_cpu->name() + ".dcache_port", _cpu),
+                         owner(_cpu)
+        { }
+
+      public:
+
+        /**
+         * Receive the timing reponse and call dcacheRecvTimingResp() method
+         * of the dcacheGen to handle completing the load
+         *
+         * @param pkt Pointer to packet received
+         * @return true
+         */
+        bool recvTimingResp(PacketPtr pkt);
+
+        /**
+         * Required functionally but do nothing.
+         *
+         * @param pkt Pointer to packet received
+         */
+        void recvTimingSnoopReq(PacketPtr pkt)
+        { }
+
+        /**
+         * Required functionally but do nothing.
+         *
+         * @param pkt Pointer to packet received
+         */
+        void recvFunctionalSnoop(PacketPtr pkt)
+        { }
+
+        /**
+         * Handle a retry signalled by the cache if data access failed in the
+         * first attempt.
+         */
+        void recvReqRetry();
+
+        /**
+         * Required functionally.
+         *
+         * @return true since we have to snoop
+         */
+        bool isSnooping() const { return true; }
+
+      private:
+        TraceCPU* owner;
+    };
+
+    /** Port to connect to L1 instruction cache. */
+    IcachePort icachePort;
+
+    /** Port to connect to L1 data cache. */
+    DcachePort dcachePort;
+
+    /** Master id for instruction read requests. */
+    const MasterID instMasterID;
+
+    /** Master id for data read and write requests. */
+    const MasterID dataMasterID;
+
+    /** File names for input instruction and data traces. */
+    std::string instTraceFile, dataTraceFile;
+
+    /**
+     * Generator to read protobuf trace containing memory requests at fixed
+     * timestamps, perform flow control and issue memory requests. If L1 cache
+     * port sends packet succesfully, determine the tick to send the next
+     * packet else wait for retry from cache.
+     */
+    class FixedRetryGen
+    {
+
+      private:
+
+        /**
+         * This struct stores a line in the trace file.
+         */
+        struct TraceElement {
+
+            /** Specifies if the request is to be a read or a write */
+            MemCmd cmd;
+
+            /** The address for the request */
+            Addr addr;
+
+            /** The size of the access for the request */
+            Addr blocksize;
+
+            /** The time at which the request should be sent */
+            Tick tick;
+
+            /** Potential request flags to use */
+            Request::FlagsType flags;
+
+            /** Instruction PC */
+            Addr pc;
+
+            /**
+             * Check validity of this element.
+             *
+             * @return if this element is valid
+             */
+            bool isValid() const {
+                return cmd != MemCmd::InvalidCmd;
+            }
+
+            /**
+             * Make this element invalid.
+             */
+            void clear() {
+                cmd = MemCmd::InvalidCmd;
+            }
+        };
+
+        /**
+         * The InputStream encapsulates a trace file and the
+         * internal buffers and populates TraceElements based on
+         * the input.
+         */
+        class InputStream
+        {
+
+          private:
+
+            // Input file stream for the protobuf trace
+            ProtoInputStream trace;
+
+          public:
+
+            /**
+             * Create a trace input stream for a given file name.
+             *
+             * @param filename Path to the file to read from
+             */
+            InputStream(const std::string& filename);
+
+            /**
+             * Reset the stream such that it can be played once
+             * again.
+             */
+            void reset();
+
+            /**
+             * Attempt to read a trace element from the stream,
+             * and also notify the caller if the end of the file
+             * was reached.
+             *
+             * @param element Trace element to populate
+             * @return True if an element could be read successfully
+             */
+            bool read(TraceElement* element);
+        };
+
+        public:
+        /* Constructor */
+        FixedRetryGen(TraceCPU& _owner, const std::string& _name,
+                   MasterPort& _port, MasterID master_id,
+                   const std::string& trace_file)
+            : owner(_owner),
+              port(_port),
+              masterID(master_id),
+              trace(trace_file),
+              genName(owner.name() + ".fixedretry" + _name),
+              retryPkt(nullptr),
+              delta(0),
+              traceComplete(false)
+        {
+        }
+
+        /**
+         * Called from TraceCPU init(). Reads the first message from the
+         * input trace file and returns the send tick.
+         *
+         * @return Tick when first packet must be sent
+         */
+        Tick init();
+
+        /**
+         * This tries to send current or retry packet and returns true if
+         * successfull. It calls nextExecute() to read next message.
+         *
+         * @return bool true if packet is sent successfully
+         */
+        bool tryNext();
+
+        /** Returns name of the FixedRetryGen instance. */
+        const std::string& name() const { return genName; }
+
+        /**
+         * Creates a new request assigning the request parameters passed by the
+         * arguments. Calls the port's sendTimingReq() and returns true if
+         * the packet was sent succesfully. It is called by tryNext()
+         *
+         * @param addr address of request
+         * @param size size of request
+         * @param cmd if it is a read or write request
+         * @param flags associated request flags
+         * @param pc instruction PC that generated the request
+         *
+         * @return true if packet was sent successfully
+         */
+        bool send(Addr addr, unsigned size, const MemCmd& cmd,
+              Request::FlagsType flags, Addr pc);
+
+        /** Exit the FixedRetryGen. */
+        void exit();
+
+        /**
+         * Reads a line of the trace file. Returns the tick
+         * when the next request should be generated. If the end
+         * of the file has been reached, it returns false.
+         *
+         * @return bool false id end of file has been reached
+         */
+        bool nextExecute();
+
+        /**
+         * Returns the traceComplete variable which is set when end of the
+         * input trace file is reached.
+         *
+         * @return bool true if traceComplete is set, false otherwise.
+         */
+        bool isTraceComplete() { return traceComplete; }
+
+        int64_t tickDelta() { return delta; }
+
+        void regStats();
+
+      private:
+
+        /** Reference of the TraceCPU. */
+        TraceCPU& owner;
+
+        /** Reference of the port to be used to issue memory requests. */
+        MasterPort& port;
+
+        /** MasterID used for the requests being sent. */
+        const MasterID masterID;
+
+        /** Input stream used for reading the input trace file. */
+        InputStream trace;
+
+        /** String to store the name of the FixedRetryGen. */
+        std::string genName;
+
+        /** PacketPtr used to store the packet to retry. */
+        PacketPtr retryPkt;
+
+        /**
+         * Stores the difference in the send ticks of the current and last
+         * packets. Keeping this signed to check overflow to a negative value
+         * which will be caught by assert(delta > 0)
+         */
+        int64_t delta;
+
+        /**
+         * Set to true when end of trace is reached.
+         */
+        bool traceComplete;
+
+        /** Store an element read from the trace to send as the next packet. */
+        TraceElement currElement;
+
+        /** Stats for instruction accesses replayed. */
+        Stats::Scalar numSendAttempted;
+        Stats::Scalar numSendSucceeded;
+        Stats::Scalar numSendFailed;
+        Stats::Scalar numRetrySucceeded;
+        /** Last simulated tick by the FixedRetryGen */
+        Stats::Scalar instLastTick;
+
+    };
+
+    /**
+     * The elastic data memory request generator to read protobuf trace
+     * containing execution trace annotated with data and ordering
+     * dependencies. It deduces the time at which to send a load/store request
+     * by tracking the dependencies. It attempts to send a memory request for a
+     * load/store without performing real execution of micro-ops. If L1 cache
+     * port sends packet successfully, the generator checks which instructions
+     * became dependency free as a result of this and schedules an event
+     * accordingly. If it fails to send the packet, it waits for a retry from
+     * the cache.
+     */
+    class ElasticDataGen
+    {
+
+      private:
+
+        /** Node sequence number type. */
+        typedef uint64_t NodeSeqNum;
+
+        /** Node ROB number type. */
+        typedef uint64_t NodeRobNum;
+
+        /**
+         * The class GraphNode stores an instruction in the trace file. The
+         * format of the trace file favours constructing a dependency graph of
+         * the execution and this class is used to encapsulate the request
+         * data as well as pointers to its dependent GraphNodes.
+         */
+        class GraphNode {
+
+          public:
+            /**
+             * The maximum no. of ROB dependencies. There can be at most 2
+             * order dependencies which could exist for a store. For a load
+             * and comp node there can be at most one order dependency.
+             */
+            static const uint8_t maxRobDep = 2;
+
+            /** Typedef for the array containing the ROB dependencies */
+            typedef std::array<NodeSeqNum, maxRobDep> RobDepArray;
+
+            /** Typedef for the array containing the register dependencies */
+            typedef std::array<NodeSeqNum, TheISA::MaxInstSrcRegs> RegDepArray;
+
+            /** Instruction sequence number */
+            NodeSeqNum seqNum;
+
+            /** ROB occupancy number */
+            NodeRobNum robNum;
+
+            /** If instruction is a load */
+            bool isLoad;
+
+            /** If instruction is a store */
+            bool isStore;
+
+            /** The address for the request if any */
+            Addr addr;
+
+            /** Size of request if any */
+            uint32_t size;
+
+            /** Request flags if any */
+            Request::Flags flags;
+
+            /** Instruction PC */
+            Addr pc;
+
+            /** Array of order dependencies. */
+            RobDepArray robDep;
+
+            /** Number of order dependencies */
+            uint8_t numRobDep;
+
+            /** Computational delay */
+            uint64_t compDelay;
+
+            /**
+             * Array of register dependencies (incoming) if any. Maximum number
+             * of source registers used to set maximum size of the array
+             */
+            RegDepArray regDep;
+
+            /** Number of register dependencies */
+            uint8_t numRegDep;
+
+            /**
+             * A vector of nodes dependent (outgoing) on this node. A
+             * sequential container is chosen because when dependents become
+             * free, they attempt to issue in program order.
+             */
+            std::vector<GraphNode *> dependents;
+
+            /** Initialize register dependency array to all zeroes */
+            void clearRegDep();
+
+            /** Initialize ROB (order) dependency array to all zeroes */
+            void clearRobDep();
+
+            /** Remove completed instruction from register dependency array */
+            bool removeRegDep(NodeSeqNum reg_dep);
+
+            /** Remove completed instruction from order dependency array */
+            bool removeRobDep(NodeSeqNum rob_dep);
+
+            /** Check for all dependencies on completed inst */
+            bool removeDepOnInst(NodeSeqNum done_seq_num);
+
+            /** Return true if node has a request which is strictly ordered */
+            bool isStrictlyOrdered() const {
+                return (flags.isSet(Request::STRICT_ORDER));
+            }
+            /**
+             * Write out element in trace-compatible format using debug flag
+             * TraceCPUData.
+             */
+            void writeElementAsTrace() const;
+        };
+
+        /**
+         * Struct to store a ready-to-execute node and its execution tick.
+         * Ready nodes are kept in the readyList in ascending order of their
+         * execute ticks (see addToSortedReadyList()).
+         */
+        struct ReadyNode
+        {
+            /** The sequence number of the ready node */
+            NodeSeqNum seqNum;
+
+            /** The tick at which the ready node must be executed */
+            Tick execTick;
+        };
+
+        /**
+         * The HardwareResource class models structures that hold the in-flight
+         * nodes. When a node becomes dependency free, first check if resources
+         * are available to issue it.
+         */
+        class HardwareResource
+        {
+          public:
+            /**
+             * Constructor that initializes the sizes of the structures.
+             *
+             * @param max_rob size of the Reorder Buffer
+             * @param max_stores size of Store Buffer
+             * @param max_loads size of Load Buffer
+             */
+            HardwareResource(uint16_t max_rob, uint16_t max_stores,
+                                uint16_t max_loads);
+
+            /**
+             * Occupy appropriate structures for an issued node.
+             *
+             * @param new_node pointer to the issued node
+             */
+            void occupy(const GraphNode* new_node);
+
+            /**
+             * Release appropriate structures for a completed node.
+             *
+             * @param done_node pointer to the completed node
+             */
+            void release(const GraphNode* done_node);
+
+            /** Release store buffer entry for a completed store */
+            void releaseStoreBuffer();
+
+            /**
+             * Check if structures required to issue a node are free.
+             *
+             * @param new_node pointer to the node ready to issue
+             * @return true if resources are available
+             */
+            bool isAvailable(const GraphNode* new_node) const;
+
+            /**
+             * Check if there are any outstanding requests, i.e. requests for
+             * which we are yet to receive a response.
+             *
+             * @return true if there is at least one read or write request
+             *      outstanding
+             */
+            bool awaitingResponse() const;
+
+            /** Print resource occupancy for debugging */
+            void printOccupancy();
+
+          private:
+            /**
+             * The size of the ROB used to throttle the max. number of in-flight
+             * nodes.
+             */
+            const uint16_t sizeROB;
+
+            /**
+             * The size of store buffer. This is used to throttle the max. number
+             * of in-flight stores.
+             */
+            const uint16_t sizeStoreBuffer;
+
+            /**
+             * The size of load buffer. This is used to throttle the max. number
+             * of in-flight loads.
+             */
+            const uint16_t sizeLoadBuffer;
+
+            /**
+             * A map from the sequence number to the ROB number of the in-
+             * flight nodes. This includes all nodes that are in the readyList
+             * plus the loads for which a request has been sent which are not
+             * present in the readyList. But such loads are not yet complete
+             * and thus occupy resources. We need to query the oldest in-flight
+             * node and since a map container keeps all its keys sorted using
+             * the less than criterion, the first element is the in-flight node
+             * with the least sequence number, i.e. the oldest in-flight node.
+             */
+            std::map<NodeSeqNum, NodeRobNum> inFlightNodes;
+
+            /** The ROB number of the oldest in-flight node */
+            NodeRobNum oldestInFlightRobNum;
+
+            /** Number of ready loads for which request may or may not be sent */
+            uint16_t numInFlightLoads;
+
+            /** Number of ready stores for which request may or may not be sent */
+            uint16_t numInFlightStores;
+        };
+
+        /**
+         * The InputStream encapsulates a trace file and the
+         * internal buffers and populates GraphNodes based on
+         * the input.
+         */
+        class InputStream
+        {
+
+          private:
+
+            /** Input file stream for the protobuf trace */
+            ProtoInputStream trace;
+
+            /** Count of committed ops read from trace plus the filtered ops */
+            uint64_t microOpCount;
+
+            /**
+             * The window size that is read from the header of the protobuf
+             * trace and used to process the dependency trace
+             */
+            uint32_t windowSize;
+          public:
+
+            /**
+             * Create a trace input stream for a given file name.
+             *
+             * @param filename Path to the file to read from
+             */
+            InputStream(const std::string& filename);
+
+            /**
+             * Reset the stream such that it can be played once
+             * again.
+             */
+            void reset();
+
+            /**
+             * Attempt to read a trace element from the stream,
+             * and also notify the caller if the end of the file
+             * was reached.
+             *
+             * @param element Trace element (graph node) to populate
+             * @return True if an element could be read successfully
+             */
+            bool read(GraphNode* element);
+
+            /** Get window size from trace */
+            uint32_t getWindowSize() const { return windowSize; }
+
+            /** Get number of micro-ops modelled in the TraceCPU replay */
+            uint64_t getMicroOpCount() const { return microOpCount; }
+        };
+
+        public:
+        /*
+         * Constructor. Sets up the input stream for the given trace file,
+         * takes the window size from that stream and passes the buffer sizes
+         * on to the hardware resource model.
+         *
+         * @param _owner TraceCPU this generator belongs to
+         * @param _name suffix appended to "<owner name>.elastic" to form the
+         *              name of this generator
+         * @param _port port stored for this generator
+         * @param master_id master id stored for this generator
+         * @param trace_file path of the input trace file
+         * @param max_rob size of the Reorder Buffer
+         * @param max_stores size of Store Buffer
+         * @param max_loads size of Load Buffer
+         */
+        ElasticDataGen(TraceCPU& _owner, const std::string& _name,
+                   MasterPort& _port, MasterID master_id,
+                   const std::string& trace_file, uint16_t max_rob,
+                   uint16_t max_stores, uint16_t max_loads)
+            : owner(_owner),
+              port(_port),
+              masterID(master_id),
+              trace(trace_file),
+              genName(owner.name() + ".elastic" + _name),
+              retryPkt(nullptr),
+              traceComplete(false),
+              nextRead(false),
+              execComplete(false),
+              windowSize(trace.getWindowSize()),
+              hwResource(max_rob, max_stores, max_loads)
+        {
+            DPRINTF(TraceCPUData, "Window size in the trace is %d.\n",
+                    windowSize);
+        }
+
+        /**
+         * Called from TraceCPU init(). Reads the first message from the
+         * input trace file and returns the send tick.
+         *
+         * @return Tick when first packet must be sent
+         */
+        Tick init();
+
+        /** Returns name of the ElasticDataGen instance. */
+        const std::string& name() const { return genName; }
+
+        /** Exit the ElasticDataGen. */
+        void exit();
+
+        /**
+         * Reads a line of the trace file. Returns the tick when the next
+         * request should be generated. If the end of the file has been
+         * reached, it returns false.
+         *
+         * @return bool false if end of file has been reached else true
+         */
+        bool readNextWindow();
+
+        /**
+         * Iterate over the dependencies of a new node and add the new node
+         * to the list of dependents of the parent node.
+         *
+         * @tparam  T           type of the dependency array, rob or register
+         * @param   new_node    new node to add to the graph
+         * @param   dep_array   the dependency array of type rob or register,
+         *                      that is to be iterated, and may get modified
+         * @param   num_dep     the number of dependencies set in the array
+         *                      which may get modified during iteration
+         */
+        template<typename T> void addDepsOnParent(GraphNode *new_node,
+                                                    T& dep_array,
+                                                    uint8_t& num_dep);
+
+        /**
+         * This is the main execute function which consumes nodes from the
+         * sorted readyList. First attempt to issue the pending dependency-free
+         * nodes held in the depFreeQueue. Insert the ready-to-issue nodes into
+         * the readyList. Then iterate through the readyList and when a node
+         * has its execute tick equal to curTick(), execute it. If the node is
+         * a load or a store, call executeMemReq() and if it is neither, simply
+         * mark it complete.
+         */
+        void execute();
+
+        /**
+         * Creates a new request for a load or store assigning the request
+         * parameters. Calls the port's sendTimingReq() and returns a packet
+         * if the send failed so that it can be saved for a retry.
+         *
+         * @param node_ptr pointer to the load or store node to be executed
+         *
+         * @return packet pointer if the request failed and nullptr if it was
+         *          sent successfully
+         */
+        PacketPtr executeMemReq(GraphNode* node_ptr);
+
+        /**
+         * Add a ready node to the readyList. When inserting, ensure the nodes
+         * are sorted in ascending order of their execute ticks.
+         *
+         * @param seq_num seq. num of ready node
+         * @param exec_tick the execute tick of the ready node
+         */
+        void addToSortedReadyList(NodeSeqNum seq_num, Tick exec_tick);
+
+        /** Print readyList for debugging using debug flag TraceCPUData. */
+        void printReadyList();
+
+        /**
+         * When a load writeback is received, that is when the load completes,
+         * release the dependents on it. This is called from the dcache port
+         * recvTimingResp().
+         */
+        void completeMemAccess(PacketPtr pkt);
+
+        /**
+         * Returns the execComplete variable which is set when the last
+         * node is executed.
+         *
+         * @return bool true if execComplete is set, false otherwise.
+         */
+        bool isExecComplete() const { return execComplete; }
+
+        /**
+         * Attempts to issue a node once the node's source dependencies are
+         * complete. If resources are available then add it to the readyList,
+         * otherwise the node is not issued and is stored in depFreeQueue
+         * until resources become available.
+         *
+         * @param node_ptr pointer to node to be issued
+         * @param first true if this is the first attempt to issue this node
+         * @return true if node was added to readyList
+         */
+        bool checkAndIssue(const GraphNode* node_ptr, bool first = true);
+
+        /** Get number of micro-ops modelled in the TraceCPU replay */
+        uint64_t getMicroOpCount() const { return trace.getMicroOpCount(); }
+
+        void regStats();
+
+      private:
+
+        /** Reference of the TraceCPU. */
+        TraceCPU& owner;
+
+        /** Reference of the port to be used to issue memory requests. */
+        MasterPort& port;
+
+        /** MasterID used for the requests being sent. */
+        const MasterID masterID;
+
+        /** Input stream used for reading the input trace file. */
+        InputStream trace;
+
+        /** String to store the name of the ElasticDataGen. */
+        std::string genName;
+
+        /** PacketPtr used to store the packet to retry. */
+        PacketPtr retryPkt;
+
+        /** Set to true when end of trace is reached. */
+        bool traceComplete;
+
+        /** Set to true when the next window of instructions needs to be read */
+        bool nextRead;
+
+        /** Set true when execution of trace is complete */
+        bool execComplete;
+
+        /**
+         * Window size within which to check for dependencies. Its value is
+         * made equal to the window size used to generate the trace which is
+         * recorded in the trace header. The dependency graph must be
+         * populated enough such that when a node completes, its potential
+         * child node must be found and the dependency removed before the
+         * completed node itself is removed. Thus as soon as the graph shrinks
+         * to become smaller than this window, we read in the next window.
+         */
+        const uint32_t windowSize;
+
+        /**
+         * Hardware resources required to contain in-flight nodes and to
+         * throttle issuing of new nodes when resources are not available.
+         */
+        HardwareResource hwResource;
+
+        /** Store the depGraph of GraphNodes */
+        std::unordered_map<NodeSeqNum, GraphNode*> depGraph;
+
+        /**
+         * Queue of dependency-free nodes that are pending issue because
+         * resources are not available. This is chosen to be FIFO so that
+         * dependent nodes which become free in program order get pushed
+         * into the queue in that order. Thus nodes are more likely to
+         * issue in program order.
+         */
+        std::queue<const GraphNode*> depFreeQueue;
+
+        /** List of nodes that are ready to execute */
+        std::list<ReadyNode> readyList;
+
+        /** Stats for data memory accesses replayed. */
+        Stats::Scalar maxDependents;
+        Stats::Scalar maxReadyListSize;
+        Stats::Scalar numSendAttempted;
+        Stats::Scalar numSendSucceeded;
+        Stats::Scalar numSendFailed;
+        Stats::Scalar numRetrySucceeded;
+        Stats::Scalar numSplitReqs;
+        Stats::Scalar numSOLoads;
+        Stats::Scalar numSOStores;
+        /** Tick when ElasticDataGen completes execution */
+        Stats::Scalar dataLastTick;
+    };
+
+    /** Instance of FixedRetryGen to replay instruction read requests. */
+    FixedRetryGen icacheGen;
+
+    /** Instance of ElasticDataGen to replay data read and write requests. */
+    ElasticDataGen dcacheGen;
+
+    /**
+     * This is the control flow that uses the functionality of the icacheGen to
+     * replay the trace. It calls tryNext(). If it returns true then next event
+     * is scheduled at curTick() plus delta. If it returns false then delta is
+     * ignored and control is brought back via recvRetry().
+     */
+    void schedIcacheNext();
+
+    /**
+     * This is the control flow that uses the functionality of the dcacheGen to
+     * replay the trace. It calls execute(). It checks if execution is complete
+     * and schedules an event to exit simulation accordingly.
+     */
+    void schedDcacheNext();
+
+    /** Event for the control flow method schedIcacheNext() */
+    EventWrapper<TraceCPU, &TraceCPU::schedIcacheNext> icacheNextEvent;
+
+    /** Event for the control flow method schedDcacheNext() */
+    EventWrapper<TraceCPU, &TraceCPU::schedDcacheNext> dcacheNextEvent;
+
+    /** This is called when either generator finishes executing from the trace */
+    void checkAndSchedExitEvent();
+
+    /** Set to true when one of the generators finishes replaying its trace. */
+    bool oneTraceComplete;
+
+    /**
+     * This stores the tick of the first instruction fetch request
+     * which is later used for dumping the tickOffset stat.
+     */
+    Tick firstFetchTick;
+
+    /**
+     * Number of Trace CPUs in the system used as a shared variable and passed
+     * to the CountedExitEvent event used for counting down exit events.  It is
+     * incremented in the constructor call so that the total is arrived at
+     * automatically.
+     */
+    static int numTraceCPUs;
+
+    /**
+     * A CountedExitEvent which when serviced decrements the counter. A sim
+     * exit event is scheduled when the counter equals zero, that is, when all
+     * instances of Trace CPU have had their execCompleteEvent serviced.
+     */
+    CountedExitEvent *execCompleteEvent;
+
+    Stats::Scalar numSchedDcacheEvent;
+    Stats::Scalar numSchedIcacheEvent;
+
+    /** Stat for number of simulated micro-ops. */
+    Stats::Scalar numOps;
+    /** Stat for the CPI. This is really cycles per micro-op and not inst. */
+    Stats::Formula cpi;
+
+    /**
+     * The first execution tick is dumped as a stat so that the simulated
+     * seconds for a trace replay can be calculated as a difference between the
+     * final_tick stat and the tickOffset stat
+     */
+    Stats::Scalar tickOffset;
+
+  public:
+
+    /** Used to get a reference to the icache port. */
+    MasterPort &getInstPort() { return icachePort; }
+
+    /** Used to get a reference to the dcache port. */
+    MasterPort &getDataPort() { return dcachePort; }
+
+    void regStats();
+};
+#endif // __CPU_TRACE_TRACE_CPU_HH__
author	Radhika Jagtap <radhika.jagtap@ARM.com>
	Mon, 7 Dec 2015 22:42:15 +0000 (16:42 -0600)
committer	Radhika Jagtap <radhika.jagtap@ARM.com>
	Mon, 7 Dec 2015 22:42:15 +0000 (16:42 -0600)
src/cpu/trace/SConscript	[new file with mode: 0644]	patch \| blob
src/cpu/trace/TraceCPU.py	[new file with mode: 0644]	patch \| blob
src/cpu/trace/trace_cpu.cc	[new file with mode: 0644]	patch \| blob
src/cpu/trace/trace_cpu.hh	[new file with mode: 0644]	patch \| blob