;
     }
 
+// These macros make it easier to sum the right subset of commands and
+// to change the subset of commands that are considered "demand" vs
+// "non-demand"
+#define SUM_DEMAND(s) \
+    (s[MemCmd::ReadReq] + s[MemCmd::WriteReq] + s[MemCmd::ReadExReq])
+
+// should writebacks be included here?  prior code was inconsistent...
+#define SUM_NON_DEMAND(s) \
+    (s[MemCmd::SoftPFReq] + s[MemCmd::HardPFReq])
+
     demandHits
         .name(name() + ".demand_hits")
         .desc("number of demand (read+write) hits")
         .flags(total)
         ;
-    demandHits = hits[MemCmd::ReadReq] + hits[MemCmd::WriteReq];
+    demandHits = SUM_DEMAND(hits);
 
     overallHits
         .name(name() + ".overall_hits")
         .desc("number of overall hits")
         .flags(total)
         ;
-    overallHits = demandHits + hits[MemCmd::SoftPFReq] + hits[MemCmd::HardPFReq]
-        + hits[MemCmd::Writeback];
+    overallHits = demandHits + SUM_NON_DEMAND(hits);
 
     // Miss statistics
     for (int access_idx = 0; access_idx < MemCmd::NUM_MEM_CMDS; ++access_idx) {
         .desc("number of demand (read+write) misses")
         .flags(total)
         ;
-    demandMisses = misses[MemCmd::ReadReq] + misses[MemCmd::WriteReq];
+    demandMisses = SUM_DEMAND(misses);
 
     overallMisses
         .name(name() + ".overall_misses")
         .desc("number of overall misses")
         .flags(total)
         ;
-    overallMisses = demandMisses + misses[MemCmd::SoftPFReq] +
-        misses[MemCmd::HardPFReq] + misses[MemCmd::Writeback];
+    overallMisses = demandMisses + SUM_NON_DEMAND(misses);
 
     // Miss latency statistics
     for (int access_idx = 0; access_idx < MemCmd::NUM_MEM_CMDS; ++access_idx) {
         .desc("number of demand (read+write) miss cycles")
         .flags(total)
         ;
-    demandMissLatency = missLatency[MemCmd::ReadReq] + missLatency[MemCmd::WriteReq];
+    demandMissLatency = SUM_DEMAND(missLatency);
 
     overallMissLatency
         .name(name() + ".overall_miss_latency")
         .desc("number of overall miss cycles")
         .flags(total)
         ;
-    overallMissLatency = demandMissLatency + missLatency[MemCmd::SoftPFReq] +
-        missLatency[MemCmd::HardPFReq];
+    overallMissLatency = demandMissLatency + SUM_NON_DEMAND(missLatency);
 
     // access formulas
     for (int access_idx = 0; access_idx < MemCmd::NUM_MEM_CMDS; ++access_idx) {
         .desc("number of demand (read+write) MSHR hits")
         .flags(total)
         ;
-    demandMshrHits = mshr_hits[MemCmd::ReadReq] + mshr_hits[MemCmd::WriteReq];
+    demandMshrHits = SUM_DEMAND(mshr_hits);
 
     overallMshrHits
         .name(name() + ".overall_mshr_hits")
         .desc("number of overall MSHR hits")
         .flags(total)
         ;
-    overallMshrHits = demandMshrHits + mshr_hits[MemCmd::SoftPFReq] +
-        mshr_hits[MemCmd::HardPFReq];
+    overallMshrHits = demandMshrHits + SUM_NON_DEMAND(mshr_hits);
 
     // MSHR miss statistics
     for (int access_idx = 0; access_idx < MemCmd::NUM_MEM_CMDS; ++access_idx) {
         .desc("number of demand (read+write) MSHR misses")
         .flags(total)
         ;
-    demandMshrMisses = mshr_misses[MemCmd::ReadReq] + mshr_misses[MemCmd::WriteReq];
+    demandMshrMisses = SUM_DEMAND(mshr_misses);
 
     overallMshrMisses
         .name(name() + ".overall_mshr_misses")
         .desc("number of overall MSHR misses")
         .flags(total)
         ;
-    overallMshrMisses = demandMshrMisses + mshr_misses[MemCmd::SoftPFReq] +
-        mshr_misses[MemCmd::HardPFReq];
+    overallMshrMisses = demandMshrMisses + SUM_NON_DEMAND(mshr_misses);
 
     // MSHR miss latency statistics
     for (int access_idx = 0; access_idx < MemCmd::NUM_MEM_CMDS; ++access_idx) {
         .desc("number of demand (read+write) MSHR miss cycles")
         .flags(total)
         ;
-    demandMshrMissLatency = mshr_miss_latency[MemCmd::ReadReq]
-        + mshr_miss_latency[MemCmd::WriteReq];
+    demandMshrMissLatency = SUM_DEMAND(mshr_miss_latency);
 
     overallMshrMissLatency
         .name(name() + ".overall_mshr_miss_latency")
         .desc("number of overall MSHR miss cycles")
         .flags(total)
         ;
-    overallMshrMissLatency = demandMshrMissLatency +
-        mshr_miss_latency[MemCmd::SoftPFReq] + mshr_miss_latency[MemCmd::HardPFReq];
+    overallMshrMissLatency =
+        demandMshrMissLatency + SUM_NON_DEMAND(mshr_miss_latency);
 
     // MSHR uncacheable statistics
     for (int access_idx = 0; access_idx < MemCmd::NUM_MEM_CMDS; ++access_idx) {
         .desc("number of overall MSHR uncacheable misses")
         .flags(total)
         ;
-    overallMshrUncacheable = mshr_uncacheable[MemCmd::ReadReq]
-        + mshr_uncacheable[MemCmd::WriteReq] + mshr_uncacheable[MemCmd::SoftPFReq]
-        + mshr_uncacheable[MemCmd::HardPFReq];
+    overallMshrUncacheable =
+        SUM_DEMAND(mshr_uncacheable) + SUM_NON_DEMAND(mshr_uncacheable);
 
     // MSHR miss latency statistics
     for (int access_idx = 0; access_idx < MemCmd::NUM_MEM_CMDS; ++access_idx) {
         .desc("number of overall MSHR uncacheable cycles")
         .flags(total)
         ;
-    overallMshrUncacheableLatency = mshr_uncacheable_lat[MemCmd::ReadReq]
-        + mshr_uncacheable_lat[MemCmd::WriteReq]
-        + mshr_uncacheable_lat[MemCmd::SoftPFReq]
-        + mshr_uncacheable_lat[MemCmd::HardPFReq];
+    overallMshrUncacheableLatency =
+        SUM_DEMAND(mshr_uncacheable_lat) +
+        SUM_NON_DEMAND(mshr_uncacheable_lat);
 
 #if 0
     // MSHR access formulas
 
         }
     }
 
-    Tick nextMSHRReadyTick()
+    Tick nextMSHRReadyTime()
     {
-        return std::min(mshrQueue.nextMSHRReadyTick(),
-                        writeBuffer.nextMSHRReadyTick());
+        return std::min(mshrQueue.nextMSHRReadyTime(),
+                        writeBuffer.nextMSHRReadyTime());
     }
 
     /**
 
 
             if (!target->pkt->req->isUncacheable()) {
                 missLatency[target->pkt->cmdToIndex()][0/*pkt->req->getThreadNum()*/] +=
-                    completion_time - target->time;
+                    completion_time - target->recvTime;
             }
             target->pkt->makeTimingResponse();
             cpuSidePort->respond(target->pkt, completion_time);
     // Can we deallocate MSHR when done?
     bool deallocate = false;
 
+    // Initial target is used just for stats
+    MSHR::Target *initial_tgt = mshr->getTarget();
+    int stats_cmd_idx = initial_tgt->pkt->cmdToIndex();
+    Tick miss_latency = curTick - initial_tgt->recvTime;
+
     if (mshr->isCacheFill) {
-#if 0
-        mshr_miss_latency[mshr->originalCmd.toInt()][0/*pkt->req->getThreadNum()*/] +=
-            curTick - pkt->time;
-#endif
+        mshr_miss_latency[stats_cmd_idx][0/*pkt->req->getThreadNum()*/] +=
+            miss_latency;
         DPRINTF(Cache, "Block for addr %x being updated in Cache\n",
                 pkt->getAddr());
         BlkType *blk = tags->findBlock(pkt->getAddr());
         }
     } else {
         if (pkt->req->isUncacheable()) {
-            mshr_uncacheable_lat[pkt->cmd.toInt()][0/*pkt->req->getThreadNum()*/] +=
-                curTick - pkt->time;
+            mshr_uncacheable_lat[stats_cmd_idx][0/*pkt->req->getThreadNum()*/] +=
+                miss_latency;
         }
 
         while (mshr->hasTargets()) {
     // tried to send packet... if it was successful (no retry), see if
     // we need to rerequest bus or not
     if (!waitingOnRetry) {
-        Tick nextReady = std::min(deferredPacketReadyTick(),
-                                  myCache()->nextMSHRReadyTick());
+        Tick nextReady = std::min(deferredPacketReadyTime(),
+                                  myCache()->nextMSHRReadyTime());
         // @TODO: need to facotr in prefetch requests here somehow
         if (nextReady != MaxTick) {
             DPRINTF(CachePort, "more packets to send @ %d\n", nextReady);
 
 
 void
 MSHR::allocate(Addr _addr, int _size, PacketPtr target,
-               Tick when, Counter _order)
+               Tick whenReady, Counter _order)
 {
     addr = _addr;
     size = _size;
-    readyTick = when;
+    readyTime = whenReady;
     order = _order;
     assert(target);
     isCacheFill = false;
     ntargets = 1;
     // Don't know of a case where we would allocate a new MSHR for a
     // snoop (mem-side request), so set cpuSide to true here.
-    targets.push_back(Target(target, when, _order, true));
+    targets.push_back(Target(target, whenReady, _order, true));
     assert(deferredTargets.empty());
     deferredNeedsExclusive = false;
     pendingInvalidate = false;
  * Adds a target to an MSHR
  */
 void
-MSHR::allocateTarget(PacketPtr target, Tick when, Counter _order)
+MSHR::allocateTarget(PacketPtr target, Tick whenReady, Counter _order)
 {
     if (inService) {
         if (!deferredTargets.empty() || pendingInvalidate ||
             (!needsExclusive && target->needsExclusive())) {
             // need to put on deferred list
-            deferredTargets.push_back(Target(target, when, _order, true));
+            deferredTargets.push_back(Target(target, whenReady, _order, true));
             if (target->needsExclusive()) {
                 deferredNeedsExclusive = true;
             }
         } else {
             // still OK to append to outstanding request
-            targets.push_back(Target(target, when, _order, true));
+            targets.push_back(Target(target, whenReady, _order, true));
         }
     } else {
         if (target->needsExclusive()) {
             needsExclusive = true;
         }
 
-        targets.push_back(Target(target, when, _order, true));
+        targets.push_back(Target(target, whenReady, _order, true));
     }
 
     ++ntargets;
 }
 
 void
-MSHR::allocateSnoopTarget(PacketPtr pkt, Tick when, Counter _order)
+MSHR::allocateSnoopTarget(PacketPtr pkt, Tick whenReady, Counter _order)
 {
     assert(inService); // don't bother to call otherwise
 
     if (needsExclusive || pkt->needsExclusive()) {
         // actual target device (typ. PhysicalMemory) will delete the
         // packet on reception, so we need to save a copy here
-        targets.push_back(Target(new Packet(pkt), when, _order, false));
+        targets.push_back(Target(new Packet(pkt), whenReady, _order, false));
         ++ntargets;
 
         if (needsExclusive) {
     pendingShared = false;
     deferredNeedsExclusive = false;
     order = targets.front().order;
-    readyTick = std::max(curTick, targets.front().time);
+    readyTime = std::max(curTick, targets.front().readyTime);
 
     return true;
 }
 
 
     class Target {
       public:
-        Tick time;      //!< Time when request was received (for stats)
+        Tick recvTime;  //!< Time when request was received (for stats)
+        Tick readyTime; //!< Time when request is ready to be serviced
         Counter order;  //!< Global order (for memory consistency mgmt)
         PacketPtr pkt;  //!< Pending request packet.
         bool cpuSide;   //!< Did request come from cpu side or mem side?
 
         bool isCpuSide() { return cpuSide; }
 
-        Target(PacketPtr _pkt, Tick _time, Counter _order, bool _cpuSide)
-            : time(_time), order(_order), pkt(_pkt), cpuSide(_cpuSide)
+        Target(PacketPtr _pkt, Tick _readyTime, Counter _order, bool _cpuSide)
+            : recvTime(curTick), readyTime(_readyTime), order(_order),
+              pkt(_pkt), cpuSide(_cpuSide)
         {}
     };
 
     MSHRQueue *queue;
 
     /** Cycle when ready to issue */
-    Tick readyTick;
+    Tick readyTime;
 
     /** Order number assigned by the miss queue. */
     Counter order;
 
 MSHR::Iterator
 MSHRQueue::addToReadyList(MSHR *mshr)
 {
-    if (readyList.empty() || readyList.back()->readyTick <= mshr->readyTick) {
+    if (readyList.empty() || readyList.back()->readyTime <= mshr->readyTime) {
         return readyList.insert(readyList.end(), mshr);
     }
 
     MSHR::Iterator i = readyList.begin();
     MSHR::Iterator end = readyList.end();
     for (; i != end; ++i) {
-        if ((*i)->readyTick > mshr->readyTick) {
+        if ((*i)->readyTime > mshr->readyTime) {
             return readyList.insert(i, mshr);
         }
     }
 
      */
     MSHR *getNextMSHR() const
     {
-        if (readyList.empty() || readyList.front()->readyTick > curTick) {
+        if (readyList.empty() || readyList.front()->readyTime > curTick) {
             return NULL;
         }
         return readyList.front();
     }
 
-    Tick nextMSHRReadyTick() const
+    Tick nextMSHRReadyTime() const
     {
-        return readyList.empty() ? MaxTick : readyList.front()->readyTick;
+        return readyList.empty() ? MaxTick : readyList.front()->readyTime;
     }
 };
 
 
     bool deferredPacketReady()
     { return !transmitList.empty() && transmitList.front().tick <= curTick; }
 
-    Tick deferredPacketReadyTick()
+    Tick deferredPacketReadyTime()
     { return transmitList.empty() ? MaxTick : transmitList.front().tick; }
 
     void schedSendEvent(Tick when)