From 93c0fdfb1232cb32ba6b77d0f90be0bff5b63119 Mon Sep 17 00:00:00 2001
From: Gabe Black <gabe.black@gmail.com>
Date: Thu, 24 Dec 2020 04:15:56 -0800
Subject: [PATCH] dev: Generate DMA packets as needed.

Instead of generating all of the DMA packets when a request is
initiated, keep track of the necessary properties and generate them as
needed.

The primary benefit of this approach is that if we don't actually need
packets, for instance if we can satisfy the request using a memory
backdoor, we can just skip them. Otherwise we'll have wasted time
creating them, and then have to clean them up.

Change-Id: I04d399fb7bce1ff9a44979c311be356baf2db555
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/38717
Reviewed-by: Andreas Sandberg <andreas.sandberg@arm.com>
Maintainer: Andreas Sandberg <andreas.sandberg@arm.com>
Tested-by: kokoro <noreply+kokoro@google.com>
---
 src/dev/dma_device.cc | 150 +++++++++++++++++++++---------------------
 src/dev/dma_device.hh |  39 ++++++++---
 2 files changed, 106 insertions(+), 83 deletions(-)
diff --git a/src/dev/dma_device.cc b/src/dev/dma_device.cc
index da3583378..f60cba896 100644
--- a/src/dev/dma_device.cc
+++ b/src/dev/dma_device.cc
@@ -42,7 +42,6 @@
 
 #include <utility>
 
-#include "base/chunk_generator.hh"
 #include "debug/DMA.hh"
 #include "debug/Drain.hh"
 #include "sim/clocked_object.hh"
@@ -59,10 +58,10 @@ DmaPort::DmaPort(ClockedObject *dev, System *s,
 void
 DmaPort::handleResp(PacketPtr pkt, Tick delay)
 {
-    // should always see a response with a sender state
+    // Should always see a response with a sender state.
     assert(pkt->isResponse());
 
-    // get the DMA sender state
+    // Get the DMA sender state.
     auto *state = dynamic_cast<DmaReqState*>(pkt->senderState);
     assert(state);
 
@@ -73,17 +72,16 @@ DmaPort::handleResp(PacketPtr pkt, Tick delay)
             state->completionEvent ?
             state->completionEvent->scheduled() : 0);
 
-    assert(pendingCount != 0);
-    pendingCount--;
-
-    // update the number of bytes received based on the request rather
-    // than the packet as the latter could be rounded up to line sizes
+    // Update the number of bytes received based on the request rather
+    // than the packet as the latter could be rounded up to line sizes.
     state->numBytes += pkt->req->getSize();
     assert(state->totBytes >= state->numBytes);
 
-    // if we have reached the total number of bytes for this DMA
-    // request, then signal the completion and delete the sate
+    // If we have reached the total number of bytes for this DMA request,
+    // then signal the completion and delete the sate.
     if (state->totBytes == state->numBytes) {
+        assert(pendingCount != 0);
+        pendingCount--;
         if (state->completionEvent) {
             delay += state->delay;
             device->schedule(state->completionEvent, curTick() + delay);
@@ -91,18 +89,35 @@ DmaPort::handleResp(PacketPtr pkt, Tick delay)
         delete state;
     }
 
-    // delete the packet
     delete pkt;
 
-    // we might be drained at this point, if so signal the drain event
+    // We might be drained at this point, if so signal the drain event.
     if (pendingCount == 0)
         signalDrainDone();
 }
 
+PacketPtr
+DmaPort::DmaReqState::createPacket()
+{
+    RequestPtr req = std::make_shared<Request>(
+            gen.addr(), gen.size(), flags, id);
+    req->setStreamId(sid);
+    req->setSubstreamId(ssid);
+    req->taskId(ContextSwitchTaskId::DMA);
+
+    PacketPtr pkt = new Packet(req, cmd);
+
+    if (data)
+        pkt->dataStatic(data + gen.complete());
+
+    pkt->senderState = this;
+    return pkt;
+}
+
 bool
 DmaPort::recvTimingResp(PacketPtr pkt)
 {
-    // We shouldn't ever get a cacheable block in Modified state
+    // We shouldn't ever get a cacheable block in Modified state.
     assert(pkt->req->isUncacheable() ||
            !(pkt->cacheResponding() && !pkt->hasSharers()));
 
@@ -146,41 +161,20 @@ DmaPort::dmaAction(Packet::Command cmd, Addr addr, int size, Event *event,
                    uint8_t *data, uint32_t sid, uint32_t ssid, Tick delay,
                    Request::Flags flag)
 {
-    // one DMA request sender state for every action, that is then
-    // split into many requests and packets based on the block size,
-    // i.e. cache line size
-    DmaReqState *reqState = new DmaReqState(event, size, delay);
-
-    RequestPtr req = nullptr;
-
     DPRINTF(DMA, "Starting DMA for addr: %#x size: %d sched: %d\n", addr, size,
             event ? event->scheduled() : -1);
-    for (ChunkGenerator gen(addr, size, cacheLineSize);
-         !gen.done(); gen.next()) {
-
-        req = std::make_shared<Request>(
-            gen.addr(), gen.size(), flag, requestorId);
-
-        req->setStreamId(sid);
-        req->setSubstreamId(ssid);
-
-        req->taskId(ContextSwitchTaskId::DMA);
-        PacketPtr pkt = new Packet(req, cmd);
-
-        // Increment the data pointer on a write
-        if (data)
-            pkt->dataStatic(data + gen.complete());
 
-        pkt->senderState = reqState;
-
-        DPRINTF(DMA, "--Queuing DMA for addr: %#x size: %d\n", gen.addr(),
-                gen.size());
-        queueDma(pkt);
-    }
+    // One DMA request sender state for every action, that is then
+    // split into many requests and packets based on the block size,
+    // i.e. cache line size.
+    transmitList.push_back(
+            new DmaReqState(cmd, addr, cacheLineSize, size,
+                data, flag, requestorId, sid, ssid, event, delay));
+    pendingCount++;
 
-    // in zero time also initiate the sending of the packets we have
-    // just created, for atomic this involves actually completing all
-    // the requests
+    // In zero time, also initiate the sending of the packets for the request
+    // we have just created. For atomic this involves actually completing all
+    // the requests.
     sendDma();
 }
 
@@ -192,51 +186,51 @@ DmaPort::dmaAction(Packet::Command cmd, Addr addr, int size, Event *event,
               defaultSid, defaultSSid, delay, flag);
 }
 
-void
-DmaPort::queueDma(PacketPtr pkt)
-{
-    transmitList.push_back(pkt);
-
-    // remember that we have another packet pending, this will only be
-    // decremented once a response comes back
-    pendingCount++;
-}
-
 void
 DmaPort::trySendTimingReq()
 {
-    // send the first packet on the transmit list and schedule the
-    // following send if it is successful
-    PacketPtr pkt = transmitList.front();
+    // Send the next packet for the first DMA request on the transmit list,
+    // and schedule the following send if it is successful
+    DmaReqState *state = transmitList.front();
+
+    PacketPtr pkt = inRetry ? inRetry : state->createPacket();
+    inRetry = nullptr;
 
     DPRINTF(DMA, "Trying to send %s addr %#x\n", pkt->cmdString(),
             pkt->getAddr());
 
-    inRetry = !sendTimingReq(pkt);
+    // Check if this was the last packet now, since hypothetically the packet
+    // response may come immediately, and state may be deleted.
+    bool last = state->gen.last();
+    if (!sendTimingReq(pkt))
+        inRetry = pkt;
     if (!inRetry) {
-        transmitList.pop_front();
+        // If that was the last packet from this request, pop it from the list.
+        if (last)
+            transmitList.pop_front();
+        else
+            state->gen.next();
         DPRINTF(DMA, "-- Done\n");
-        // if there is more to do, then do so
-        if (!transmitList.empty())
-            // this should ultimately wait for as many cycles as the
-            // device needs to send the packet, but currently the port
-            // does not have any known width so simply wait a single
-            // cycle
+        // If there is more to do, then do so.
+        if (!transmitList.empty()) {
+            // This should ultimately wait for as many cycles as the device
+            // needs to send the packet, but currently the port does not have
+            // any known width so simply wait a single cycle.
             device->schedule(sendEvent, device->clockEdge(Cycles(1)));
+        }
     } else {
         DPRINTF(DMA, "-- Failed, waiting for retry\n");
     }
 
     DPRINTF(DMA, "TransmitList: %d, inRetry: %d\n",
-            transmitList.size(), inRetry);
+            transmitList.size(), inRetry ? 1 : 0);
 }
 
 void
 DmaPort::sendDma()
 {
-    // some kind of selcetion between access methods
-    // more work is going to have to be done to make
-    // switching actually work
+    // Some kind of selection between access methods. More work is going to
+    // have to be done to make switching actually work.
     assert(transmitList.size());
 
     if (sys->isTimingMode()) {
@@ -251,14 +245,20 @@ DmaPort::sendDma()
     } else if (sys->isAtomicMode()) {
         // send everything there is to send in zero time
         while (!transmitList.empty()) {
-            PacketPtr pkt = transmitList.front();
+            DmaReqState *state = transmitList.front();
             transmitList.pop_front();
 
-            DPRINTF(DMA, "Sending  DMA for addr: %#x size: %d\n",
-                    pkt->req->getPaddr(), pkt->req->getSize());
-            Tick lat = sendAtomic(pkt);
-
-            handleResp(pkt, lat);
+            bool done = state->gen.done();
+            while (!done) {
+                PacketPtr pkt = state->createPacket();
+                DPRINTF(DMA, "Sending  DMA for addr: %#x size: %d\n",
+                        pkt->req->getPaddr(), pkt->req->getSize());
+                Tick lat = sendAtomic(pkt);
+
+                // Check if we're done now, since handleResp may delete state.
+                done = !state->gen.next();
+                handleResp(pkt, lat);
+            }
         }
     } else {
         panic("Unknown memory mode.");
diff --git a/src/dev/dma_device.hh b/src/dev/dma_device.hh
index 00af98de8..bf6ed9533 100644
--- a/src/dev/dma_device.hh
+++ b/src/dev/dma_device.hh
@@ -44,6 +44,7 @@
 #include <deque>
 #include <memory>
 
+#include "base/chunk_generator.hh"
 #include "base/circlebuf.hh"
 #include "dev/io_device.hh"
 #include "mem/port_proxy.hh"
@@ -101,9 +102,34 @@ class DmaPort : public RequestPort, public Drainable
         /** Amount to delay completion of dma by */
         const Tick delay;
 
-        DmaReqState(Event *ce, Addr tb, Tick _delay)
-            : completionEvent(ce), totBytes(tb), delay(_delay)
+        /** Object to track what chunks of bytes to send at a time. */
+        ChunkGenerator gen;
+
+        /** Pointer to a buffer for the data. */
+        uint8_t *const data = nullptr;
+
+        /** The flags to use for requests. */
+        const Request::Flags flags;
+
+        /** The requestor ID to use for requests. */
+        const RequestorID id;
+
+        /** Stream IDs. */
+        const uint32_t sid;
+        const uint32_t ssid;
+
+        /** Command for the request. */
+        const Packet::Command cmd;
+
+        DmaReqState(Packet::Command _cmd, Addr addr, Addr chunk_sz, Addr tb,
+                    uint8_t *_data, Request::Flags _flags, RequestorID _id,
+                    uint32_t _sid, uint32_t _ssid, Event *ce, Tick _delay)
+            : completionEvent(ce), totBytes(tb), delay(_delay),
+              gen(addr, tb, chunk_sz), data(_data), flags(_flags), id(_id),
+              sid(_sid), ssid(_ssid), cmd(_cmd)
         {}
+
+        PacketPtr createPacket();
     };
 
   public:
@@ -119,7 +145,7 @@ class DmaPort : public RequestPort, public Drainable
 
   protected:
     /** Use a deque as we never do any insertion or removal in the middle */
-    std::deque<PacketPtr> transmitList;
+    std::deque<DmaReqState *> transmitList;
 
     /** Event used to schedule a future sending from the transmit list. */
     EventFunctionWrapper sendEvent;
@@ -127,9 +153,8 @@ class DmaPort : public RequestPort, public Drainable
     /** Number of outstanding packets the dma port has. */
     uint32_t pendingCount = 0;
 
-    /** If the port is currently waiting for a retry before it can
-     * send whatever it is that it's sending. */
-    bool inRetry = false;
+    /** The packet (if any) waiting for a retry to send. */
+    PacketPtr inRetry = nullptr;
 
     /** Default streamId */
     const uint32_t defaultSid;
@@ -144,8 +169,6 @@ class DmaPort : public RequestPort, public Drainable
     bool recvTimingResp(PacketPtr pkt) override;
     void recvReqRetry() override;
 
-    void queueDma(PacketPtr pkt);
-
   public:
 
     DmaPort(ClockedObject *dev, System *s, uint32_t sid=0, uint32_t ssid=0);
-- 
2.30.2