cpu: Add a fetch queue to the o3 cpu

author Mitch Hayenga <mitch.hayenga@arm.com>

Wed, 3 Sep 2014 11:42:35 +0000 (07:42 -0400)

committer Mitch Hayenga <mitch.hayenga@arm.com>

Wed, 3 Sep 2014 11:42:35 +0000 (07:42 -0400)
author Mitch Hayenga <mitch.hayenga@arm.com>
Wed, 3 Sep 2014 11:42:35 +0000 (07:42 -0400)
committer Mitch Hayenga <mitch.hayenga@arm.com>
Wed, 3 Sep 2014 11:42:35 +0000 (07:42 -0400)
diff --git a/src/cpu/o3/O3CPU.py b/src/cpu/o3/O3CPU.py

index fb5b5de2b3e46e865d25fdf8cfe2c5ed09595ec3..c70a12f1de6edf7e1dc7651288f7161fc9dcad68 100644 (file)
--- a/src/cpu/o3/O3CPU.py
+++ b/src/cpu/o3/O3CPU.py
@@ -61,6 +61,7 @@ class DerivO3CPU(BaseCPU):
      commitToFetchDelay = Param.Cycles(1, "Commit to fetch delay")
      fetchWidth = Param.Unsigned(8, "Fetch width")
      fetchBufferSize = Param.Unsigned(64, "Fetch buffer size in bytes")
+    fetchQueueSize = Param.Unsigned(32, "Fetch queue size in micro-ops")
  
      renameToDecodeDelay = Param.Cycles(1, "Rename to decode delay")
      iewToDecodeDelay = Param.Cycles(1, "Issue/Execute/Writeback to decode "
diff --git a/src/cpu/o3/fetch.hh b/src/cpu/o3/fetch.hh

index 0c1b81d86eb3ec7eac71cfa82443fadac91f81d4..2e9428ef14ee2b2bd26e6f9bf01d6db521606de5 100644 (file)
--- a/src/cpu/o3/fetch.hh
+++ b/src/cpu/o3/fetch.hh
@@ -1,5 +1,5 @@
  /*
- * Copyright (c) 2010-2012 ARM Limited
+ * Copyright (c) 2010-2012, 2014 ARM Limited
   * All rights reserved
   *
   * The license below extends only to copyright in the software and shall
@@ -401,9 +401,6 @@ class DefaultFetch
      /** Wire to get commit's information from backwards time buffer. */
      typename TimeBuffer<TimeStruct>::wire fromCommit;
  
-    /** Internal fetch instruction queue. */
-    TimeBuffer<FetchStruct> *fetchQueue;
-
      //Might be annoying how this name is different than the queue.
      /** Wire used to write any information heading to decode. */
      typename TimeBuffer<FetchStruct>::wire toDecode;
@@ -455,6 +452,9 @@ class DefaultFetch
      /** The width of fetch in instructions. */
      unsigned fetchWidth;
  
+    /** The width of decode in instructions. */
+    unsigned decodeWidth;
+
      /** Is the cache blocked?  If so no threads can access it. */
      bool cacheBlocked;
  
@@ -481,6 +481,12 @@ class DefaultFetch
      /** The PC of the first instruction loaded into the fetch buffer. */
      Addr fetchBufferPC[Impl::MaxThreads];
  
+    /** The size of the fetch queue in micro-ops */
+    unsigned fetchQueueSize;
+
+    /** Queue of fetched instructions */
+    std::deque<DynInstPtr> fetchQueue;
+
      /** Whether or not the fetch buffer data is valid. */
      bool fetchBufferValid[Impl::MaxThreads];
  
diff --git a/src/cpu/o3/fetch_impl.hh b/src/cpu/o3/fetch_impl.hh

index 637e39957d363ea8b8e84431feed9a72c698b649..219444ace7f4b294abb349fa76a942f4c983a373 100644 (file)
--- a/src/cpu/o3/fetch_impl.hh
+++ b/src/cpu/o3/fetch_impl.hh
@@ -82,11 +82,13 @@ DefaultFetch<Impl>::DefaultFetch(O3CPU *_cpu, DerivO3CPUParams *params)
        iewToFetchDelay(params->iewToFetchDelay),
        commitToFetchDelay(params->commitToFetchDelay),
        fetchWidth(params->fetchWidth),
+      decodeWidth(params->decodeWidth),
        retryPkt(NULL),
        retryTid(InvalidThreadID),
        cacheBlkSize(cpu->cacheLineSize()),
        fetchBufferSize(params->fetchBufferSize),
        fetchBufferMask(fetchBufferSize - 1),
+      fetchQueueSize(params->fetchQueueSize),
        numThreads(params->numThreads),
        numFetchingThreads(params->smtNumFetchingThreads),
        finishTranslationEvent(this)
@@ -313,12 +315,10 @@ DefaultFetch<Impl>::setActiveThreads(std::list<ThreadID> *at_ptr)
  
  template<class Impl>
  void
-DefaultFetch<Impl>::setFetchQueue(TimeBuffer<FetchStruct> *fq_ptr)
+DefaultFetch<Impl>::setFetchQueue(TimeBuffer<FetchStruct> *ftb_ptr)
  {
-    fetchQueue = fq_ptr;
-
-    // Create wire to write information to proper place in fetch queue.
-    toDecode = fetchQueue->getWire(0);
+    // Create wire to write information to proper place in fetch time buf.
+    toDecode = ftb_ptr->getWire(0);
  }
  
  template<class Impl>
@@ -342,6 +342,7 @@ DefaultFetch<Impl>::resetStage()
      cacheBlocked = false;
  
      priorityList.clear();
+    fetchQueue.clear();
  
      // Setup PC and nextPC with initial state.
      for (ThreadID tid = 0; tid < numThreads; ++tid) {
@@ -454,6 +455,10 @@ DefaultFetch<Impl>::isDrained() const
              return false;
      }
  
+    // Not drained if fetch queue contains entries
+    if (!fetchQueue.empty())
+        return false;
+
      /* The pipeline might start up again in the middle of the drain
       * cycle if the finish translation event is scheduled, so make
       * sure that's not the case.
@@ -673,11 +678,8 @@ DefaultFetch<Impl>::finishTranslation(Fault fault, RequestPtr mem_req)
              fetchStatus[tid] = IcacheWaitResponse;
          }
      } else {
-        // Don't send an instruction to decode if it can't handle it.
-        // Asynchronous nature of this function's calling means we have to
-        // check 2 signals to see if decode is stalled.
-        if (!(numInst < fetchWidth) || stalls[tid].decode ||
-            fromDecode->decodeBlock[tid]) {
+        // Don't proceed if fetch width or fetch queue space is exhausted.
+        if (!(numInst < fetchWidth) || !(fetchQueue.size() < fetchQueueSize)) {
              assert(!finishTranslationEvent.scheduled());
              finishTranslationEvent.setFault(fault);
              finishTranslationEvent.setReq(mem_req);
@@ -758,6 +760,15 @@ DefaultFetch<Impl>::doSquash(const TheISA::PCState &newPC,
  
      fetchStatus[tid] = Squashing;
  
+    // Remove the squashed thread's instructions from the fetch queue
+    auto inst_itr = fetchQueue.begin();
+    while (inst_itr != fetchQueue.end()) {
+        if ((*inst_itr)->threadNumber == tid)
+            inst_itr = fetchQueue.erase(inst_itr);
+         else
+            ++inst_itr;
+    }
+
      // microops are being squashed, it is not known wheather the
      // youngest non-squashed microop was  marked delayed commit
      // or not. Setting the flag to true ensures that the
@@ -796,9 +807,6 @@ DefaultFetch<Impl>::checkStall(ThreadID tid) const
          assert(cpu->isDraining());
          DPRINTF(Fetch,"[tid:%i]: Drain stall detected.\n",tid);
          ret_val = true;
-    } else if (stalls[tid].decode) {
-        DPRINTF(Fetch,"[tid:%i]: Stall from Decode stage detected.\n",tid);
-        ret_val = true;
      }
  
      return ret_val;
@@ -921,6 +929,21 @@ DefaultFetch<Impl>::tick()
          }
      }
  
+    // Send instructions enqueued into the fetch queue to decode.
+    // Limit rate by decodeWidth.  Stall if decode is stalled.
+    unsigned instsToDecode = 0;
+    while(!fetchQueue.empty() &&
+          instsToDecode < decodeWidth &&
+          !stalls[fetchQueue.front()->threadNumber].decode) {
+        auto inst = fetchQueue.front();
+        toDecode->insts[toDecode->size++] = inst;
+        DPRINTF(Fetch, "[tid:%i][sn:%i]: Sending instruction to decode from "
+                "fetch queue. Fetch queue size: %i.\n",
+                inst->threadNumber, inst->seqNum, fetchQueue.size());
+        fetchQueue.pop_front();
+        instsToDecode++;
+    }
+
      // Reset the number of the instruction we've fetched.
      numInst = 0;
  }
@@ -1072,7 +1095,11 @@ DefaultFetch<Impl>::buildInst(ThreadID tid, StaticInstPtr staticInst,
      // Write the instruction to the first slot in the queue
      // that heads to decode.
      assert(numInst < fetchWidth);
-    toDecode->insts[toDecode->size++] = instruction;
+    fetchQueue.push_back(instruction);
+    assert(fetchQueue.size() <= fetchQueueSize);
+    DPRINTF(Fetch, "[tid:%i]: Fetch queue entry created (%i/%i).\n",
+            tid, fetchQueue.size(), fetchQueueSize);
+    //toDecode->insts[toDecode->size++] = instruction;
  
      // Keep track of if we can take an interrupt at this boundary
      delayedCommit[tid] = instruction->isDelayedCommit();
@@ -1186,8 +1213,8 @@ DefaultFetch<Impl>::fetch(bool &status_change)
      // Loop through instruction memory from the cache.
      // Keep issuing while fetchWidth is available and branch is not
      // predicted taken
-    while (numInst < fetchWidth && !predictedBranch) {
-
+    while (numInst < fetchWidth && fetchQueue.size() < fetchQueueSize
+           && !predictedBranch) {
          // We need to process more memory if we aren't going to get a
          // StaticInst from the rom, the current macroop, or what's already
          // in the decoder.
@@ -1310,7 +1337,7 @@ DefaultFetch<Impl>::fetch(bool &status_change)
                  break;
              }
          } while ((curMacroop || decoder[tid]->instReady()) &&
-                 numInst < fetchWidth);
+                 numInst < fetchWidth && fetchQueue.size() < fetchQueueSize);
      }
  
      if (predictedBranch) {
author	Mitch Hayenga <mitch.hayenga@arm.com>
	Wed, 3 Sep 2014 11:42:35 +0000 (07:42 -0400)
committer	Mitch Hayenga <mitch.hayenga@arm.com>
	Wed, 3 Sep 2014 11:42:35 +0000 (07:42 -0400)
src/cpu/o3/O3CPU.py		patch \| blob \| history
src/cpu/o3/fetch.hh		patch \| blob \| history
src/cpu/o3/fetch_impl.hh		patch \| blob \| history