Merge zizzer.eecs.umich.edu:/z/m5/Bitkeeper/newmem

[gem5.git] / src / cpu / o3 / fetch_impl.hh
diff --git a/src/cpu/o3/fetch_impl.hh b/src/cpu/o3/fetch_impl.hh

index 350ecd52dae83ef7db5cc56892eaa587e827ded5..3ae7bc4020c47bc205de40828d960aa70033d872 100644 (file)
--- a/src/cpu/o3/fetch_impl.hh
+++ b/src/cpu/o3/fetch_impl.hh
@@ -40,7 +40,7 @@
  #include "mem/request.hh"
  #include "sim/byteswap.hh"
  #include "sim/host.hh"
-#include "sim/root.hh"
+#include "sim/core.hh"
  
  #if FULL_SYSTEM
  #include "arch/tlb.hh"
@@ -50,6 +50,15 @@
  
  #include <algorithm>
  
+template<class Impl>
+void
+DefaultFetch<Impl>::IcachePort::setPeer(Port *port)
+{
+    Port::setPeer(port);
+
+    fetch->setIcache();
+}
+
  template<class Impl>
  Tick
  DefaultFetch<Impl>::IcachePort::recvAtomic(PacketPtr pkt)
@@ -70,8 +79,13 @@ template<class Impl>
  void
  DefaultFetch<Impl>::IcachePort::recvStatusChange(Status status)
  {
-    if (status == RangeChange)
+    if (status == RangeChange) {
+        if (!snoopRangeSent) {
+            snoopRangeSent = true;
+            sendStatusChange(Port::RangeChange);
+        }
          return;
+    }
  
      panic("DefaultFetch doesn't expect recvStatusChange callback!");
  }
@@ -96,8 +110,10 @@ DefaultFetch<Impl>::IcachePort::recvRetry()
  }
  
  template<class Impl>
-DefaultFetch<Impl>::DefaultFetch(Params *params)
-    : branchPred(params),
+DefaultFetch<Impl>::DefaultFetch(O3CPU *_cpu, Params *params)
+    : cpu(_cpu),
+      branchPred(params),
+      predecoder(NULL),
        decodeToFetchDelay(params->decodeToFetchDelay),
        renameToFetchDelay(params->renameToFetchDelay),
        iewToFetchDelay(params->iewToFetchDelay),
@@ -146,38 +162,19 @@ DefaultFetch<Impl>::DefaultFetch(Params *params)
                " RoundRobin,LSQcount,IQcount}\n");
      }
  
-    // Size of cache block.
-    cacheBlkSize = 64;
-
-    // Create mask to get rid of offset bits.
-    cacheBlkMask = (cacheBlkSize - 1);
-
-    for (int tid=0; tid < numThreads; tid++) {
-
-        fetchStatus[tid] = Running;
-
-        priorityList.push_back(tid);
-
-        memReq[tid] = NULL;
+    // Get the size of an instruction.
+    instSize = sizeof(TheISA::MachInst);
  
-        // Create space to store a cache line.
-        cacheData[tid] = new uint8_t[cacheBlkSize];
-        cacheDataPC[tid] = 0;
-        cacheDataValid[tid] = false;
+    // Name is finally available, so create the port.
+    icachePort = new IcachePort(this);
  
-        delaySlotInfo[tid].branchSeqNum = -1;
-        delaySlotInfo[tid].numInsts = 0;
-        delaySlotInfo[tid].targetAddr = 0;
-        delaySlotInfo[tid].targetReady = false;
+    icachePort->snoopRangeSent = false;
  
-        stalls[tid].decode = false;
-        stalls[tid].rename = false;
-        stalls[tid].iew = false;
-        stalls[tid].commit = false;
+#if USE_CHECKER
+    if (cpu->checker) {
+        cpu->checker->setIcachePort(icachePort);
      }
-
-    // Get the size of an instruction.
-    instSize = sizeof(TheISA::MachInst);
+#endif
  }
  
  template <class Impl>
@@ -277,35 +274,10 @@ DefaultFetch<Impl>::regStats()
      branchPred.regStats();
  }
  
-template<class Impl>
-void
-DefaultFetch<Impl>::setCPU(O3CPU *cpu_ptr)
-{
-    DPRINTF(Fetch, "Setting the CPU pointer.\n");
-    cpu = cpu_ptr;
-
-    // Name is finally available, so create the port.
-    icachePort = new IcachePort(this);
-
-#if USE_CHECKER
-    if (cpu->checker) {
-        cpu->checker->setIcachePort(icachePort);
-    }
-#endif
-
-    // Schedule fetch to get the correct PC from the CPU
-    // scheduleFetchStartupEvent(1);
-
-    // Fetch needs to start fetching instructions at the very beginning,
-    // so it must start up in active state.
-    switchToActive();
-}
-
  template<class Impl>
  void
  DefaultFetch<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *time_buffer)
  {
-    DPRINTF(Fetch, "Setting the time buffer pointer.\n");
      timeBuffer = time_buffer;
  
      // Create wires to get information from proper places in time buffer.
@@ -319,7 +291,6 @@ template<class Impl>
  void
  DefaultFetch<Impl>::setActiveThreads(std::list<unsigned> *at_ptr)
  {
-    DPRINTF(Fetch, "Setting active threads list pointer.\n");
      activeThreads = at_ptr;
  }
  
@@ -327,7 +298,6 @@ template<class Impl>
  void
  DefaultFetch<Impl>::setFetchQueue(TimeBuffer<FetchStruct> *fq_ptr)
  {
-    DPRINTF(Fetch, "Setting the fetch queue pointer.\n");
      fetchQueue = fq_ptr;
  
      // Create wire to write information to proper place in fetch queue.
@@ -342,9 +312,46 @@ DefaultFetch<Impl>::initStage()
      for (int tid = 0; tid < numThreads; tid++) {
          PC[tid] = cpu->readPC(tid);
          nextPC[tid] = cpu->readNextPC(tid);
-#if ISA_HAS_DELAY_SLOT
-        nextNPC[tid] = cpu->readNextNPC(tid);
-#endif
+        microPC[tid] = cpu->readMicroPC(tid);
+    }
+
+    for (int tid=0; tid < numThreads; tid++) {
+
+        fetchStatus[tid] = Running;
+
+        priorityList.push_back(tid);
+
+        memReq[tid] = NULL;
+
+        stalls[tid].decode = false;
+        stalls[tid].rename = false;
+        stalls[tid].iew = false;
+        stalls[tid].commit = false;
+    }
+
+    // Schedule fetch to get the correct PC from the CPU
+    // scheduleFetchStartupEvent(1);
+
+    // Fetch needs to start fetching instructions at the very beginning,
+    // so it must start up in active state.
+    switchToActive();
+}
+
+template<class Impl>
+void
+DefaultFetch<Impl>::setIcache()
+{
+    // Size of cache block.
+    cacheBlkSize = icachePort->peerBlockSize();
+
+    // Create mask to get rid of offset bits.
+    cacheBlkMask = (cacheBlkSize - 1);
+
+    for (int tid=0; tid < numThreads; tid++) {
+        // Create space to store a cache line.
+        cacheData[tid] = new uint8_t[cacheBlkSize];
+        cacheDataPC[tid] = 0;
+        cacheDataValid[tid] = false;
      }
  }
  
@@ -432,13 +439,7 @@ DefaultFetch<Impl>::takeOverFrom()
          stalls[i].commit = 0;
          PC[i] = cpu->readPC(i);
          nextPC[i] = cpu->readNextPC(i);
-#if ISA_HAS_DELAY_SLOT
-        nextNPC[i] = cpu->readNextNPC(i);
-        delaySlotInfo[i].branchSeqNum = -1;
-        delaySlotInfo[i].numInsts = 0;
-        delaySlotInfo[i].targetAddr = 0;
-        delaySlotInfo[i].targetReady = false;
-#endif
+        microPC[i] = cpu->readMicroPC(i);
          fetchStatus[i] = Running;
      }
      numInst = 0;
@@ -488,7 +489,7 @@ DefaultFetch<Impl>::switchToInactive()
  template <class Impl>
  bool
  DefaultFetch<Impl>::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC,
-                                          Addr &next_NPC)
+                                          Addr &next_NPC, Addr &next_MicroPC)
  {
      // Do branch prediction check here.
      // A bit of a misnomer...next_PC is actually the current PC until
@@ -496,54 +497,50 @@ DefaultFetch<Impl>::lookupAndUpdateNextPC(DynInstPtr &inst, Addr &next_PC,
      bool predict_taken;
  
      if (!inst->isControl()) {
-#if ISA_HAS_DELAY_SLOT
-        Addr cur_PC = next_PC;
-        next_PC  = cur_PC + instSize;      //next_NPC;
-        next_NPC = cur_PC + (2 * instSize);//next_NPC + instSize;
-        inst->setPredTarg(next_NPC);
-#else
-        next_PC = next_PC + instSize;
-        inst->setPredTarg(next_PC);
-#endif
+        if (inst->isMicroOp() && !inst->isLastMicroOp()) {
+            next_MicroPC++;
+        } else {
+            next_PC  = next_NPC;
+            next_NPC = next_NPC + instSize;
+            next_MicroPC = 0;
+        }
+        inst->setPredTarg(next_PC, next_NPC, next_MicroPC);
+        inst->setPredTaken(false);
          return false;
      }
  
+    // Assume for now that all control flow goes to a different macroop,
+    // which resets the micro PC to 0.
+    next_MicroPC = 0;
+
      int tid = inst->threadNumber;
-#if ISA_HAS_DELAY_SLOT
      Addr pred_PC = next_PC;
      predict_taken = branchPred.predict(inst, pred_PC, tid);
  
-    if (predict_taken) {
-        DPRINTF(Fetch, "[tid:%i]: Branch predicted to be true.\n", tid);
+/*    if (predict_taken) {
+        DPRINTF(Fetch, "[tid:%i]: Branch predicted to be taken to %#x.\n",
+                tid, pred_PC);
      } else {
-        DPRINTF(Fetch, "[tid:%i]: Branch predicted to be false.\n", tid);
-    }
+        DPRINTF(Fetch, "[tid:%i]: Branch predicted to be not taken.\n", tid);
+    }*/
  
-    if (predict_taken) {
-        next_PC = next_NPC;
+#if ISA_HAS_DELAY_SLOT
+    next_PC = next_NPC;
+    if (predict_taken)
          next_NPC = pred_PC;
-
-        // Update delay slot info
-        ++delaySlotInfo[tid].numInsts;
-        delaySlotInfo[tid].targetAddr = pred_PC;
-        DPRINTF(Fetch, "[tid:%i]: %i delay slot inst(s) to process.\n", tid,
-                delaySlotInfo[tid].numInsts);
-    } else { // !predict_taken
-        if (inst->isCondDelaySlot()) {
-            next_PC = pred_PC;
-            // The delay slot is skipped here if there is on
-            // prediction
-        } else {
-            next_PC = next_NPC;
-            // No need to declare a delay slot here since
-            // there is no for the pred. target to jump
-        }
-
-        next_NPC = next_NPC + instSize;
-    }
+    else
+        next_NPC += instSize;
  #else
-    predict_taken = branchPred.predict(inst, next_PC, tid);
+    if (predict_taken)
+        next_PC = pred_PC;
+    else
+        next_PC += instSize;
+    next_NPC = next_PC + instSize;
  #endif
+/*    DPRINTF(Fetch, "[tid:%i]: Branch predicted to go to %#x and then %#x.\n",
+            tid, next_PC, next_NPC);*/
+    inst->setPredTarg(next_PC, next_NPC, next_MicroPC);
+    inst->setPredTaken(predict_taken);
  
      ++fetchedBranches;
  
@@ -561,27 +558,36 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
      Fault fault = NoFault;
  
      //AlphaDep
-    if (cacheBlocked || isSwitchedOut() ||
-            (interruptPending && (fetch_PC & 0x3))) {
+    if (cacheBlocked) {
+        DPRINTF(Fetch, "[tid:%i] Can't fetch cache line, cache blocked\n",
+                tid);
+        return false;
+    } else if (isSwitchedOut()) {
+        DPRINTF(Fetch, "[tid:%i] Can't fetch cache line, switched out\n",
+                tid);
+        return false;
+    } else if (interruptPending && !(fetch_PC & 0x3)) {
          // Hold off fetch from getting new instructions when:
          // Cache is blocked, or
          // while an interrupt is pending and we're not in PAL mode, or
          // fetch is switched out.
+        DPRINTF(Fetch, "[tid:%i] Can't fetch cache line, interrupt pending\n",
+                tid);
          return false;
      }
  
      // Align the fetch PC so it's at the start of a cache block.
-    fetch_PC = icacheBlockAlignPC(fetch_PC);
+    Addr block_PC = icacheBlockAlignPC(fetch_PC);
  
      // If we've already got the block, no need to try to fetch it again.
-    if (cacheDataValid[tid] && fetch_PC == cacheDataPC[tid]) {
+    if (cacheDataValid[tid] && block_PC == cacheDataPC[tid]) {
          return true;
      }
  
      // Setup the memReq to do a read of the first instruction's address.
      // Set the appropriate read size and flags as well.
      // Build request here.
-    RequestPtr mem_req = new Request(tid, fetch_PC, cacheBlkSize, 0,
+    RequestPtr mem_req = new Request(tid, block_PC, cacheBlkSize, 0,
                                       fetch_PC, cpu->readCpuId(), tid);
  
      memReq[tid] = mem_req;
@@ -608,10 +614,10 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
  
          // Build packet here.
          PacketPtr data_pkt = new Packet(mem_req,
-                                        Packet::ReadReq, Packet::Broadcast);
+                                        MemCmd::ReadReq, Packet::Broadcast);
          data_pkt->dataDynamicArray(new uint8_t[cacheBlkSize]);
  
-        cacheDataPC[tid] = fetch_PC;
+        cacheDataPC[tid] = block_PC;
          cacheDataValid[tid] = false;
  
          DPRINTF(Fetch, "Fetch: Doing instruction read.\n");
@@ -625,6 +631,7 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
                  fault = TheISA::genMachineCheckFault();
                  delete mem_req;
                  memReq[tid] = NULL;
+                warn("Bad address!\n");
              }
              assert(retryPkt == NULL);
              assert(retryTid == -1);
@@ -655,14 +662,15 @@ DefaultFetch<Impl>::fetchCacheLine(Addr fetch_PC, Fault &ret_fault, unsigned tid
  
  template <class Impl>
  inline void
-DefaultFetch<Impl>::doSquash(const Addr &new_PC, unsigned tid)
+DefaultFetch<Impl>::doSquash(const Addr &new_PC,
+        const Addr &new_NPC, const Addr &new_microPC, unsigned tid)
  {
-    DPRINTF(Fetch, "[tid:%i]: Squashing, setting PC to: %#x.\n",
-            tid, new_PC);
+    DPRINTF(Fetch, "[tid:%i]: Squashing, setting PC to: %#x, NPC to: %#x.\n",
+            tid, new_PC, new_NPC);
  
      PC[tid] = new_PC;
-    nextPC[tid] = new_PC + instSize;
-    nextNPC[tid] = new_PC + (2 * instSize);
+    nextPC[tid] = new_NPC;
+    microPC[tid] = new_microPC;
  
      // Clear the icache miss if it's outstanding.
      if (fetchStatus[tid] == IcacheWaitResponse) {
@@ -674,11 +682,12 @@ DefaultFetch<Impl>::doSquash(const Addr &new_PC, unsigned tid)
      // Get rid of the retrying packet if it was from this thread.
      if (retryTid == tid) {
          assert(cacheBlocked);
-        cacheBlocked = false;
-        retryTid = -1;
-        delete retryPkt->req;
-        delete retryPkt;
+        if (retryPkt) {
+            delete retryPkt->req;
+            delete retryPkt;
+        }
          retryPkt = NULL;
+        retryTid = -1;
      }
  
      fetchStatus[tid] = Squashing;
@@ -688,21 +697,13 @@ DefaultFetch<Impl>::doSquash(const Addr &new_PC, unsigned tid)
  
  template<class Impl>
  void
-DefaultFetch<Impl>::squashFromDecode(const Addr &new_PC,
-                                     const InstSeqNum &seq_num,
-                                     unsigned tid)
+DefaultFetch<Impl>::squashFromDecode(const Addr &new_PC, const Addr &new_NPC,
+                                     const Addr &new_MicroPC,
+                                     const InstSeqNum &seq_num, unsigned tid)
  {
      DPRINTF(Fetch, "[tid:%i]: Squashing from decode.\n",tid);
  
-    doSquash(new_PC, tid);
-
-#if ISA_HAS_DELAY_SLOT
-    if (seq_num <=  delaySlotInfo[tid].branchSeqNum) {
-        delaySlotInfo[tid].numInsts = 0;
-        delaySlotInfo[tid].targetAddr = 0;
-        delaySlotInfo[tid].targetReady = false;
-    }
-#endif
+    doSquash(new_PC, new_NPC, new_MicroPC, tid);
  
      // Tell the CPU to remove any instructions that are in flight between
      // fetch and decode.
@@ -740,10 +741,10 @@ typename DefaultFetch<Impl>::FetchStatus
  DefaultFetch<Impl>::updateFetchStatus()
  {
      //Check Running
-    std::list<unsigned>::iterator threads = (*activeThreads).begin();
-
-    while (threads != (*activeThreads).end()) {
+    std::list<unsigned>::iterator threads = activeThreads->begin();
+    std::list<unsigned>::iterator end = activeThreads->end();
  
+    while (threads != end) {
          unsigned tid = *threads++;
  
          if (fetchStatus[tid] == Running ||
@@ -777,38 +778,29 @@ DefaultFetch<Impl>::updateFetchStatus()
  
  template <class Impl>
  void
-DefaultFetch<Impl>::squash(const Addr &new_PC, const InstSeqNum &seq_num,
-                           bool squash_delay_slot, unsigned tid)
+DefaultFetch<Impl>::squash(const Addr &new_PC, const Addr &new_NPC,
+                           const Addr &new_MicroPC,
+                           const InstSeqNum &seq_num, unsigned tid)
  {
      DPRINTF(Fetch, "[tid:%u]: Squash from commit.\n",tid);
  
-    doSquash(new_PC, tid);
-
-#if ISA_HAS_DELAY_SLOT
-    if (seq_num <=  delaySlotInfo[tid].branchSeqNum) {
-        delaySlotInfo[tid].numInsts = 0;
-        delaySlotInfo[tid].targetAddr = 0;
-        delaySlotInfo[tid].targetReady = false;
-    }
+    doSquash(new_PC, new_NPC, new_MicroPC, tid);
  
      // Tell the CPU to remove any instructions that are not in the ROB.
-    cpu->removeInstsNotInROB(tid, squash_delay_slot, seq_num);
-#else
-    // Tell the CPU to remove any instructions that are not in the ROB.
-    cpu->removeInstsNotInROB(tid, true, 0);
-#endif
+    cpu->removeInstsNotInROB(tid);
  }
  
  template <class Impl>
  void
  DefaultFetch<Impl>::tick()
  {
-    std::list<unsigned>::iterator threads = (*activeThreads).begin();
+    std::list<unsigned>::iterator threads = activeThreads->begin();
+    std::list<unsigned>::iterator end = activeThreads->end();
      bool status_change = false;
  
      wroteToTimeBuffer = false;
  
-    while (threads != (*activeThreads).end()) {
+    while (threads != end) {
          unsigned tid = *threads++;
  
          // Check the signals for each thread to determine the proper status
@@ -904,16 +896,11 @@ DefaultFetch<Impl>::checkSignalsAndUpdate(unsigned tid)
  
          DPRINTF(Fetch, "[tid:%u]: Squashing instructions due to squash "
                  "from commit.\n",tid);
-
-#if ISA_HAS_DELAY_SLOT
-    InstSeqNum doneSeqNum = fromCommit->commitInfo[tid].bdelayDoneSeqNum;
-#else
-    InstSeqNum doneSeqNum = fromCommit->commitInfo[tid].doneSeqNum;
-#endif
          // In any case, squash.
          squash(fromCommit->commitInfo[tid].nextPC,
-               doneSeqNum,
-               fromCommit->commitInfo[tid].squashDelaySlot,
+               fromCommit->commitInfo[tid].nextNPC,
+               fromCommit->commitInfo[tid].nextMicroPC,
+               fromCommit->commitInfo[tid].doneSeqNum,
                 tid);
  
          // Also check if there's a mispredict that happened.
@@ -962,21 +949,23 @@ DefaultFetch<Impl>::checkSignalsAndUpdate(unsigned tid)
  
          if (fetchStatus[tid] != Squashing) {
  
-#if ISA_HAS_DELAY_SLOT
-            InstSeqNum doneSeqNum = fromDecode->decodeInfo[tid].bdelayDoneSeqNum;
-#else
-            InstSeqNum doneSeqNum = fromDecode->decodeInfo[tid].doneSeqNum;
-#endif
+            DPRINTF(Fetch, "Squashing from decode with PC = %#x, NPC = %#x\n",
+                    fromDecode->decodeInfo[tid].nextPC,
+                    fromDecode->decodeInfo[tid].nextNPC);
              // Squash unless we're already squashing
              squashFromDecode(fromDecode->decodeInfo[tid].nextPC,
-                             doneSeqNum,
+                             fromDecode->decodeInfo[tid].nextNPC,
+                             fromDecode->decodeInfo[tid].nextMicroPC,
+                             fromDecode->decodeInfo[tid].doneSeqNum,
                               tid);
  
              return true;
          }
      }
  
-    if (checkStall(tid) && fetchStatus[tid] != IcacheWaitResponse) {
+    if (checkStall(tid) &&
+        fetchStatus[tid] != IcacheWaitResponse &&
+        fetchStatus[tid] != IcacheWaitRetry) {
          DPRINTF(Fetch, "[tid:%i]: Setting to blocked\n",tid);
  
          fetchStatus[tid] = Blocked;
@@ -1021,7 +1010,9 @@ DefaultFetch<Impl>::fetch(bool &status_change)
      DPRINTF(Fetch, "Attempting to fetch from [tid:%i]\n", tid);
  
      // The current PC.
-    Addr &fetch_PC = PC[tid];
+    Addr fetch_PC = PC[tid];
+    Addr fetch_NPC = nextPC[tid];
+    Addr fetch_MicroPC = microPC[tid];
  
      // Fault code for memory access.
      Fault fault = NoFault;
@@ -1052,12 +1043,16 @@ DefaultFetch<Impl>::fetch(bool &status_change)
      } else {
          if (fetchStatus[tid] == Idle) {
              ++fetchIdleCycles;
+            DPRINTF(Fetch, "[tid:%i]: Fetch is idle!\n", tid);
          } else if (fetchStatus[tid] == Blocked) {
              ++fetchBlockedCycles;
+            DPRINTF(Fetch, "[tid:%i]: Fetch is blocked!\n", tid);
          } else if (fetchStatus[tid] == Squashing) {
              ++fetchSquashCycles;
+            DPRINTF(Fetch, "[tid:%i]: Fetch is squashing!\n", tid);
          } else if (fetchStatus[tid] == IcacheWaitResponse) {
              ++icacheStallCycles;
+            DPRINTF(Fetch, "[tid:%i]: Fetch is waiting cache response!\n", tid);
          }
  
          // Status is Idle, Squashing, Blocked, or IcacheWaitResponse, so
@@ -1075,13 +1070,18 @@ DefaultFetch<Impl>::fetch(bool &status_change)
      }
  
      Addr next_PC = fetch_PC;
-    Addr next_NPC = next_PC + instSize;
+    Addr next_NPC = fetch_NPC;
+    Addr next_MicroPC = fetch_MicroPC;
+
      InstSeqNum inst_seq;
      MachInst inst;
      ExtMachInst ext_inst;
      // @todo: Fix this hack.
      unsigned offset = (fetch_PC & cacheBlkMask) & ~3;
  
+    StaticInstPtr staticInst = NULL;
+    StaticInstPtr macroop = NULL;
+
      if (fault == NoFault) {
          // If the read of the first instruction was successful, then grab the
          // instructions from the rest of the cache line and put them into the
@@ -1094,115 +1094,116 @@ DefaultFetch<Impl>::fetch(bool &status_change)
          // ended this fetch block.
          bool predicted_branch = false;
  
-        // Need to keep track of whether or not a delay slot
-        // instruction has been fetched
+        while (offset < cacheBlkSize &&
+               numInst < fetchWidth &&
+               !predicted_branch) {
  
-        for (;
-             offset < cacheBlkSize &&
-                 numInst < fetchWidth &&
-                 (!predicted_branch || delaySlotInfo[tid].numInsts > 0);
-             ++numInst) {
-
-            // Get a sequence number.
-            inst_seq = cpu->getAndIncrementInstSeq();
+            // If we're branching after this instruction, quit fetching
+            // from the same block.
+            predicted_branch =
+                (fetch_PC + sizeof(TheISA::MachInst) != fetch_NPC);
+            if (predicted_branch) {
+                DPRINTF(Fetch, "Branch detected with PC = %#x, NPC = %#x\n",
+                        fetch_PC, fetch_NPC);
+            }
  
              // Make sure this is a valid index.
              assert(offset <= cacheBlkSize - instSize);
  
-            // Get the instruction from the array of the cache line.
-            inst = TheISA::gtoh(*reinterpret_cast<TheISA::MachInst *>
-                        (&cacheData[tid][offset]));
-
-#if THE_ISA == ALPHA_ISA
-            ext_inst = TheISA::makeExtMI(inst, fetch_PC);
-#elif THE_ISA == SPARC_ISA
-            ext_inst = TheISA::makeExtMI(inst, cpu->thread[tid]->getTC());
-#endif
-
-            // Create a new DynInst from the instruction fetched.
-            DynInstPtr instruction = new DynInst(ext_inst, fetch_PC,
-                                                 next_PC,
-                                                 inst_seq, cpu);
-            instruction->setTid(tid);
+            if (!macroop) {
+                // Get the instruction from the array of the cache line.
+                inst = TheISA::gtoh(*reinterpret_cast<TheISA::MachInst *>
+                            (&cacheData[tid][offset]));
  
-            instruction->setASID(tid);
+                predecoder.setTC(cpu->thread[tid]->getTC());
+                predecoder.moreBytes(fetch_PC, 0, inst);
  
-            instruction->setThreadState(cpu->thread[tid]);
+                ext_inst = predecoder.getExtMachInst();
+                staticInst = StaticInstPtr(ext_inst);
+                if (staticInst->isMacroOp())
+                    macroop = staticInst;
+            }
+            do {
+                if (macroop) {
+                    staticInst = macroop->fetchMicroOp(fetch_MicroPC);
+                    if (staticInst->isLastMicroOp())
+                        macroop = NULL;
+                }
  
-            DPRINTF(Fetch, "[tid:%i]: Instruction PC %#x created "
-                    "[sn:%lli]\n",
-                    tid, instruction->readPC(), inst_seq);
+                // Get a sequence number.
+                inst_seq = cpu->getAndIncrementInstSeq();
  
-            DPRINTF(Fetch, "[tid:%i]: Instruction is: %s\n",
-                    tid, instruction->staticInst->disassemble(fetch_PC));
+                // Create a new DynInst from the instruction fetched.
+                DynInstPtr instruction = new DynInst(staticInst,
+                                                     fetch_PC, fetch_NPC, fetch_MicroPC,
+                                                     next_PC, next_NPC, next_MicroPC,
+                                                     inst_seq, cpu);
+                instruction->setTid(tid);
  
-            instruction->traceData =
-                Trace::getInstRecord(curTick, cpu->tcBase(tid),
-                                     instruction->staticInst,
-                                     instruction->readPC());
+                instruction->setASID(tid);
  
-            predicted_branch = lookupAndUpdateNextPC(instruction, next_PC,
-                                                     next_NPC);
+                instruction->setThreadState(cpu->thread[tid]);
  
-            // Add instruction to the CPU's list of instructions.
-            instruction->setInstListIt(cpu->addInst(instruction));
+                DPRINTF(Fetch, "[tid:%i]: Instruction PC %#x created "
+                        "[sn:%lli]\n",
+                        tid, instruction->readPC(), inst_seq);
  
-            // Write the instruction to the first slot in the queue
-            // that heads to decode.
-            toDecode->insts[numInst] = instruction;
+                //DPRINTF(Fetch, "[tid:%i]: MachInst is %#x\n", tid, ext_inst);
  
-            toDecode->size++;
+                DPRINTF(Fetch, "[tid:%i]: Instruction is: %s\n",
+                        tid, instruction->staticInst->disassemble(fetch_PC));
  
-            // Increment stat of fetched instructions.
-            ++fetchedInsts;
+                instruction->traceData =
+                    Trace::getInstRecord(curTick, cpu->tcBase(tid),
+                                         instruction->staticInst,
+                                         instruction->readPC());
  
-            // Move to the next instruction, unless we have a branch.
-            fetch_PC = next_PC;
+                ///FIXME This needs to be more robust in dealing with delay slots
+                predicted_branch |=
+                    lookupAndUpdateNextPC(instruction, next_PC, next_NPC, next_MicroPC);
  
-            if (instruction->isQuiesce()) {
-                DPRINTF(Fetch, "Quiesce instruction encountered, halting fetch!",
-                        curTick);
-                fetchStatus[tid] = QuiescePending;
-                ++numInst;
-                status_change = true;
-                break;
-            }
+                // Add instruction to the CPU's list of instructions.
+                instruction->setInstListIt(cpu->addInst(instruction));
  
-            offset += instSize;
+                // Write the instruction to the first slot in the queue
+                // that heads to decode.
+                toDecode->insts[numInst] = instruction;
  
-#if ISA_HAS_DELAY_SLOT
-            if (predicted_branch) {
-                delaySlotInfo[tid].branchSeqNum = inst_seq;
+                toDecode->size++;
  
-                DPRINTF(Fetch, "[tid:%i]: Delay slot branch set to [sn:%i]\n",
-                        tid, inst_seq);
-                continue;
-            } else if (delaySlotInfo[tid].numInsts > 0) {
-                --delaySlotInfo[tid].numInsts;
+                // Increment stat of fetched instructions.
+                ++fetchedInsts;
  
-                // It's OK to set PC to target of branch
-                if (delaySlotInfo[tid].numInsts == 0) {
-                    delaySlotInfo[tid].targetReady = true;
+                // Move to the next instruction, unless we have a branch.
+                fetch_PC = next_PC;
+                fetch_NPC = next_NPC;
+                fetch_MicroPC = next_MicroPC;
  
-                    // Break the looping condition
-                    predicted_branch = true;
+                if (instruction->isQuiesce()) {
+                    DPRINTF(Fetch, "Quiesce instruction encountered, halting fetch!",
+                            curTick);
+                    fetchStatus[tid] = QuiescePending;
+                    ++numInst;
+                    status_change = true;
+                    break;
                  }
  
-                DPRINTF(Fetch, "[tid:%i]: %i delay slot inst(s) left to"
-                        " process.\n", tid, delaySlotInfo[tid].numInsts);
-            }
-#endif
+                ++numInst;
+            } while (staticInst->isMicroOp() &&
+                     !staticInst->isLastMicroOp() &&
+                     numInst < fetchWidth);
+            offset += instSize;
          }
  
-        if (offset >= cacheBlkSize) {
-            DPRINTF(Fetch, "[tid:%i]: Done fetching, reached the end of cache "
-                    "block.\n", tid);
+        if (predicted_branch) {
+            DPRINTF(Fetch, "[tid:%i]: Done fetching, predicted branch "
+                    "instruction encountered.\n", tid);
          } else if (numInst >= fetchWidth) {
              DPRINTF(Fetch, "[tid:%i]: Done fetching, reached fetch bandwidth "
                      "for this cycle.\n", tid);
-        } else if (predicted_branch && delaySlotInfo[tid].numInsts <= 0) {
-            DPRINTF(Fetch, "[tid:%i]: Done fetching, predicted branch "
-                    "instruction encountered.\n", tid);
+        } else if (offset >= cacheBlkSize) {
+            DPRINTF(Fetch, "[tid:%i]: Done fetching, reached the end of cache "
+                    "block.\n", tid);
          }
      }
  
@@ -1213,27 +1214,10 @@ DefaultFetch<Impl>::fetch(bool &status_change)
      // Now that fetching is completed, update the PC to signify what the next
      // cycle will be.
      if (fault == NoFault) {
-#if ISA_HAS_DELAY_SLOT
-        if (delaySlotInfo[tid].targetReady &&
-            delaySlotInfo[tid].numInsts == 0) {
-            // Set PC to target
-            PC[tid] = delaySlotInfo[tid].targetAddr; //next_PC
-            nextPC[tid] = next_PC + instSize;        //next_NPC
-            nextNPC[tid] = next_PC + (2 * instSize);
-
-            delaySlotInfo[tid].targetReady = false;
-        } else {
-            PC[tid] = next_PC;
-            nextPC[tid] = next_NPC;
-            nextNPC[tid] = next_NPC + instSize;
-        }
-
-        DPRINTF(Fetch, "[tid:%i]: Setting PC to %08p.\n", tid, PC[tid]);
-#else
-        DPRINTF(Fetch, "[tid:%i]: Setting PC to %08p.\n",tid, next_PC);
          PC[tid] = next_PC;
-        nextPC[tid] = next_PC + instSize;
-#endif
+        nextPC[tid] = next_NPC;
+        microPC[tid] = next_MicroPC;
+        DPRINTF(Fetch, "[tid:%i]: Setting PC to %08p.\n", tid, next_PC);
      } else {
          // We shouldn't be in an icache miss and also have a fault (an ITB
          // miss)
@@ -1245,17 +1229,19 @@ DefaultFetch<Impl>::fetch(bool &status_change)
          // until commit handles the fault.  The only other way it can
          // wake up is if a squash comes along and changes the PC.
  #if FULL_SYSTEM
-        assert(numInst != fetchWidth);
+        assert(numInst < fetchWidth);
          // Get a sequence number.
          inst_seq = cpu->getAndIncrementInstSeq();
          // We will use a nop in order to carry the fault.
          ext_inst = TheISA::NoopMachInst;
  
+        StaticInstPtr staticInst = new StaticInst(ext_inst);
          // Create a new DynInst from the dummy nop.
-        DynInstPtr instruction = new DynInst(ext_inst, fetch_PC,
-                                             next_PC,
+        DynInstPtr instruction = new DynInst(staticInst,
+                                             fetch_PC, fetch_NPC,
+                                             next_PC, next_NPC,
                                               inst_seq, cpu);
-        instruction->setPredTarg(next_PC + instSize);
+        instruction->setPredTarg(next_PC, next_NPC);
          instruction->setTid(tid);
  
          instruction->setASID(tid);
@@ -1339,7 +1325,9 @@ DefaultFetch<Impl>::getFetchingThread(FetchPriority &fetch_priority)
              return -1;
          }
      } else {
-        int tid = *((*activeThreads).begin());
+        std::list<unsigned>::iterator thread = activeThreads->begin();
+        assert(thread != activeThreads->end());
+        int tid = *thread;
  
          if (fetchStatus[tid] == Running ||
              fetchStatus[tid] == IcacheAccessComplete ||
@@ -1389,9 +1377,10 @@ DefaultFetch<Impl>::iqCount()
  {
      std::priority_queue<unsigned> PQ;
  
-    std::list<unsigned>::iterator threads = (*activeThreads).begin();
+    std::list<unsigned>::iterator threads = activeThreads->begin();
+    std::list<unsigned>::iterator end = activeThreads->end();
  
-    while (threads != (*activeThreads).end()) {
+    while (threads != end) {
          unsigned tid = *threads++;
  
          PQ.push(fromIEW->iewInfo[tid].iqCount);
@@ -1419,10 +1408,10 @@ DefaultFetch<Impl>::lsqCount()
  {
      std::priority_queue<unsigned> PQ;
  
+    std::list<unsigned>::iterator threads = activeThreads->begin();
+    std::list<unsigned>::iterator end = activeThreads->end();
  
-    std::list<unsigned>::iterator threads = (*activeThreads).begin();
-
-    while (threads != (*activeThreads).end()) {
+    while (threads != end) {
          unsigned tid = *threads++;
  
          PQ.push(fromIEW->iewInfo[tid].ldstqCount);
@@ -1448,7 +1437,10 @@ template<class Impl>
  int
  DefaultFetch<Impl>::branchCount()
  {
-    std::list<unsigned>::iterator threads = (*activeThreads).begin();
+    std::list<unsigned>::iterator thread = activeThreads->begin();
+    assert(thread != activeThreads->end());
+    unsigned tid = *thread;
+
      panic("Branch Count Fetch policy unimplemented\n");
-    return *threads;
+    return 0 * tid;
  }