CPU: Add readBytes and writeBytes functions to the exec contexts.

[gem5.git] / src / cpu / inorder / cpu.cc
diff --git a/src/cpu/inorder/cpu.cc b/src/cpu/inorder/cpu.cc

index 969359fae88408f368048573674213d1d64d7532..059996b0723bc58b780ac1855fd64136975a4d90 100644 (file)
--- a/src/cpu/inorder/cpu.cc
+++ b/src/cpu/inorder/cpu.cc
@@ -84,25 +84,25 @@ InOrderCPU::TickEvent::description()
  }
  
  InOrderCPU::CPUEvent::CPUEvent(InOrderCPU *_cpu, CPUEventType e_type,
-                             Fault fault, ThreadID _tid, unsigned _vpe)
-    : Event(CPU_Tick_Pri), cpu(_cpu)
+                               Fault fault, ThreadID _tid, DynInstPtr inst,
+                               unsigned event_pri_offset)
+    : Event(Event::Priority((unsigned int)CPU_Tick_Pri + event_pri_offset)),
+      cpu(_cpu)
  {
-    setEvent(e_type, fault, _tid, _vpe);
+    setEvent(e_type, fault, _tid, inst);
  }
  
  
  std::string InOrderCPU::eventNames[NumCPUEvents] =
  {
      "ActivateThread",
-    "DeallocateThread",
+    "ActivateNextReadyThread",
+    "DeactivateThread",
+    "HaltThread",
      "SuspendThread",
-    "DisableThreads",
-    "EnableThreads",
-    "DisableVPEs",
-    "EnableVPEs",
      "Trap",
      "InstGraduated",
-    "SquashAll",
+    "SquashFromMemStall",
      "UpdatePCs"
  };
  
@@ -115,28 +115,24 @@ InOrderCPU::CPUEvent::process()
          cpu->activateThread(tid);
          break;
  
-      //@TODO: Consider Implementing "Suspend Thread" as Separate from Deallocate
-      case SuspendThread: // Suspend & Deallocate are same for now.
-        //cpu->suspendThread(tid);
-        //break;
-      case DeallocateThread:
-        cpu->deallocateThread(tid);
+      case ActivateNextReadyThread:
+        cpu->activateNextReadyThread();
          break;
  
-      case EnableVPEs:
-        cpu->enableVPEs(vpe);
+      case DeactivateThread:
+        cpu->deactivateThread(tid);
          break;
  
-      case DisableVPEs:
-        cpu->disableVPEs(tid, vpe);
+      case HaltThread:
+        cpu->haltThread(tid);
          break;
  
-      case EnableThreads:
-        cpu->enableThreads(vpe);
+      case SuspendThread: 
+        cpu->suspendThread(tid);
          break;
  
-      case DisableThreads:
-        cpu->disableThreads(tid, vpe);
+      case SquashFromMemStall:
+        cpu->squashDueToMemStall(inst->squashingStage, inst->seqNum, tid);
          break;
  
        case Trap:
@@ -144,12 +140,14 @@ InOrderCPU::CPUEvent::process()
          break;
  
        default:
-        fatal("Unrecognized Event Type %d", cpuEventType);
+        fatal("Unrecognized Event Type %s", eventNames[cpuEventType]);    
      }
-
+    
      cpu->cpuEventRemoveList.push(this);
  }
  
+    
+
  const char *
  InOrderCPU::CPUEvent::description()
  {
@@ -160,9 +158,11 @@ void
  InOrderCPU::CPUEvent::scheduleEvent(int delay)
  {
      if (squashed())
-      mainEventQueue.reschedule(this,curTick + cpu->ticks(delay));
+        mainEventQueue.reschedule(this, cpu->nextCycle(curTick +
+                                                       cpu->ticks(delay)));
      else if (!scheduled())
-      mainEventQueue.schedule(this,curTick + cpu->ticks(delay));
+        mainEventQueue.schedule(this, cpu->nextCycle(curTick +
+                                                     cpu->ticks(delay)));
  }
  
  void
@@ -185,11 +185,15 @@ InOrderCPU::InOrderCPU(Params *params)
        system(params->system),
        physmem(system->physmem),
  #endif // FULL_SYSTEM
+#ifdef DEBUG
+      cpuEventNum(0),
+      resReqCount(0),
+#endif // DEBUG
        switchCount(0),
        deferRegistration(false/*params->deferRegistration*/),
        stageTracing(params->stageTracing),
-      numVirtProcs(1)
-{
+      instsPerSwitch(0)
+{    
      ThreadID active_threads;
      cpu_params = params;
  
@@ -208,6 +212,24 @@ InOrderCPU::InOrderCPU(Params *params)
                "in your InOrder implementation or "
                "edit your workload size.");
      }
+
+    
+    if (active_threads > 1) {
+        threadModel = (InOrderCPU::ThreadModel) params->threadModel;
+
+        if (threadModel == SMT) {
+            DPRINTF(InOrderCPU, "Setting Thread Model to SMT.\n");            
+        } else if (threadModel == SwitchOnCacheMiss) {
+            DPRINTF(InOrderCPU, "Setting Thread Model to "
+                    "Switch On Cache Miss\n");
+        }
+        
+    } else {
+        threadModel = Single;
+    }
+     
+        
+    
  #endif
  
      // Bind the fetch & data ports from the resource pool.
@@ -224,20 +246,23 @@ InOrderCPU::InOrderCPU(Params *params)
      for (ThreadID tid = 0; tid < numThreads; ++tid) {
  #if FULL_SYSTEM
          // SMT is not supported in FS mode yet.
-        assert(this->numThreads == 1);
-        this->thread[tid] = new Thread(this, 0);
+        assert(numThreads == 1);
+        thread[tid] = new Thread(this, 0);
  #else
          if (tid < (ThreadID)params->workload.size()) {
              DPRINTF(InOrderCPU, "Workload[%i] process is %#x\n",
-                    tid, this->thread[tid]);
-            this->thread[tid] =
+                    tid, params->workload[tid]->prog_fname);
+            thread[tid] =
                  new Thread(this, tid, params->workload[tid]);
          } else {
              //Allocate Empty thread so M5 can use later
              //when scheduling threads to CPU
              Process* dummy_proc = params->workload[0];
-            this->thread[tid] = new Thread(this, tid, dummy_proc);
+            thread[tid] = new Thread(this, tid, dummy_proc);
          }
+        
+        // Eventually set this with parameters...
+        asid[tid] = tid;
  #endif
  
          // Setup the TC that will serve as the interface to the threads/CPU.
@@ -293,15 +318,30 @@ InOrderCPU::InOrderCPU(Params *params)
          memset(floatRegs.i[tid], 0, sizeof(floatRegs.i[tid]));
          isa[tid].clear();
  
-        isa[tid].expandForMultithreading(numThreads, numVirtProcs);
+        isa[tid].expandForMultithreading(numThreads, 1/*numVirtProcs*/);
+
+        // Define dummy instructions and resource requests to be used.
+        dummyInst[tid] = new InOrderDynInst(this, 
+                                            thread[tid], 
+                                            0, 
+                                            tid, 
+                                            asid[tid]);
+
+        dummyReq[tid] = new ResourceRequest(resPool->getResource(0), 
+                                            dummyInst[tid], 
+                                            0, 
+                                            0, 
+                                            0, 
+                                            0);        
      }
  
-    lastRunningCycle = curTick;
-    contextSwitch = false;
+    dummyReqInst = new InOrderDynInst(this, NULL, 0, 0, 0);
+    dummyReqInst->setSquashed();
  
-    // Define dummy instructions and resource requests to be used.
-    DynInstPtr dummyBufferInst = new InOrderDynInst(this, NULL, 0, 0);
-    dummyReq = new ResourceRequest(NULL, NULL, 0, 0, 0, 0);
+    dummyBufferInst = new InOrderDynInst(this, NULL, 0, 0, 0);
+    dummyBufferInst->setSquashed();
+    
+    lastRunningCycle = curTick;
  
      // Reset CPU to reset state.
  #if FULL_SYSTEM
@@ -311,10 +351,17 @@ InOrderCPU::InOrderCPU(Params *params)
      reset();
  #endif
  
+    dummyBufferInst->resetInstCount();
+    
      // Schedule First Tick Event, CPU will reschedule itself from here on out.
      scheduleTickEvent(0);
  }
  
+InOrderCPU::~InOrderCPU()
+{
+    delete resPool;
+}
+
  
  void
  InOrderCPU::regStats()
@@ -322,7 +369,49 @@ InOrderCPU::regStats()
      /* Register the Resource Pool's stats here.*/
      resPool->regStats();
  
+    /* Register for each Pipeline Stage */
+    for (int stage_num=0; stage_num < ThePipeline::NumStages; stage_num++) {
+        pipelineStage[stage_num]->regStats();
+    }
+
      /* Register any of the InOrderCPU's stats here.*/
+    instsPerCtxtSwitch
+        .name(name() + ".instsPerContextSwitch")
+        .desc("Instructions Committed Per Context Switch")
+        .prereq(instsPerCtxtSwitch);
+    
+    numCtxtSwitches
+        .name(name() + ".contextSwitches")
+        .desc("Number of context switches");
+
+    comLoads
+        .name(name() + ".comLoads")
+        .desc("Number of Load instructions committed");
+
+    comStores
+        .name(name() + ".comStores")
+        .desc("Number of Store instructions committed");
+
+    comBranches
+        .name(name() + ".comBranches")
+        .desc("Number of Branches instructions committed");
+
+    comNops
+        .name(name() + ".comNops")
+        .desc("Number of Nop instructions committed");
+
+    comNonSpec
+        .name(name() + ".comNonSpec")
+        .desc("Number of Non-Speculative instructions committed");
+
+    comInts
+        .name(name() + ".comInts")
+        .desc("Number of Integer instructions committed");
+
+    comFloats
+        .name(name() + ".comFloats")
+        .desc("Number of Floating Point instructions committed");
+            
      timesIdled
          .name(name() + ".timesIdled")
          .desc("Number of times that the entire CPU went into an idle state and"
@@ -331,9 +420,17 @@ InOrderCPU::regStats()
  
      idleCycles
          .name(name() + ".idleCycles")
-        .desc("Total number of cycles that the CPU has spent unscheduled due "
-              "to idling")
-        .prereq(idleCycles);
+        .desc("Number of cycles cpu's stages were not processed");
+
+    runCycles
+        .name(name() + ".runCycles")
+        .desc("Number of cycles cpu stages are processed.");
+
+    activity
+        .name(name() + ".activity")
+        .desc("Percentage of cycles cpu is active")
+        .precision(6);
+    activity = (runCycles / numCycles) * 100;
  
      threadCycles
          .init(numThreads)
@@ -342,7 +439,7 @@ InOrderCPU::regStats()
  
      smtCycles
          .name(name() + ".smtCycles")
-        .desc("Total number of cycles that the CPU was simultaneous multithreading.(SMT)");
+        .desc("Total number of cycles that the CPU was in SMT-mode");
  
      committedInsts
          .init(numThreads)
@@ -362,7 +459,7 @@ InOrderCPU::regStats()
          .name(name() + ".cpi")
          .desc("CPI: Cycles Per Instruction (Per-Thread)")
          .precision(6);
-    cpi = threadCycles / committedInsts;
+    cpi = numCycles / committedInsts;
  
      smtCpi
          .name(name() + ".smt_cpi")
@@ -380,7 +477,7 @@ InOrderCPU::regStats()
          .name(name() + ".ipc")
          .desc("IPC: Instructions Per Cycle (Per-Thread)")
          .precision(6);
-    ipc =  committedInsts / threadCycles;
+    ipc =  committedInsts / numCycles;
  
      smtIpc
          .name(name() + ".smt_ipc")
@@ -405,18 +502,27 @@ InOrderCPU::tick()
  
      ++numCycles;
  
+    bool pipes_idle = true;
+    
      //Tick each of the stages
      for (int stNum=NumStages - 1; stNum >= 0 ; stNum--) {
          pipelineStage[stNum]->tick();
+
+        pipes_idle = pipes_idle && pipelineStage[stNum]->idle;
      }
  
+    if (pipes_idle)
+        idleCycles++;
+    else
+        runCycles++;
+    
      // Now advance the time buffers one tick
      timeBuffer.advance();
      for (int sqNum=0; sqNum < NumStages - 1; sqNum++) {
          stageQueue[sqNum]->advance();
      }
      activityRec.advance();
-
+   
      // Any squashed requests, events, or insts then remove them now
      cleanUpRemovedReqs();
      cleanUpRemovedEvents();
@@ -435,7 +541,8 @@ InOrderCPU::tick()
              //Tick next_tick = curTick + cycles(1);
              //tickEvent.schedule(next_tick);
              mainEventQueue.schedule(&tickEvent, nextCycle(curTick + 1));
-            DPRINTF(InOrderCPU, "Scheduled CPU for next tick @ %i.\n", nextCycle(curTick + 1));
+            DPRINTF(InOrderCPU, "Scheduled CPU for next tick @ %i.\n", 
+                    nextCycle(curTick + 1));
          }
      }
  
@@ -476,7 +583,7 @@ InOrderCPU::reset()
  {
      for (int i = 0; i < numThreads; i++) {
          isa[i].reset(coreType, numThreads,
-                numVirtProcs, dynamic_cast<BaseCPU*>(this));
+                     1/*numVirtProcs*/, dynamic_cast<BaseCPU*>(this));
      }
  }
  
@@ -545,7 +652,7 @@ void
  InOrderCPU::trap(Fault fault, ThreadID tid, int delay)
  {
      //@ Squash Pipeline during TRAP
-    scheduleCpuEvent(Trap, fault, tid, 0/*vpe*/, delay);
+    scheduleCpuEvent(Trap, fault, tid, dummyInst[tid], delay);
  }
  
  void
@@ -554,28 +661,57 @@ InOrderCPU::trapCPU(Fault fault, ThreadID tid)
      fault->invoke(tcBase(tid));
  }
  
+void 
+InOrderCPU::squashFromMemStall(DynInstPtr inst, ThreadID tid, int delay)
+{
+    scheduleCpuEvent(SquashFromMemStall, NoFault, tid, inst, delay);
+}
+
+
+void
+InOrderCPU::squashDueToMemStall(int stage_num, InstSeqNum seq_num,
+                                ThreadID tid)
+{
+    DPRINTF(InOrderCPU, "Squashing Pipeline Stages Due to Memory Stall...\n");
+        
+    // Squash all instructions in each stage including 
+    // instruction that caused the squash (seq_num - 1)
+    // NOTE: The stage bandwidth needs to be cleared so thats why
+    //       the stalling instruction is squashed as well. The stalled
+    //       instruction is previously placed in another intermediate buffer
+    //       while it's stall is being handled.
+    InstSeqNum squash_seq_num = seq_num - 1;
+    
+    for (int stNum=stage_num; stNum >= 0 ; stNum--) {
+        pipelineStage[stNum]->squashDueToMemStall(squash_seq_num, tid);
+    }
+}
+
  void
  InOrderCPU::scheduleCpuEvent(CPUEventType c_event, Fault fault,
-                           ThreadID tid, unsigned vpe, unsigned delay)
+                             ThreadID tid, DynInstPtr inst, 
+                             unsigned delay, unsigned event_pri_offset)
  {
-    CPUEvent *cpu_event = new CPUEvent(this, c_event, fault, tid, vpe);
+    CPUEvent *cpu_event = new CPUEvent(this, c_event, fault, tid, inst,
+                                       event_pri_offset);
  
+    Tick sked_tick = nextCycle(curTick + ticks(delay));
      if (delay >= 0) {
-        DPRINTF(InOrderCPU, "Scheduling CPU Event (%s) for cycle %i.\n",
-                eventNames[c_event], curTick + delay);
-        mainEventQueue.schedule(cpu_event,curTick + delay);
+        DPRINTF(InOrderCPU, "Scheduling CPU Event (%s) for cycle %i, [tid:%i].\n",
+                eventNames[c_event], curTick + delay, tid);
+        mainEventQueue.schedule(cpu_event, sked_tick);
      } else {
          cpu_event->process();
          cpuEventRemoveList.push(cpu_event);
      }
  
      // Broadcast event to the Resource Pool
-    DynInstPtr dummy_inst =
-        new InOrderDynInst(this, NULL, getNextEventNum(), tid);
-    resPool->scheduleEvent(c_event, dummy_inst, 0, 0, tid);
+    // Need to reset tid just in case this is a dummy instruction
+    inst->setTid(tid);        
+    resPool->scheduleEvent(c_event, inst, 0, 0, tid);
  }
  
-inline bool
+bool
  InOrderCPU::isThreadActive(ThreadID tid)
  {
    list<ThreadID>::iterator isActive =
@@ -584,206 +720,144 @@ InOrderCPU::isThreadActive(ThreadID tid)
      return (isActive != activeThreads.end());
  }
  
-
-void
-InOrderCPU::activateThread(ThreadID tid)
+bool
+InOrderCPU::isThreadReady(ThreadID tid)
  {
-    if (!isThreadActive(tid)) {
-        DPRINTF(InOrderCPU,
-                "Adding Thread %i to active threads list in CPU.\n", tid);
-        activeThreads.push_back(tid);
+  list<ThreadID>::iterator isReady =
+      std::find(readyThreads.begin(), readyThreads.end(), tid);
  
-        wakeCPU();
-    }
+    return (isReady != readyThreads.end());
  }
  
-void
-InOrderCPU::deactivateThread(ThreadID tid)
-{
-    DPRINTF(InOrderCPU, "[tid:%i]: Calling deactivate thread.\n", tid);
-
-    if (isThreadActive(tid)) {
-        DPRINTF(InOrderCPU,"[tid:%i]: Removing from active threads list\n",
-                tid);
-        list<ThreadID>::iterator thread_it =
-            std::find(activeThreads.begin(), activeThreads.end(), tid);
-
-        removePipelineStalls(*thread_it);
-
-        //@TODO: change stage status' to Idle?
-
-        activeThreads.erase(thread_it);
-    }
-}
-
-void
-InOrderCPU::removePipelineStalls(ThreadID tid)
-{
-    DPRINTF(InOrderCPU,"[tid:%i]: Removing all pipeline stalls\n",
-            tid);
-
-    for (int stNum = 0; stNum < NumStages ; stNum++) {
-        pipelineStage[stNum]->removeStalls(tid);
-    }
-
-}
  bool
-InOrderCPU::isThreadInCPU(ThreadID tid)
+InOrderCPU::isThreadSuspended(ThreadID tid)
  {
-  list<ThreadID>::iterator isCurrent =
-      std::find(currentThreads.begin(), currentThreads.end(), tid);
+  list<ThreadID>::iterator isSuspended =
+      std::find(suspendedThreads.begin(), suspendedThreads.end(), tid);
  
-    return (isCurrent != currentThreads.end());
+    return (isSuspended != suspendedThreads.end());
  }
  
  void
-InOrderCPU::addToCurrentThreads(ThreadID tid)
-{
-    if (!isThreadInCPU(tid)) {
-        DPRINTF(InOrderCPU, "Adding Thread %i to current threads list in CPU.\n",
-                tid);
-        currentThreads.push_back(tid);
-    }
+InOrderCPU::activateNextReadyThread()
+{
+    if (readyThreads.size() >= 1) {          
+        ThreadID ready_tid = readyThreads.front();
+        
+        // Activate in Pipeline
+        activateThread(ready_tid);                        
+        
+        // Activate in Resource Pool
+        resPool->activateAll(ready_tid);
+        
+        list<ThreadID>::iterator ready_it =
+            std::find(readyThreads.begin(), readyThreads.end(), ready_tid);
+        readyThreads.erase(ready_it);                        
+    } else {
+        DPRINTF(InOrderCPU,
+                "Attempting to activate new thread, but No Ready Threads to"
+                "activate.\n");
+        DPRINTF(InOrderCPU,
+                "Unable to switch to next active thread.\n");
+    }        
  }
  
  void
-InOrderCPU::removeFromCurrentThreads(ThreadID tid)
+InOrderCPU::activateThread(ThreadID tid)
  {
-    if (isThreadInCPU(tid)) {
+    if (isThreadSuspended(tid)) {
          DPRINTF(InOrderCPU,
-                "Adding Thread %i to current threads list in CPU.\n", tid);
-        list<ThreadID>::iterator isCurrent =
-            std::find(currentThreads.begin(), currentThreads.end(), tid);
-        currentThreads.erase(isCurrent);
-    }
-}
+                "Removing [tid:%i] from suspended threads list.\n", tid);
  
-bool
-InOrderCPU::isThreadSuspended(ThreadID tid)
-{
-  list<ThreadID>::iterator isSuspended =
-      std::find(suspendedThreads.begin(), suspendedThreads.end(), tid);
-
-    return (isSuspended!= suspendedThreads.end());
-}
+        list<ThreadID>::iterator susp_it =
+            std::find(suspendedThreads.begin(), suspendedThreads.end(), 
+                      tid);
+        suspendedThreads.erase(susp_it);                        
+    }
  
-void
-InOrderCPU::enableVirtProcElement(unsigned vpe)
-{
-    DPRINTF(InOrderCPU, "[vpe:%i]: Scheduling  "
-            "Enabling of concurrent virtual processor execution",
-            vpe);
+    if (threadModel == SwitchOnCacheMiss &&
+        numActiveThreads() == 1) {
+        DPRINTF(InOrderCPU,
+                "Ignoring activation of [tid:%i], since [tid:%i] is "
+                "already running.\n", tid, activeThreadId());
+        
+        DPRINTF(InOrderCPU,"Placing [tid:%i] on ready threads list\n", 
+                tid);        
+
+        readyThreads.push_back(tid);
+        
+    } else if (!isThreadActive(tid)) {                
+        DPRINTF(InOrderCPU,
+                "Adding [tid:%i] to active threads list.\n", tid);
+        activeThreads.push_back(tid);
+        
+        activateThreadInPipeline(tid);
  
-    scheduleCpuEvent(EnableVPEs, NoFault, 0/*tid*/, vpe);
-}
+        thread[tid]->lastActivate = curTick;            
  
-void
-InOrderCPU::enableVPEs(unsigned vpe)
-{
-    DPRINTF(InOrderCPU, "[vpe:%i]: Enabling Concurrent Execution "
-            "virtual processors %i", vpe);
+        tcBase(tid)->setStatus(ThreadContext::Active);    
  
-    list<ThreadID>::iterator thread_it = currentThreads.begin();
+        wakeCPU();
  
-    while (thread_it != currentThreads.end()) {
-        if (!isThreadSuspended(*thread_it)) {
-            activateThread(*thread_it);
-        }
-        thread_it++;
+        numCtxtSwitches++;        
      }
  }
  
  void
-InOrderCPU::disableVirtProcElement(ThreadID tid, unsigned vpe)
+InOrderCPU::activateThreadInPipeline(ThreadID tid)
  {
-    DPRINTF(InOrderCPU, "[vpe:%i]: Scheduling  "
-            "Disabling of concurrent virtual processor execution",
-            vpe);
-
-    scheduleCpuEvent(DisableVPEs, NoFault, 0/*tid*/, vpe);
+    for (int stNum=0; stNum < NumStages; stNum++) {
+        pipelineStage[stNum]->activateThread(tid);
+    }    
  }
  
  void
-InOrderCPU::disableVPEs(ThreadID tid, unsigned vpe)
+InOrderCPU::deactivateContext(ThreadID tid, int delay)
  {
-    DPRINTF(InOrderCPU, "[vpe:%i]: Disabling Concurrent Execution of "
-            "virtual processors %i", vpe);
+    DPRINTF(InOrderCPU,"[tid:%i]: Deactivating ...\n", tid);
  
-    unsigned base_vpe = TheISA::getVirtProcNum(tcBase(tid));
-
-    list<ThreadID>::iterator thread_it = activeThreads.begin();
+    scheduleCpuEvent(DeactivateThread, NoFault, tid, dummyInst[tid], delay);
  
-    vector<list<ThreadID>::iterator> removeList;
-
-    while (thread_it != activeThreads.end()) {
-        if (base_vpe != vpe) {
-            removeList.push_back(thread_it);
-        }
-        thread_it++;
-    }
+    // Be sure to signal that there's some activity so the CPU doesn't
+    // deschedule itself.
+    activityRec.activity();
  
-    for (int i = 0; i < removeList.size(); i++) {
-        activeThreads.erase(removeList[i]);
-    }
+    _status = Running;
  }
  
  void
-InOrderCPU::enableMultiThreading(unsigned vpe)
+InOrderCPU::deactivateThread(ThreadID tid)
  {
-    // Schedule event to take place at end of cycle
-    DPRINTF(InOrderCPU, "[vpe:%i]: Scheduling Enable Multithreading on "
-            "virtual processor %i", vpe);
+    DPRINTF(InOrderCPU, "[tid:%i]: Calling deactivate thread.\n", tid);
  
-    scheduleCpuEvent(EnableThreads, NoFault, 0/*tid*/, vpe);
-}
+    if (isThreadActive(tid)) {
+        DPRINTF(InOrderCPU,"[tid:%i]: Removing from active threads list\n",
+                tid);
+        list<ThreadID>::iterator thread_it =
+            std::find(activeThreads.begin(), activeThreads.end(), tid);
  
-void
-InOrderCPU::enableThreads(unsigned vpe)
-{
-    DPRINTF(InOrderCPU, "[vpe:%i]: Enabling Multithreading on "
-            "virtual processor %i", vpe);
+        removePipelineStalls(*thread_it);
  
-    list<ThreadID>::iterator thread_it = currentThreads.begin();
+        activeThreads.erase(thread_it);
  
-    while (thread_it != currentThreads.end()) {
-        if (TheISA::getVirtProcNum(tcBase(*thread_it)) == vpe) {
-            if (!isThreadSuspended(*thread_it)) {
-                activateThread(*thread_it);
-            }
-        }
-        thread_it++;
+        // Ideally, this should be triggered from the
+        // suspendContext/Thread functions
+        tcBase(tid)->setStatus(ThreadContext::Suspended);    
      }
-}
-void
-InOrderCPU::disableMultiThreading(ThreadID tid, unsigned vpe)
-{
-    // Schedule event to take place at end of cycle
-   DPRINTF(InOrderCPU, "[tid:%i]: Scheduling Disable Multithreading on "
-            "virtual processor %i", tid, vpe);
  
-    scheduleCpuEvent(DisableThreads, NoFault, tid, vpe);
+    assert(!isThreadActive(tid));    
  }
  
  void
-InOrderCPU::disableThreads(ThreadID tid, unsigned vpe)
+InOrderCPU::removePipelineStalls(ThreadID tid)
  {
-    DPRINTF(InOrderCPU, "[tid:%i]: Disabling Multithreading on "
-            "virtual processor %i", tid, vpe);
-
-    list<ThreadID>::iterator thread_it = activeThreads.begin();
-
-    vector<list<ThreadID>::iterator> removeList;
+    DPRINTF(InOrderCPU,"[tid:%i]: Removing all pipeline stalls\n",
+            tid);
  
-    while (thread_it != activeThreads.end()) {
-        if (TheISA::getVirtProcNum(tcBase(*thread_it)) == vpe) {
-            removeList.push_back(thread_it);
-        }
-        thread_it++;
+    for (int stNum = 0; stNum < NumStages ; stNum++) {
+        pipelineStage[stNum]->removeStalls(tid);
      }
  
-    for (int i = 0; i < removeList.size(); i++) {
-        activeThreads.erase(removeList[i]);
-    }
  }
  
  void
@@ -825,7 +899,8 @@ InOrderCPU::activateContext(ThreadID tid, int delay)
  {
      DPRINTF(InOrderCPU,"[tid:%i]: Activating ...\n", tid);
  
-    scheduleCpuEvent(ActivateThread, NoFault, tid, 0/*vpe*/, delay);
+    
+    scheduleCpuEvent(ActivateThread, NoFault, tid, dummyInst[tid], delay);
  
      // Be sure to signal that there's some activity so the CPU doesn't
      // deschedule itself.
@@ -834,71 +909,74 @@ InOrderCPU::activateContext(ThreadID tid, int delay)
      _status = Running;
  }
  
-
  void
-InOrderCPU::suspendContext(ThreadID tid, int delay)
+InOrderCPU::activateNextReadyContext(int delay)
  {
-    scheduleCpuEvent(SuspendThread, NoFault, tid, 0/*vpe*/, delay);
-    //_status = Idle;
-}
+    DPRINTF(InOrderCPU,"Activating next ready thread\n");
  
-void
-InOrderCPU::suspendThread(ThreadID tid)
-{
-    DPRINTF(InOrderCPU,"[tid: %i]: Suspended ...\n", tid);
-    deactivateThread(tid);
-}
+    // NOTE: Add 5 to the event priority so that we always activate
+    // threads after we've finished deactivating, squashing,etc.
+    // other threads
+    scheduleCpuEvent(ActivateNextReadyThread, NoFault, 0/*tid*/, dummyInst[0], 
+                     delay, 5);
  
-void
-InOrderCPU::deallocateContext(ThreadID tid, int delay)
-{
-    scheduleCpuEvent(DeallocateThread, NoFault, tid, 0/*vpe*/, delay);
+    // Be sure to signal that there's some activity so the CPU doesn't
+    // deschedule itself.
+    activityRec.activity();
+
+    _status = Running;
  }
  
  void
-InOrderCPU::deallocateThread(ThreadID tid)
+InOrderCPU::haltContext(ThreadID tid, int delay)
  {
-    DPRINTF(InOrderCPU,"[tid:%i]: Deallocating ...", tid);
+    DPRINTF(InOrderCPU, "[tid:%i]: Calling Halt Context...\n", tid);
  
-    removeFromCurrentThreads(tid);
+    scheduleCpuEvent(HaltThread, NoFault, tid, dummyInst[tid], delay);
  
-    deactivateThread(tid);
-
-    squashThreadInPipeline(tid);
+    activityRec.activity();
  }
  
  void
-InOrderCPU::squashThreadInPipeline(ThreadID tid)
+InOrderCPU::haltThread(ThreadID tid)
  {
-    //Squash all instructions in each stage
-    for (int stNum=NumStages - 1; stNum >= 0 ; stNum--) {
-        pipelineStage[stNum]->squash(0 /*seq_num*/, tid);
+    DPRINTF(InOrderCPU, "[tid:%i]: Placing on Halted Threads List...\n", tid);
+    deactivateThread(tid);
+    squashThreadInPipeline(tid);   
+    haltedThreads.push_back(tid);    
+
+    tcBase(tid)->setStatus(ThreadContext::Halted);    
+
+    if (threadModel == SwitchOnCacheMiss) {        
+        activateNextReadyContext();    
      }
  }
  
  void
-InOrderCPU::haltContext(ThreadID tid, int delay)
+InOrderCPU::suspendContext(ThreadID tid, int delay)
  {
-    DPRINTF(InOrderCPU, "[tid:%i]: Halt context called.\n", tid);
-
-    // Halt is same thing as deallocate for now
-    // @TODO: Differentiate between halt & deallocate in the CPU
-    // model
-    deallocateContext(tid, delay);
+    scheduleCpuEvent(SuspendThread, NoFault, tid, dummyInst[tid], delay);
  }
  
  void
-InOrderCPU::insertThread(ThreadID tid)
+InOrderCPU::suspendThread(ThreadID tid)
  {
-    panic("Unimplemented Function\n.");
+    DPRINTF(InOrderCPU, "[tid:%i]: Placing on Suspended Threads List...\n",
+            tid);
+    deactivateThread(tid);
+    suspendedThreads.push_back(tid);    
+    thread[tid]->lastSuspend = curTick;    
+
+    tcBase(tid)->setStatus(ThreadContext::Suspended);    
  }
  
  void
-InOrderCPU::removeThread(ThreadID tid)
+InOrderCPU::squashThreadInPipeline(ThreadID tid)
  {
-    DPRINTF(InOrderCPU, "Removing Thread %i from CPU.\n", tid);
-
-    /** Broadcast to CPU resources*/
+    //Squash all instructions in each stage
+    for (int stNum=NumStages - 1; stNum >= 0 ; stNum--) {
+        pipelineStage[stNum]->squash(0 /*seq_num*/, tid);
+    }
  }
  
  PipelineStage*
@@ -907,14 +985,6 @@ InOrderCPU::getPipeStage(int stage_num)
      return pipelineStage[stage_num];
  }
  
-
-void
-InOrderCPU::activateWhenReady(ThreadID tid)
-{
-    panic("Unimplemented Function\n.");
-}
-
-
  uint64_t
  InOrderCPU::readPC(ThreadID tid)
  {
@@ -1002,9 +1072,11 @@ InOrderCPU::readRegOtherThread(unsigned reg_idx, ThreadID tid)
          tid = TheISA::getTargetThread(tcBase(tid));
      }
  
-    if (reg_idx < FP_Base_DepTag) {                   // Integer Register File
+    if (reg_idx < FP_Base_DepTag) {                   
+        // Integer Register File
          return readIntReg(reg_idx, tid);
-    } else if (reg_idx < Ctrl_Base_DepTag) {          // Float Register File
+    } else if (reg_idx < Ctrl_Base_DepTag) {          
+        // Float Register File
          reg_idx -= FP_Base_DepTag;
          return readFloatRegBits(reg_idx, tid);
      } else {
@@ -1067,16 +1139,35 @@ InOrderCPU::addInst(DynInstPtr &inst)
      return --(instList[tid].end());
  }
  
+void 
+InOrderCPU::updateContextSwitchStats()
+{
+    // Set Average Stat Here, then reset to 0    
+    instsPerCtxtSwitch = instsPerSwitch;
+    instsPerSwitch = 0;
+}
+
+    
  void
  InOrderCPU::instDone(DynInstPtr inst, ThreadID tid)
  {
-    // Set the CPU's PCs - This contributes to the precise state of the CPU which can be used
-    // when restoring a thread to the CPU after a fork or after an exception
-    // @TODO: Set-Up Grad-Info/Committed-Info to let ThreadState know if it's a branch or not
+    // Set the CPU's PCs - This contributes to the precise state of the CPU 
+    // which can be used when restoring a thread to the CPU after after any
+    // type of context switching activity (fork, exception, etc.)
      setPC(inst->readPC(), tid);
      setNextPC(inst->readNextPC(), tid);
      setNextNPC(inst->readNextNPC(), tid);
  
+    if (inst->isControl()) {
+        thread[tid]->lastGradIsBranch = true;
+        thread[tid]->lastBranchPC = inst->readPC();
+        thread[tid]->lastBranchNextPC = inst->readNextPC();
+        thread[tid]->lastBranchNextNPC = inst->readNextNPC();        
+    } else {
+        thread[tid]->lastGradIsBranch = false;
+    }
+        
+
      // Finalize Trace Data For Instruction
      if (inst->traceData) {
          //inst->traceData->setCycle(curTick);
@@ -1087,9 +1178,9 @@ InOrderCPU::instDone(DynInstPtr inst, ThreadID tid)
          inst->traceData = NULL;
      }
  
-    // Set Last Graduated Instruction In Thread State
-    //thread[tid]->lastGradInst = inst;
-
+    // Increment active thread's instruction count
+    instsPerSwitch++;
+    
      // Increment thread-state's instruction count
      thread[tid]->numInst++;
  
@@ -1107,23 +1198,53 @@ InOrderCPU::instDone(DynInstPtr inst, ThreadID tid)
          smtCommittedInsts[tid]++;
      }
  
+    // Instruction-Mix Stats
+    if (inst->isLoad()) {
+        comLoads++;
+    } else if (inst->isStore()) {
+        comStores++;
+    } else if (inst->isControl()) {
+        comBranches++;
+    } else if (inst->isNop()) {
+        comNops++;
+    } else if (inst->isNonSpeculative()) {
+        comNonSpec++;
+    } else if (inst->isInteger()) {
+        comInts++;
+    } else if (inst->isFloating()) {
+        comFloats++;
+    }
+
      // Check for instruction-count-based events.
      comInstEventQueue[tid]->serviceEvents(thread[tid]->numInst);
  
      // Broadcast to other resources an instruction
      // has been completed
-    resPool->scheduleEvent((CPUEventType)ResourcePool::InstGraduated, inst, tid);
+    resPool->scheduleEvent((CPUEventType)ResourcePool::InstGraduated, inst, 
+                           0, 0, tid);
  
      // Finally, remove instruction from CPU
      removeInst(inst);
  }
  
+// currently unused function, but substitute repetitive code w/this function
+// call
  void
  InOrderCPU::addToRemoveList(DynInstPtr &inst)
  {
      removeInstsThisCycle = true;
-
-    removeList.push(inst->getInstListIt());
+    if (!inst->isRemoveList()) {            
+        DPRINTF(InOrderCPU, "Pushing instruction [tid:%i] PC %#x "
+                "[sn:%lli] to remove list\n",
+                inst->threadNumber, inst->readPC(), inst->seqNum);
+        inst->setRemoveList();        
+        removeList.push(inst->getInstListIt());
+    }  else {
+        DPRINTF(InOrderCPU, "Ignoring instruction removal for [tid:%i] PC %#x "
+                "[sn:%lli], already remove list\n",
+                inst->threadNumber, inst->readPC(), inst->seqNum);
+    }
+    
  }
  
  void
@@ -1136,7 +1257,18 @@ InOrderCPU::removeInst(DynInstPtr &inst)
      removeInstsThisCycle = true;
  
      // Remove the instruction.
-    removeList.push(inst->getInstListIt());
+    if (!inst->isRemoveList()) {            
+        DPRINTF(InOrderCPU, "Pushing instruction [tid:%i] PC %#x "
+                "[sn:%lli] to remove list\n",
+                inst->threadNumber, inst->readPC(), inst->seqNum);
+        inst->setRemoveList();        
+        removeList.push(inst->getInstListIt());
+    } else {
+        DPRINTF(InOrderCPU, "Ignoring instruction removal for [tid:%i] PC %#x "
+                "[sn:%lli], already on remove list\n",
+                inst->threadNumber, inst->readPC(), inst->seqNum);
+    }
+
  }
  
  void
@@ -1150,7 +1282,7 @@ InOrderCPU::removeInstsUntil(const InstSeqNum &seq_num, ThreadID tid)
  
      inst_iter--;
  
-    DPRINTF(InOrderCPU, "Deleting instructions from CPU instruction "
+    DPRINTF(InOrderCPU, "Squashing instructions from CPU instruction "
              "list that are from [tid:%i] and above [sn:%lli] (end=%lli).\n",
              tid, seq_num, (*inst_iter)->seqNum);
  
@@ -1180,8 +1312,22 @@ InOrderCPU::squashInstIt(const ListIt &instIt, ThreadID tid)
  
          (*instIt)->setSquashed();
  
-        removeList.push(instIt);
+        if (!(*instIt)->isRemoveList()) {            
+            DPRINTF(InOrderCPU, "Pushing instruction [tid:%i] PC %#x "
+                    "[sn:%lli] to remove list\n",
+                    (*instIt)->threadNumber, (*instIt)->readPC(), 
+                    (*instIt)->seqNum);
+            (*instIt)->setRemoveList();        
+            removeList.push(instIt);
+        } else {
+            DPRINTF(InOrderCPU, "Ignoring instruction removal for [tid:%i]"
+                    " PC %#x [sn:%lli], already on remove list\n",
+                    (*instIt)->threadNumber, (*instIt)->readPC(), 
+                    (*instIt)->seqNum);
+        }
+    
      }
+    
  }
  
  
@@ -1193,7 +1339,7 @@ InOrderCPU::cleanUpRemovedInsts()
                  "[tid:%i] [sn:%lli] PC %#x\n",
                  (*removeList.front())->threadNumber,
                  (*removeList.front())->seqNum,
-                (*removeList.front())->readPC());
+               (*removeList.front())->readPC());
  
          DynInstPtr inst = *removeList.front();
          ThreadID tid = inst->threadNumber;
@@ -1201,7 +1347,7 @@ InOrderCPU::cleanUpRemovedInsts()
          // Make Sure Resource Schedule Is Emptied Out
          ThePipeline::ResSchedule *inst_sched = &inst->resSched;
          while (!inst_sched->empty()) {
-            ThePipeline::ScheduleEntry* sch_entry = inst_sched->top();
+            ScheduleEntry* sch_entry = inst_sched->top();
              inst_sched->pop();
              delete sch_entry;
          }
@@ -1221,11 +1367,6 @@ InOrderCPU::cleanUpRemovedInsts()
          instList[tid].erase(removeList.front());
  
          removeList.pop();
-
-        DPRINTF(RefCount, "pop from remove list: [sn:%i]: Refcount = %i.\n",
-                inst->seqNum,
-                0/*inst->curCount()*/);
-
      }
  
      removeInstsThisCycle = false;
@@ -1237,22 +1378,19 @@ InOrderCPU::cleanUpRemovedReqs()
      while (!reqRemoveList.empty()) {
          ResourceRequest *res_req = reqRemoveList.front();
  
-        DPRINTF(RefCount, "[tid:%i]: Removing Request, "
-                "[sn:%lli] [slot:%i] [stage_num:%i] [res:%s] [refcount:%i].\n",
+        DPRINTF(RefCount, "[tid:%i] [sn:%lli]: Removing Request "
+                "[stage_num:%i] [res:%s] [slot:%i] [completed:%i].\n",
                  res_req->inst->threadNumber,
                  res_req->inst->seqNum,
-                res_req->getSlot(),
                  res_req->getStageNum(),
                  res_req->res->name(),
-                0/*res_req->inst->curCount()*/);
+                (res_req->isCompleted()) ?
+                res_req->getComplSlot() : res_req->getSlot(),
+                res_req->isCompleted());
  
          reqRemoveList.pop();
  
          delete res_req;
-
-        DPRINTF(RefCount, "after remove request: [sn:%i]: Refcount = %i.\n",
-                res_req->inst->seqNum,
-                0/*res_req->inst->curCount()*/);
      }
  }
  
@@ -1297,10 +1435,16 @@ InOrderCPU::wakeCPU()
  
      DPRINTF(Activity, "Waking up CPU\n");
  
-    //@todo: figure out how to count idleCycles correctly
-    //idleCycles += (curTick - 1) - lastRunningCycle;
+    Tick extra_cycles = tickToCycles((curTick - 1) - lastRunningCycle);
+
+    idleCycles += extra_cycles;    
+    for (int stage_num = 0; stage_num < NumStages; stage_num++) {
+        pipelineStage[stage_num]->idleCycles += extra_cycles;
+    }    
+
+    numCycles += extra_cycles;
  
-    mainEventQueue.schedule(&tickEvent, curTick);
+    mainEventQueue.schedule(&tickEvent, nextCycle(curTick));
  }
  
  #if FULL_SYSTEM
@@ -1374,131 +1518,25 @@ InOrderCPU::getDTBPtr()
      return dtb_res->tlb();
  }
  
-template <class T>
  Fault
-InOrderCPU::read(DynInstPtr inst, Addr addr, T &data, unsigned flags)
+InOrderCPU::read(DynInstPtr inst, Addr addr,
+                 uint8_t *data, unsigned size, unsigned flags)
  {
      //@TODO: Generalize name "CacheUnit" to "MemUnit" just in case
      //       you want to run w/out caches?
-    CacheUnit *cache_res = dynamic_cast<CacheUnit*>(resPool->getResource(dataPortIdx));
-
-    return cache_res->read(inst, addr, data, flags);
-}
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS
-
-template
-Fault
-InOrderCPU::read(DynInstPtr inst, Addr addr, Twin32_t &data, unsigned flags);
-
-template
-Fault
-InOrderCPU::read(DynInstPtr inst, Addr addr, Twin64_t &data, unsigned flags);
-
-template
-Fault
-InOrderCPU::read(DynInstPtr inst, Addr addr, uint64_t &data, unsigned flags);
-
-template
-Fault
-InOrderCPU::read(DynInstPtr inst, Addr addr, uint32_t &data, unsigned flags);
-
-template
-Fault
-InOrderCPU::read(DynInstPtr inst, Addr addr, uint16_t &data, unsigned flags);
-
-template
-Fault
-InOrderCPU::read(DynInstPtr inst, Addr addr, uint8_t &data, unsigned flags);
-
-#endif //DOXYGEN_SHOULD_SKIP_THIS
-
-template<>
-Fault
-InOrderCPU::read(DynInstPtr inst, Addr addr, double &data, unsigned flags)
-{
-    return read(inst, addr, *(uint64_t*)&data, flags);
-}
-
-template<>
-Fault
-InOrderCPU::read(DynInstPtr inst, Addr addr, float &data, unsigned flags)
-{
-    return read(inst, addr, *(uint32_t*)&data, flags);
-}
-
+    CacheUnit *cache_res = 
+        dynamic_cast<CacheUnit*>(resPool->getResource(dataPortIdx));
  
-template<>
-Fault
-InOrderCPU::read(DynInstPtr inst, Addr addr, int32_t &data, unsigned flags)
-{
-    return read(inst, addr, (uint32_t&)data, flags);
+    return cache_res->read(inst, addr, data, size, flags);
  }
  
-template <class T>
  Fault
-InOrderCPU::write(DynInstPtr inst, T data, Addr addr, unsigned flags,
-                  uint64_t *write_res)
+InOrderCPU::write(DynInstPtr inst, uint8_t *data, unsigned size,
+                  Addr addr, unsigned flags, uint64_t *write_res)
  {
      //@TODO: Generalize name "CacheUnit" to "MemUnit" just in case
      //       you want to run w/out caches?
      CacheUnit *cache_res =
          dynamic_cast<CacheUnit*>(resPool->getResource(dataPortIdx));
-    return cache_res->write(inst, data, addr, flags, write_res);
-}
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS
-
-template
-Fault
-InOrderCPU::write(DynInstPtr inst, Twin32_t data, Addr addr,
-                       unsigned flags, uint64_t *res);
-
-template
-Fault
-InOrderCPU::write(DynInstPtr inst, Twin64_t data, Addr addr,
-                       unsigned flags, uint64_t *res);
-
-template
-Fault
-InOrderCPU::write(DynInstPtr inst, uint64_t data, Addr addr,
-                       unsigned flags, uint64_t *res);
-
-template
-Fault
-InOrderCPU::write(DynInstPtr inst, uint32_t data, Addr addr,
-                       unsigned flags, uint64_t *res);
-
-template
-Fault
-InOrderCPU::write(DynInstPtr inst, uint16_t data, Addr addr,
-                       unsigned flags, uint64_t *res);
-
-template
-Fault
-InOrderCPU::write(DynInstPtr inst, uint8_t data, Addr addr,
-                       unsigned flags, uint64_t *res);
-
-#endif //DOXYGEN_SHOULD_SKIP_THIS
-
-template<>
-Fault
-InOrderCPU::write(DynInstPtr inst, double data, Addr addr, unsigned flags, uint64_t *res)
-{
-    return write(inst, *(uint64_t*)&data, addr, flags, res);
-}
-
-template<>
-Fault
-InOrderCPU::write(DynInstPtr inst, float data, Addr addr, unsigned flags, uint64_t *res)
-{
-    return write(inst, *(uint32_t*)&data, addr, flags, res);
-}
-
-
-template<>
-Fault
-InOrderCPU::write(DynInstPtr inst, int32_t data, Addr addr, unsigned flags, uint64_t *res)
-{
-    return write(inst, (uint32_t)data, addr, flags, res);
+    return cache_res->write(inst, data, size, addr, flags, write_res);
  }