From e3d5588ca70c88318c1e41e438102034c92c561e Mon Sep 17 00:00:00 2001
From: Kevin Lim <ktlim@umich.edu>
Date: Fri, 19 May 2006 15:53:17 -0400
Subject: [PATCH] O3 code update/cleanup.

cpu/o3/commit_impl.hh:
    O3 code update/cleanup.  Fetch fault code no longer needed (see previous checkin).

--HG--
extra : convert_revision : f602e7f978e19b8900dce482f38f9c7a195e94da
---
 cpu/o3/2bit_local_pred.cc   |   2 +-
 cpu/o3/2bit_local_pred.hh   |   2 +-
 cpu/o3/alpha_cpu.hh         |  18 +-
 cpu/o3/bpred_unit.cc        |   2 +-
 cpu/o3/bpred_unit.hh        |   7 +-
 cpu/o3/bpred_unit_impl.hh   |   6 +-
 cpu/o3/comm.hh              |   6 +-
 cpu/o3/commit.hh            |   5 +-
 cpu/o3/commit_impl.hh       |  59 +------
 cpu/o3/decode.hh            |  12 +-
 cpu/o3/decode_impl.hh       |  14 +-
 cpu/o3/fetch.hh             |  31 ++--
 cpu/o3/fetch_impl.hh        |  29 +---
 cpu/o3/lsq.hh               |  65 +++++---
 cpu/o3/lsq_impl.hh          | 138 +---------------
 cpu/o3/lsq_unit.hh          | 218 ++++++-------------------
 cpu/o3/lsq_unit_impl.hh     | 317 ++++++++++++------------------------
 cpu/o3/mem_dep_unit.hh      |   9 +-
 cpu/o3/mem_dep_unit_impl.hh |  20 +--
 cpu/o3/rename.hh            |  32 ++--
 cpu/o3/rename_impl.hh       |  35 ++--
 cpu/o3/rename_map.cc        |  81 ++-------
 cpu/o3/rename_map.hh        |   5 +-
 cpu/o3/rob.hh               |  34 ++--
 cpu/o3/rob_impl.hh          |  38 ++---
 cpu/o3/scoreboard.cc        |   1 +
 cpu/o3/store_set.cc         |   7 +-
 cpu/o3/thread_state.hh      |  95 ++++-------
 28 files changed, 381 insertions(+), 907 deletions(-)
diff --git a/cpu/o3/2bit_local_pred.cc b/cpu/o3/2bit_local_pred.cc
index eab98531d..c3fb2fdb8 100644
--- a/cpu/o3/2bit_local_pred.cc
+++ b/cpu/o3/2bit_local_pred.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/cpu/o3/2bit_local_pred.hh b/cpu/o3/2bit_local_pred.hh
index 0dfe53819..cd65978ca 100644
--- a/cpu/o3/2bit_local_pred.hh
+++ b/cpu/o3/2bit_local_pred.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/cpu/o3/alpha_cpu.hh b/cpu/o3/alpha_cpu.hh
index f70793aaa..78ad5f7d8 100644
--- a/cpu/o3/alpha_cpu.hh
+++ b/cpu/o3/alpha_cpu.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -87,7 +87,8 @@ class AlphaFullCPU : public FullO3CPU<Impl>
 
         virtual Status status() const { return thread->status(); }
 
-        virtual void setStatus(Status new_status) { thread->setStatus(new_status); }
+        virtual void setStatus(Status new_status)
+        { thread->setStatus(new_status); }
 
         /// Set the status to Active.  Optional delay indicates number of
         /// cycles to wait before beginning execution.
@@ -168,12 +169,15 @@ class AlphaFullCPU : public FullO3CPU<Impl>
         virtual Fault setMiscRegWithEffect(int misc_reg, const MiscReg &val);
 
         // @todo: Figure out where these store cond failures should go.
-        virtual unsigned readStCondFailures() { return thread->storeCondFailures; }
+        virtual unsigned readStCondFailures()
+        { return thread->storeCondFailures; }
 
-        virtual void setStCondFailures(unsigned sc_failures) { thread->storeCondFailures = sc_failures; }
+        virtual void setStCondFailures(unsigned sc_failures)
+        { thread->storeCondFailures = sc_failures; }
 
 #if FULL_SYSTEM
-        virtual bool inPalMode() { return TheISA::PcPAL(cpu->readPC(thread->tid)); }
+        virtual bool inPalMode()
+        { return TheISA::PcPAL(cpu->readPC(thread->tid)); }
 #endif
 
         // Only really makes sense for old CPU model.  Lots of code
@@ -194,10 +198,6 @@ class AlphaFullCPU : public FullO3CPU<Impl>
 #endif
     };
 
-//    friend class AlphaXC;
-
-//    std::vector<ExecContext *> xcProxies;
-
 #if FULL_SYSTEM
     /** ITB pointer. */
     AlphaITB *itb;
diff --git a/cpu/o3/bpred_unit.cc b/cpu/o3/bpred_unit.cc
index a78dcf463..92344111f 100644
--- a/cpu/o3/bpred_unit.cc
+++ b/cpu/o3/bpred_unit.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/cpu/o3/bpred_unit.hh b/cpu/o3/bpred_unit.hh
index ee7ffc183..b7814b2e9 100644
--- a/cpu/o3/bpred_unit.hh
+++ b/cpu/o3/bpred_unit.hh
@@ -43,12 +43,7 @@
 
 /**
  * Basically a wrapper class to hold both the branch predictor
- * and the BTB.  Right now I'm unsure of the implementation; it would
- * be nicer to have something closer to the CPUPolicy or the Impl where
- * this is just typedefs, but it forces the upper level stages to be
- * aware of the constructors of the BP and the BTB.  The nicer thing
- * to do is have this templated on the Impl, accept the usual Params
- * object, and be able to call the constructors on the BP and BTB.
+ * and the BTB.
  */
 template<class Impl>
 class TwobitBPredUnit
diff --git a/cpu/o3/bpred_unit_impl.hh b/cpu/o3/bpred_unit_impl.hh
index d20b31e55..c37df606b 100644
--- a/cpu/o3/bpred_unit_impl.hh
+++ b/cpu/o3/bpred_unit_impl.hh
@@ -26,13 +26,13 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <list>
+#include <vector>
+
 #include "base/trace.hh"
 #include "base/traceflags.hh"
 #include "cpu/o3/bpred_unit.hh"
 
-#include <vector>
-#include <list>
-
 using namespace std;
 
 template<class Impl>
diff --git a/cpu/o3/comm.hh b/cpu/o3/comm.hh
index 1a8f394ca..c36c58d3d 100644
--- a/cpu/o3/comm.hh
+++ b/cpu/o3/comm.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -169,10 +169,6 @@ struct TimeBufStruct {
         bool commitInsts;
         InstSeqNum squashSeqNum;
 
-        // Extra bit of information so that the LDSTQ only updates when it
-        // needs to.
-        bool commitIsLoad;
-
         // Communication specifically to the IQ to tell the IQ that it can
         // schedule a non-speculative instruction.
         InstSeqNum nonSpecSeqNum;
diff --git a/cpu/o3/commit.hh b/cpu/o3/commit.hh
index 73eccd2b0..66abf8dc6 100644
--- a/cpu/o3/commit.hh
+++ b/cpu/o3/commit.hh
@@ -30,10 +30,10 @@
 #define __CPU_O3_COMMIT_HH__
 
 #include "arch/faults.hh"
-#include "cpu/inst_seq.hh"
 #include "base/statistics.hh"
 #include "base/timebuf.hh"
 #include "cpu/exetrace.hh"
+#include "cpu/inst_seq.hh"
 #include "mem/memory_interface.hh"
 
 template <class>
@@ -59,8 +59,7 @@ class O3ThreadState;
  * squashing instruction's sequence number, and only broadcasting a
  * redirect if it corresponds to an older instruction. Commit also
  * supports multiple cycle squashing, to model a ROB that can only
- * remove a certain number of instructions per cycle. Eventually traps
- * and interrupts will most likely be handled here as well.
+ * remove a certain number of instructions per cycle.
  */
 template<class Impl>
 class DefaultCommit
diff --git a/cpu/o3/commit_impl.hh b/cpu/o3/commit_impl.hh
index 170f5b01f..346a8bc1c 100644
--- a/cpu/o3/commit_impl.hh
+++ b/cpu/o3/commit_impl.hh
@@ -27,12 +27,7 @@
  */
 
 #include <algorithm>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <iomanip>
-#include <stdio.h>
-#include <string.h>
+#include <string>
 
 #include "base/loader/symtab.hh"
 #include "base/timebuf.hh"
@@ -835,58 +830,6 @@ DefaultCommit<Impl>::commitInsts()
     unsigned num_committed = 0;
 
     DynInstPtr head_inst;
-#if FULL_SYSTEM
-    // Not the best way to check if the front end is empty, but it should
-    // work.
-    // @todo: Try to avoid directly accessing fetch.
-    if (commitStatus[0] == FetchTrapPending && rob->isEmpty()) {
-        DPRINTF(Commit, "Fault from fetch is pending.\n");
-
-        fetchTrapWait++;
-        if (fetchTrapWait > 10000000) {
-            panic("Fetch trap has been pending for a long time!");
-        }
-        if (fetchFaultTick > curTick) {
-            DPRINTF(Commit, "Not enough cycles since fault, fault will "
-                    "happen on %lli\n",
-                    fetchFaultTick);
-            cpu->activityThisCycle();
-            return;
-        } else if (iewStage->hasStoresToWB()) {
-            DPRINTF(Commit, "IEW still has stores to WB.  Waiting until "
-                    "they are completed. fetchTrapWait:%i\n",
-                    fetchTrapWait);
-            cpu->activityThisCycle();
-            return;
-        } else if (cpu->inPalMode(readPC())) {
-            DPRINTF(Commit, "In pal mode right now. fetchTrapWait:%i\n",
-                    fetchTrapWait);
-            return;
-        } else if (fetchStage->getYoungestSN() > youngestSeqNum[0]) {
-            DPRINTF(Commit, "Waiting for front end to drain. fetchTrapWait:%i\n",
-                    fetchTrapWait);
-            return;
-        }
-        fetchTrapWait = 0;
-        DPRINTF(Commit, "ROB is empty, handling fetch trap.\n");
-
-        assert(!thread[0]->inSyscall);
-
-        thread[0]->inSyscall = true;
-
-        // Consider holding onto the trap and waiting until the trap event
-        // happens for this to be executed.
-        cpu->trap(fetchFault, 0);
-
-        // Exit state update mode to avoid accidental updating.
-        thread[0]->inSyscall = false;
-
-        commitStatus[0] = TrapPending;
-        // Set it up so that we squash next cycle
-        trapSquash[0] = true;
-        return;
-    }
-#endif
 
     // Commit as many instructions as possible until the commit bandwidth
     // limit is reached, or it becomes impossible to commit any more.
diff --git a/cpu/o3/decode.hh b/cpu/o3/decode.hh
index 3f3f68247..3035b3387 100644
--- a/cpu/o3/decode.hh
+++ b/cpu/o3/decode.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -35,11 +35,11 @@
 #include "base/timebuf.hh"
 
 /**
- * DefaultDecode class handles both single threaded and SMT decode. Its width is
- * specified by the parameters; each cycles it tries to decode that many
- * instructions. Because instructions are actually decoded when the StaticInst
- * is created, this stage does not do much other than check any PC-relative
- * branches.
+ * DefaultDecode class handles both single threaded and SMT
+ * decode. Its width is specified by the parameters; each cycles it
+ * tries to decode that many instructions. Because instructions are
+ * actually decoded when the StaticInst is created, this stage does
+ * not do much other than check any PC-relative branches.
  */
 template<class Impl>
 class DefaultDecode
diff --git a/cpu/o3/decode_impl.hh b/cpu/o3/decode_impl.hh
index a419a8932..2ed7ec6fc 100644
--- a/cpu/o3/decode_impl.hh
+++ b/cpu/o3/decode_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -39,7 +39,6 @@ DefaultDecode<Impl>::DefaultDecode(Params *params)
       decodeWidth(params->decodeWidth),
       numThreads(params->numberOfThreads)
 {
-    DPRINTF(Decode, "decodeWidth=%i.\n", decodeWidth);
     _status = Inactive;
 
     for (int i = 0; i < numThreads; ++i) {
@@ -249,8 +248,6 @@ template<class Impl>
 bool
 DefaultDecode<Impl>::unblock(unsigned tid)
 {
-    DPRINTF(Decode, "[tid:%u]: Trying to unblock.\n", tid);
-
     // Decode is done unblocking only if the skid buffer is empty.
     if (skidBuffer[tid].empty()) {
         DPRINTF(Decode, "[tid:%u]: Done unblocking.\n", tid);
@@ -261,6 +258,8 @@ DefaultDecode<Impl>::unblock(unsigned tid)
         return true;
     }
 
+    DPRINTF(Decode, "[tid:%u]: Currently unblocking.\n", tid);
+
     return false;
 }
 
@@ -318,6 +317,7 @@ DefaultDecode<Impl>::squash(unsigned tid)
         // In syscall emulation, we can have both a block and a squash due
         // to a syscall in the same cycle.  This would cause both signals to
         // be high.  This shouldn't happen in full system.
+        // @todo: Determine if this still happens.
         if (toFetch->decodeBlock[tid]) {
             toFetch->decodeBlock[tid] = 0;
         } else {
@@ -372,7 +372,7 @@ DefaultDecode<Impl>::skidInsert(unsigned tid)
         skidBuffer[tid].push(inst);
     }
 
-    // Eventually need to enforce this by not letting a thread
+    // @todo: Eventually need to enforce this by not letting a thread
     // fetch past its skidbuffer
     assert(skidBuffer[tid].size() <= skidBufferMax);
 }
@@ -436,10 +436,10 @@ void
 DefaultDecode<Impl>::sortInsts()
 {
     int insts_from_fetch = fromFetch->size;
-
+#ifdef DEBUG
     for (int i=0; i < numThreads; i++)
         assert(insts[i].empty());
-
+#endif
     for (int i = 0; i < insts_from_fetch; ++i) {
         insts[fromFetch->insts[i]->threadNumber].push(fromFetch->insts[i]);
     }
diff --git a/cpu/o3/fetch.hh b/cpu/o3/fetch.hh
index b03d4afe3..3fcfdc3a1 100644
--- a/cpu/o3/fetch.hh
+++ b/cpu/o3/fetch.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -38,12 +38,12 @@
 class Sampler;
 
 /**
- * DefaultFetch class handles both single threaded and SMT fetch. Its width is
- * specified by the parameters; each cycle it tries to fetch that many
- * instructions. It supports using a branch predictor to predict direction and
- * targets.
- * It supports the idling functionalitiy of the CPU by indicating to the CPU
- * when it is active and inactive.
+ * DefaultFetch class handles both single threaded and SMT fetch. Its
+ * width is specified by the parameters; each cycle it tries to fetch
+ * that many instructions. It supports using a branch predictor to
+ * predict direction and targets.
+ * It supports the idling functionalitiy of the CPU by indicating to
+ * the CPU when it is active and inactive.
  */
 template <class Impl>
 class DefaultFetch
@@ -66,8 +66,8 @@ class DefaultFetch
     typedef TheISA::ExtMachInst ExtMachInst;
 
   public:
-    /** Overall fetch status. Used to determine if the CPU can deschedule itsef
-     * due to a lack of activity.
+    /** Overall fetch status. Used to determine if the CPU can
+     * deschedule itsef due to a lack of activity.
      */
     enum FetchStatus {
         Active,
@@ -174,13 +174,13 @@ class DefaultFetch
     void wakeFromQuiesce();
 
   private:
-    /** Changes the status of this stage to active, and indicates this to the
-     * CPU.
+    /** Changes the status of this stage to active, and indicates this
+     * to the CPU.
      */
     inline void switchToActive();
 
-    /** Changes the status of this stage to inactive, and indicates this to the
-     * CPU.
+    /** Changes the status of this stage to inactive, and indicates
+     * this to the CPU.
      */
     inline void switchToInactive();
 
@@ -373,11 +373,6 @@ class DefaultFetch
 
     bool switchedOut;
 
-  public:
-    InstSeqNum &getYoungestSN() { return youngestSN; }
-  private:
-    InstSeqNum youngestSN;
-
 #if !FULL_SYSTEM
     /** Page table pointer. */
 //    PageTable *pTable;
diff --git a/cpu/o3/fetch_impl.hh b/cpu/o3/fetch_impl.hh
index 523719945..1c5e508f6 100644
--- a/cpu/o3/fetch_impl.hh
+++ b/cpu/o3/fetch_impl.hh
@@ -938,10 +938,6 @@ DefaultFetch<Impl>::fetch(bool &status_change)
         DPRINTF(Fetch, "[tid:%i]: Adding instructions to queue to "
                 "decode.\n",tid);
 
-        //////////////////////////
-        // Fetch first instruction
-        //////////////////////////
-
         // Need to keep track of whether or not a predicted branch
         // ended this fetch block.
         bool predicted_branch = false;
@@ -1004,7 +1000,8 @@ DefaultFetch<Impl>::fetch(bool &status_change)
             fetch_PC = next_PC;
 
             if (instruction->isQuiesce()) {
-                warn("%lli: Quiesce instruction encountered, halting fetch!", curTick);
+                warn("%lli: Quiesce instruction encountered, halting fetch!",
+                     curTick);
                 fetchStatus[tid] = QuiescePending;
                 ++numInst;
                 status_change = true;
@@ -1022,24 +1019,20 @@ DefaultFetch<Impl>::fetch(bool &status_change)
     // Now that fetching is completed, update the PC to signify what the next
     // cycle will be.
     if (fault == NoFault) {
-
         DPRINTF(Fetch, "[tid:%i]: Setting PC to %08p.\n",tid, next_PC);
 
-
         PC[tid] = next_PC;
         nextPC[tid] = next_PC + instSize;
     } else {
-        // If the issue was an icache miss, then we can just return and
-        // wait until it is handled.
+        // We shouldn't be in an icache miss and also have a fault (an ITB
+        // miss)
         if (fetchStatus[tid] == IcacheMissStall) {
             panic("Fetch should have exited prior to this!");
         }
 
-        // Handle the fault.
-        // This stage will not be able to continue until all the ROB
-        // slots are empty, at which point the fault can be handled.
-        // The only other way it can wake up is if a squash comes along
-        // and changes the PC.
+        // Send the fault to commit.  This thread will not do anything
+        // until commit handles the fault.  The only other way it can
+        // wake up is if a squash comes along and changes the PC.
 #if FULL_SYSTEM
         assert(numInst != fetchWidth);
         // Get a sequence number.
@@ -1067,20 +1060,12 @@ DefaultFetch<Impl>::fetch(bool &status_change)
         toDecode->insts[numInst] = instruction;
         toDecode->size++;
 
-        // Tell the commit stage the fault we had.
-//        toDecode->fetchFault = fault;
-//        toDecode->fetchFaultSN = cpu->globalSeqNum;
-
         DPRINTF(Fetch, "[tid:%i]: Blocked, need to handle the trap.\n",tid);
 
         fetchStatus[tid] = TrapPending;
         status_change = true;
 
         warn("%lli fault (%d) detected @ PC %08p", curTick, fault, PC[tid]);
-//        cpu->trap(fault);
-        // Send a signal to the ROB indicating that there's a trap from the
-        // fetch stage that needs to be handled.  Need to indicate that
-        // there's a fault, and the fault type.
 #else // !FULL_SYSTEM
         fatal("fault (%d) detected @ PC %08p", fault, PC[tid]);
 #endif // FULL_SYSTEM
diff --git a/cpu/o3/lsq.hh b/cpu/o3/lsq.hh
index d5f893e57..a1eeccbe7 100644
--- a/cpu/o3/lsq.hh
+++ b/cpu/o3/lsq.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -32,10 +32,9 @@
 #include <map>
 #include <queue>
 
-#include "base/hashmap.hh"
 #include "config/full_system.hh"
 #include "cpu/inst_seq.hh"
-#include "cpu/o3/cpu_policy.hh"
+//#include "cpu/o3/cpu_policy.hh"
 #include "cpu/o3/lsq_unit.hh"
 #include "mem/mem_interface.hh"
 //#include "mem/page_table.hh"
@@ -85,7 +84,8 @@ class LSQ {
     /** Ticks the LSQ. */
     void tick();
     /** Ticks a specific LSQ Unit. */
-    void tick(unsigned tid);
+    void tick(unsigned tid)
+    { thread[tid].tick(); }
 
     /** Inserts a load into the LSQ. */
     void insertLoad(DynInstPtr &load_inst);
@@ -95,18 +95,23 @@ class LSQ {
     /** Executes a load. */
     Fault executeLoad(DynInstPtr &inst);
 
-    Fault executeLoad(int lq_idx, unsigned tid);
+    Fault executeLoad(int lq_idx, unsigned tid)
+    { return thread[tid].executeLoad(lq_idx); }
+
     /** Executes a store. */
     Fault executeStore(DynInstPtr &inst);
 
     /**
      * Commits loads up until the given sequence number for a specific thread.
      */
-    void commitLoads(InstSeqNum &youngest_inst, unsigned tid);
+    void commitLoads(InstSeqNum &youngest_inst, unsigned tid)
+    { thread[tid].commitLoads(youngest_inst); }
+
     /**
      * Commits stores up until the given sequence number for a specific thread.
      */
-    void commitStores(InstSeqNum &youngest_inst, unsigned tid);
+    void commitStores(InstSeqNum &youngest_inst, unsigned tid)
+    { thread[tid].commitStores(youngest_inst); }
 
     /**
      * Attempts to write back stores until all cache ports are used or the
@@ -119,7 +124,8 @@ class LSQ {
     /**
      * Squash instructions from a thread until the specified sequence number.
      */
-    void squash(const InstSeqNum &squashed_num, unsigned tid);
+    void squash(const InstSeqNum &squashed_num, unsigned tid)
+    { thread[tid].squash(squashed_num); }
 
     /** Returns whether or not there was a memory ordering violation. */
     bool violation();
@@ -127,12 +133,14 @@ class LSQ {
      * Returns whether or not there was a memory ordering violation for a
      * specific thread.
      */
-    bool violation(unsigned tid);
+    bool violation(unsigned tid)
+    { return thread[tid].violation(); }
 
     /** Returns if a load is blocked due to the memory system for a specific
      *  thread.
      */
-    bool loadBlocked(unsigned tid);
+    bool loadBlocked(unsigned tid)
+    { return thread[tid].loadBlocked(); }
 
     bool isLoadBlockedHandled(unsigned tid)
     { return thread[tid].isLoadBlockedHandled(); }
@@ -141,10 +149,13 @@ class LSQ {
     { thread[tid].setLoadBlockedHandled(); }
 
     /** Gets the instruction that caused the memory ordering violation. */
-    DynInstPtr getMemDepViolator(unsigned tid);
+    DynInstPtr getMemDepViolator(unsigned tid)
+    { return thread[tid].getMemDepViolator(); }
 
     /** Returns the head index of the load queue for a specific thread. */
-    int getLoadHead(unsigned tid);
+    int getLoadHead(unsigned tid)
+    { return thread[tid].getLoadHead(); }
+
     /** Returns the sequence number of the head of the load queue. */
     InstSeqNum getLoadHeadSeqNum(unsigned tid)
     {
@@ -152,7 +163,9 @@ class LSQ {
     }
 
     /** Returns the head index of the store queue. */
-    int getStoreHead(unsigned tid);
+    int getStoreHead(unsigned tid)
+    { return thread[tid].getStoreHead(); }
+
     /** Returns the sequence number of the head of the store queue. */
     InstSeqNum getStoreHeadSeqNum(unsigned tid)
     {
@@ -162,22 +175,26 @@ class LSQ {
     /** Returns the number of instructions in all of the queues. */
     int getCount();
     /** Returns the number of instructions in the queues of one thread. */
-    int getCount(unsigned tid);
+    int getCount(unsigned tid)
+    { return thread[tid].getCount(); }
 
     /** Returns the total number of loads in the load queue. */
     int numLoads();
     /** Returns the total number of loads for a single thread. */
-    int numLoads(unsigned tid);
+    int numLoads(unsigned tid)
+    { return thread[tid].numLoads(); }
 
     /** Returns the total number of stores in the store queue. */
     int numStores();
     /** Returns the total number of stores for a single thread. */
-    int numStores(unsigned tid);
+    int numStores(unsigned tid)
+    { return thread[tid].numStores(); }
 
     /** Returns the total number of loads that are ready. */
     int numLoadsReady();
     /** Returns the number of loads that are ready for a single thread. */
-    int numLoadsReady(unsigned tid);
+    int numLoadsReady(unsigned tid)
+    { return thread[tid].numLoadsReady(); }
 
     /** Returns the number of free entries. */
     unsigned numFreeEntries();
@@ -215,24 +232,30 @@ class LSQ {
 
     /** Returns whether or not there are any stores to write back to memory. */
     bool hasStoresToWB();
+
     /** Returns whether or not a specific thread has any stores to write back
      * to memory.
      */
-    bool hasStoresToWB(unsigned tid);
+    bool hasStoresToWB(unsigned tid)
+    { return thread[tid].hasStoresToWB(); }
+
     /** Returns the number of stores a specific thread has to write back. */
-    int  numStoresToWB(unsigned tid);
+    int  numStoresToWB(unsigned tid)
+    { return thread[tid].numStoresToWB(); }
 
     /** Returns if the LSQ will write back to memory this cycle. */
     bool willWB();
     /** Returns if the LSQ of a specific thread will write back to memory this
      * cycle.
      */
-    bool willWB(unsigned tid);
+    bool willWB(unsigned tid)
+    { return thread[tid].willWB(); }
 
     /** Debugging function to print out all instructions. */
     void dumpInsts();
     /** Debugging function to print out instructions from a specific thread. */
-    void dumpInsts(unsigned tid);
+    void dumpInsts(unsigned tid)
+    { thread[tid].dumpInsts(); }
 
     /** Executes a read operation, using the load specified at the load index. */
     template <class T>
diff --git a/cpu/o3/lsq_impl.hh b/cpu/o3/lsq_impl.hh
index c43c19619..a6ad27522 100644
--- a/cpu/o3/lsq_impl.hh
+++ b/cpu/o3/lsq_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -26,6 +26,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <algorithm>
+#include <string>
+
 #include "cpu/o3/lsq.hh"
 
 using namespace std;
@@ -89,7 +92,7 @@ LSQ<Impl>::LSQ(Params *params)
 
     //Initialize LSQs
     for (int tid=0; tid < numThreads; tid++) {
-        thread[tid].init(params, maxLQEntries+1, maxSQEntries+1, tid);
+        thread[tid].init(params, maxLQEntries, maxSQEntries, tid);
     }
 }
 
@@ -226,13 +229,6 @@ LSQ<Impl>::tick()
     }
 }
 
-template<class Impl>
-void
-LSQ<Impl>::tick(unsigned tid)
-{
-    thread[tid].tick();
-}
-
 template<class Impl>
 void
 LSQ<Impl>::insertLoad(DynInstPtr &load_inst)
@@ -260,13 +256,6 @@ LSQ<Impl>::executeLoad(DynInstPtr &inst)
     return thread[tid].executeLoad(inst);
 }
 
-template<class Impl>
-Fault
-LSQ<Impl>::executeLoad(int lq_idx, unsigned tid)
-{
-    return thread[tid].executeLoad(lq_idx);
-}
-
 template<class Impl>
 Fault
 LSQ<Impl>::executeStore(DynInstPtr &inst)
@@ -276,20 +265,6 @@ LSQ<Impl>::executeStore(DynInstPtr &inst)
     return thread[tid].executeStore(inst);
 }
 
-template<class Impl>
-void
-LSQ<Impl>::commitLoads(InstSeqNum &youngest_inst,unsigned tid)
-{
-    thread[tid].commitLoads(youngest_inst);
-}
-
-template<class Impl>
-void
-LSQ<Impl>::commitStores(InstSeqNum &youngest_inst,unsigned tid)
-{
-    thread[tid].commitStores(youngest_inst);
-}
-
 template<class Impl>
 void
 LSQ<Impl>::writebackStores()
@@ -300,28 +275,14 @@ LSQ<Impl>::writebackStores()
         unsigned tid = *active_threads++;
 
         if (numStoresToWB(tid) > 0) {
-            DPRINTF(Writeback,"[tid:%i] Writing back stores. %i stores available"
-                " for Writeback.\n", tid, numStoresToWB(tid));
+            DPRINTF(Writeback,"[tid:%i] Writing back stores. %i stores "
+                "available for Writeback.\n", tid, numStoresToWB(tid));
         }
 
         thread[tid].writebackStores();
     }
 }
 
-template<class Impl>
-int
-LSQ<Impl>::numStoresToWB(unsigned tid)
-{
-    return thread[tid].numStoresToWB();
-}
-
-template<class Impl>
-void
-LSQ<Impl>::squash(const InstSeqNum &squashed_num, unsigned tid)
-{
-        thread[tid].squash(squashed_num);
-}
-
 template<class Impl>
 bool
 LSQ<Impl>::violation()
@@ -338,41 +299,6 @@ LSQ<Impl>::violation()
     return false;
 }
 
-template<class Impl>
-bool
-LSQ<Impl>::violation(unsigned tid)
-{
-    return thread[tid].violation();
-}
-
-template<class Impl>
-bool
-LSQ<Impl>::loadBlocked(unsigned tid)
-{
-    return thread[tid].loadBlocked();
-}
-
-template<class Impl>
-typename Impl::DynInstPtr
-LSQ<Impl>::getMemDepViolator(unsigned tid)
-{
-    return thread[tid].getMemDepViolator();
-}
-
-template<class Impl>
-int
-LSQ<Impl>::getLoadHead(unsigned tid)
-{
-    return thread[tid].getLoadHead();
-}
-
-template<class Impl>
-int
-LSQ<Impl>::getStoreHead(unsigned tid)
-{
-    return thread[tid].getStoreHead();
-}
-
 template<class Impl>
 int
 LSQ<Impl>::getCount()
@@ -389,13 +315,6 @@ LSQ<Impl>::getCount()
     return total;
 }
 
-template<class Impl>
-int
-LSQ<Impl>::getCount(unsigned tid)
-{
-    return thread[tid].getCount();
-}
-
 template<class Impl>
 int
 LSQ<Impl>::numLoads()
@@ -412,13 +331,6 @@ LSQ<Impl>::numLoads()
     return total;
 }
 
-template<class Impl>
-int
-LSQ<Impl>::numLoads(unsigned tid)
-{
-    return thread[tid].numLoads();
-}
-
 template<class Impl>
 int
 LSQ<Impl>::numStores()
@@ -435,13 +347,6 @@ LSQ<Impl>::numStores()
     return total;
 }
 
-template<class Impl>
-int
-LSQ<Impl>::numStores(unsigned tid)
-{
-    return thread[tid].numStores();
-}
-
 template<class Impl>
 int
 LSQ<Impl>::numLoadsReady()
@@ -458,13 +363,6 @@ LSQ<Impl>::numLoadsReady()
     return total;
 }
 
-template<class Impl>
-int
-LSQ<Impl>::numLoadsReady(unsigned tid)
-{
-    return thread[tid].numLoadsReady();
-}
-
 template<class Impl>
 unsigned
 LSQ<Impl>::numFreeEntries()
@@ -612,14 +510,6 @@ LSQ<Impl>::hasStoresToWB()
     return true;
 }
 
-
-template<class Impl>
-bool
-LSQ<Impl>::hasStoresToWB(unsigned tid)
-{
-    return thread[tid].hasStoresToWB();
-}
-
 template<class Impl>
 bool
 LSQ<Impl>::willWB()
@@ -635,13 +525,6 @@ LSQ<Impl>::willWB()
     return true;
 }
 
-template<class Impl>
-bool
-LSQ<Impl>::willWB(unsigned tid)
-{
-    return thread[tid].willWB();
-}
-
 template<class Impl>
 void
 LSQ<Impl>::dumpInsts()
@@ -653,10 +536,3 @@ LSQ<Impl>::dumpInsts()
         thread[tid].dumpInsts();
     }
 }
-
-template<class Impl>
-void
-LSQ<Impl>::dumpInsts(unsigned tid)
-{
-    thread[tid].dumpInsts();
-}
diff --git a/cpu/o3/lsq_unit.hh b/cpu/o3/lsq_unit.hh
index 623dbdb4b..942b4583d 100644
--- a/cpu/o3/lsq_unit.hh
+++ b/cpu/o3/lsq_unit.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -29,29 +29,30 @@
 #ifndef __CPU_O3_LSQ_UNIT_HH__
 #define __CPU_O3_LSQ_UNIT_HH__
 
+#include <algorithm>
 #include <map>
 #include <queue>
-#include <algorithm>
 
+#include "arch/faults.hh"
 #include "config/full_system.hh"
 #include "base/hashmap.hh"
 #include "cpu/inst_seq.hh"
 #include "mem/mem_interface.hh"
 //#include "mem/page_table.hh"
-#include "sim/debug.hh"
-#include "sim/sim_object.hh"
-#include "arch/faults.hh"
+//#include "sim/debug.hh"
+//#include "sim/sim_object.hh"
 
 /**
- * Class that implements the actual LQ and SQ for each specific thread.
- * Both are circular queues; load entries are freed upon committing, while
- * store entries are freed once they writeback. The LSQUnit tracks if there
- * are memory ordering violations, and also detects partial load to store
- * forwarding cases (a store only has part of a load's data) that requires
- * the load to wait until the store writes back. In the former case it
- * holds onto the instruction until the dependence unit looks at it, and
- * in the latter it stalls the LSQ until the store writes back. At that
- * point the load is replayed.
+ * Class that implements the actual LQ and SQ for each specific
+ * thread.  Both are circular queues; load entries are freed upon
+ * committing, while store entries are freed once they writeback. The
+ * LSQUnit tracks if there are memory ordering violations, and also
+ * detects partial load to store forwarding cases (a store only has
+ * part of a load's data) that requires the load to wait until the
+ * store writes back. In the former case it holds onto the instruction
+ * until the dependence unit looks at it, and in the latter it stalls
+ * the LSQ until the store writes back. At that point the load is
+ * replayed.
  */
 template <class Impl>
 class LSQUnit {
@@ -76,21 +77,19 @@ class LSQUnit {
         /** Returns the description of this event. */
         const char *description();
 
-      private:
-        /** The store index of the store being written back. */
-        int storeIdx;
         /** The writeback event for the store.  Needed for store
          * conditionals.
          */
-      public:
         Event *wbEvent;
+
+      private:
+        /** The store index of the store being written back. */
+        int storeIdx;
       private:
         /** The pointer to the LSQ unit that issued the store. */
         LSQUnit<Impl> *lsqPtr;
     };
 
-    friend class StoreCompletionEvent;
-
   public:
     /** Constructs an LSQ unit. init() must be called prior to use. */
     LSQUnit();
@@ -136,14 +135,12 @@ class LSQUnit {
     /** Executes a load instruction. */
     Fault executeLoad(DynInstPtr &inst);
 
-    Fault executeLoad(int lq_idx);
+    Fault executeLoad(int lq_idx) { panic("Not implemented"); return NoFault; }
     /** Executes a store instruction. */
     Fault executeStore(DynInstPtr &inst);
 
     /** Commits the head load. */
     void commitLoad();
-    /** Commits a specific load, given by the sequence number. */
-    void commitLoad(InstSeqNum &inst);
     /** Commits loads older than a specific sequence number. */
     void commitLoads(InstSeqNum &youngest_inst);
 
@@ -179,9 +176,7 @@ class LSQUnit {
     /** Returns the memory ordering violator. */
     DynInstPtr getMemDepViolator();
 
-    /** Returns if a load became blocked due to the memory system.  It clears
-     *  the bool's value upon this being called.
-     */
+    /** Returns if a load became blocked due to the memory system. */
     bool loadBlocked()
     { return isLoadBlocked; }
 
@@ -215,9 +210,6 @@ class LSQUnit {
     /** Returns if the SQ is full. */
     bool sqFull() { return stores >= (SQEntries - 1); }
 
-    /** Debugging function to dump instructions in the LSQ. */
-    void dumpInsts();
-
     /** Returns the number of instructions in the LSQ. */
     unsigned getCount() { return loads + stores; }
 
@@ -245,6 +237,10 @@ class LSQUnit {
     /** Decrements the given load index (circular queue). */
     inline void decrLdIdx(int &load_idx);
 
+  public:
+    /** Debugging function to dump instructions in the LSQ. */
+    void dumpInsts();
+
   private:
     /** Pointer to the CPU. */
     FullCPU *cpu;
@@ -287,38 +283,29 @@ class LSQUnit {
         /** Whether or not the store is completed. */
         bool completed;
     };
-/*
-    enum Status {
-        Running,
-        Idle,
-        DcacheMissStall,
-        DcacheMissSwitch
-    };
-*/
+
   private:
     /** The LSQUnit thread id. */
     unsigned lsqID;
 
-    /** The status of the LSQ unit. */
-//    Status _status;
-
     /** The store queue. */
     std::vector<SQEntry> storeQueue;
 
     /** The load queue. */
     std::vector<DynInstPtr> loadQueue;
 
-    // Consider making these 16 bits
-    /** The number of LQ entries. */
+    /** The number of LQ entries, plus a sentinel entry (circular queue).
+     *  @todo: Consider having var that records the true number of LQ entries.
+     */
     unsigned LQEntries;
-    /** The number of SQ entries. */
+    /** The number of SQ entries, plus a sentinel entry (circular queue).
+     *  @todo: Consider having var that records the true number of SQ entries.
+     */
     unsigned SQEntries;
 
     /** The number of load instructions in the LQ. */
     int loads;
-    /** The number of store instructions in the SQ (excludes those waiting to
-     * writeback).
-     */
+    /** The number of store instructions in the SQ. */
     int stores;
     /** The number of store instructions in the SQ waiting to writeback. */
     int storesToWB;
@@ -330,8 +317,8 @@ class LSQUnit {
 
     /** The index of the head instruction in the SQ. */
     int storeHead;
-    /** The index of the first instruction that is ready to be written back,
-     * and has not yet been written back.
+    /** The index of the first instruction that may be ready to be
+     * written back, and has not yet been written back.
      */
     int storeWBIdx;
     /** The index of the tail instruction in the SQ. */
@@ -348,13 +335,9 @@ class LSQUnit {
 
     //list<InstSeqNum> mshrSeqNums;
 
-     //Stats::Scalar<> dcacheStallCycles;
-    Counter lastDcacheStall;
-
     /** Wire to read information from the issue stage time queue. */
     typename TimeBuffer<IssueStruct>::wire fromIssue;
 
-    // Make these per thread?
     /** Whether or not the LSQ is stalled. */
     bool stalled;
     /** The store that causes the stall due to partial store to load
@@ -364,20 +347,13 @@ class LSQUnit {
     /** The index of the above store. */
     int stallingLoadIdx;
 
-    /** Whether or not a load is blocked due to the memory system.  It is
-     *  cleared when this value is checked via loadBlocked().
-     */
+    /** Whether or not a load is blocked due to the memory system. */
     bool isLoadBlocked;
 
     bool loadBlockedHandled;
 
     InstSeqNum blockedLoadSeqNum;
 
-    /** The oldest faulting load instruction. */
-    DynInstPtr loadFaultInst;
-    /** The oldest faulting store instruction. */
-    DynInstPtr storeFaultInst;
-
     /** The oldest load that caused a memory ordering violation. */
     DynInstPtr memDepViolator;
 
@@ -447,23 +423,14 @@ template <class T>
 Fault
 LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
 {
-    //Depending on issue2execute delay a squashed load could
-    //execute if it is found to be squashed in the same
-    //cycle it is scheduled to execute
     assert(loadQueue[load_idx]);
 
-    if (loadQueue[load_idx]->isExecuted()) {
-        panic("Should not reach this point with split ops!");
-        memcpy(&data,req->data,req->size);
-
-        return NoFault;
-    }
+    assert(!loadQueue[load_idx]->isExecuted());
 
     // Make sure this isn't an uncacheable access
     // A bit of a hackish way to get uncached accesses to work only if they're
     // at the head of the LSQ and are ready to commit (at the head of the ROB
     // too).
-    // @todo: Fix uncached accesses.
     if (req->flags & UNCACHEABLE &&
         (load_idx != loadHead || !loadQueue[load_idx]->reachedCommit)) {
         iewStage->rescheduleMemInst(loadQueue[load_idx]);
@@ -479,12 +446,16 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
             "storeHead: %i addr: %#x\n",
             load_idx, store_idx, storeHead, req->paddr);
 
-#ifdef FULL_SYSTEM
+#if 0
     if (req->flags & LOCKED) {
         cpu->lockAddr = req->paddr;
         cpu->lockFlag = true;
     }
 #endif
+            req->cmd = Read;
+            assert(!req->completionEvent);
+            req->completionEvent = NULL;
+            req->time = curTick;
 
     while (store_idx != -1) {
         // End once we've reached the top of the LSQ
@@ -518,18 +489,14 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
 
         // If the store's data has all of the data needed, we can forward.
         if (store_has_lower_limit && store_has_upper_limit) {
-
+            // Get shift amount for offset into the store's data.
             int shift_amt = req->vaddr & (store_size - 1);
-            // Assumes byte addressing
+            // @todo: Magic number, assumes byte addressing
             shift_amt = shift_amt << 3;
 
             // Cast this to type T?
             data = storeQueue[store_idx].data >> shift_amt;
 
-            req->cmd = Read;
-            assert(!req->completionEvent);
-            req->completionEvent = NULL;
-            req->time = curTick;
             assert(!req->data);
             req->data = new uint8_t[64];
 
@@ -579,7 +546,6 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
 
             // Do not generate a writeback event as this instruction is not
             // complete.
-
             DPRINTF(LSQUnit, "Load-store forwarding mis-match. "
                     "Store idx %i to load addr %#x\n",
                     store_idx, req->vaddr);
@@ -588,16 +554,13 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
         }
     }
 
-
     // If there's no forwarding case, then go access memory
     DynInstPtr inst = loadQueue[load_idx];
 
-    DPRINTF(LSQUnit, "Doing functional access for inst PC %#x\n",
-            loadQueue[load_idx]->readPC());
+    DPRINTF(LSQUnit, "Doing functional access for inst [sn:%lli] PC %#x\n",
+            loadQueue[load_idx]->seqNum, loadQueue[load_idx]->readPC());
+
     assert(!req->data);
-    req->cmd = Read;
-    req->completionEvent = NULL;
-    req->time = curTick;
     req->data = new uint8_t[64];
     Fault fault = cpu->read(req, data);
     memcpy(req->data, &data, sizeof(T));
@@ -611,20 +574,19 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
             if (isLoadBlocked && blockedLoadSeqNum < inst->seqNum)
                 return NoFault;
 
+            // Record that the load was blocked due to memory.  This
+            // load will squash all instructions after it, be
+            // refetched, and re-executed.
             isLoadBlocked = true;
             loadBlockedHandled = false;
             blockedLoadSeqNum = inst->seqNum;
             // No fault occurred, even though the interface is blocked.
             return NoFault;
         }
+
         DPRINTF(LSQUnit, "Doing timing access for inst PC %#x\n",
                 loadQueue[load_idx]->readPC());
-/*
-        Addr debug_addr = ULL(0xfffffc0000be81a8);
-        if (req->vaddr == debug_addr) {
-            debug_break();
-        }
-*/
+
         assert(!req->completionEvent);
         req->completionEvent =
             new typename IEW::LdWritebackEvent(loadQueue[load_idx], iewStage);
@@ -632,75 +594,16 @@ LSQUnit<Impl>::read(MemReqPtr &req, T &data, int load_idx)
 
         assert(dcacheInterface->doEvents());
 
-        // Ugly hack to get an event scheduled *only* if the access is
-        // a miss.  We really should add first-class support for this
-        // at some point.
         if (result != MA_HIT) {
             DPRINTF(LSQUnit, "LSQUnit: D-cache miss!\n");
             DPRINTF(Activity, "Activity: ld accessing mem miss [sn:%lli]\n",
                     inst->seqNum);
-
-            lastDcacheStall = curTick;
-
-//            _status = DcacheMissStall;
-
         } else {
-            DPRINTF(Activity, "Activity: ld accessing mem hit [sn:%lli]\n",
-                    inst->seqNum);
-
             DPRINTF(LSQUnit, "LSQUnit: D-cache hit!\n");
-        }
-    }
-#if 0
-    // if we have a cache, do cache access too
-    if (dcacheInterface) {
-        if (dcacheInterface->isBlocked()) {
-            isLoadBlocked = true;
-            // No fault occurred, even though the interface is blocked.
-            return NoFault;
-        }
-
-        DPRINTF(LSQUnit, "LSQUnit: D-cache: PC:%#x reading from paddr:%#x "
-                "vaddr:%#x flags:%i\n",
-                inst->readPC(), req->paddr, req->vaddr, req->flags);
-
-        // Setup MemReq pointer
-        req->cmd = Read;
-        req->completionEvent = NULL;
-        req->time = curTick;
-        assert(!req->data);
-        req->data = new uint8_t[64];
-
-        assert(!req->completionEvent);
-        req->completionEvent =
-            new typename IEW::LdWritebackEvent(loadQueue[load_idx], iewStage);
-
-        // Do Cache Access
-        MemAccessResult result = dcacheInterface->access(req);
-
-        // Ugly hack to get an event scheduled *only* if the access is
-        // a miss.  We really should add first-class support for this
-        // at some point.
-        // @todo: Probably should support having no events
-        if (result != MA_HIT) {
-            DPRINTF(LSQUnit, "LSQUnit: D-cache miss!\n");
-            DPRINTF(Activity, "Activity: ld accessing mem miss [sn:%lli]\n",
-                    inst->seqNum);
-
-            lastDcacheStall = curTick;
-
-            _status = DcacheMissStall;
-
-        } else {
             DPRINTF(Activity, "Activity: ld accessing mem hit [sn:%lli]\n",
                     inst->seqNum);
-
-            DPRINTF(LSQUnit, "LSQUnit: D-cache hit!\n");
         }
-    } else {
-        fatal("Must use D-cache with new memory system");
     }
-#endif
 
     return fault;
 }
@@ -716,24 +619,11 @@ LSQUnit<Impl>::write(MemReqPtr &req, T &data, int store_idx)
             " | storeHead:%i [sn:%i]\n",
             store_idx, req->paddr, data, storeHead,
             storeQueue[store_idx].inst->seqNum);
-/*
-    if (req->flags & LOCKED) {
-        if (req->flags & UNCACHEABLE) {
-            req->result = 2;
-        } else {
-            req->result = 1;
-        }
-    }
-*/
+
     storeQueue[store_idx].req = req;
     storeQueue[store_idx].size = sizeof(T);
     storeQueue[store_idx].data = data;
-/*
-    Addr debug_addr = ULL(0xfffffc0000be81a8);
-    if (req->vaddr == debug_addr) {
-        debug_break();
-    }
-*/
+
     // This function only writes the data to the store queue, so no fault
     // can happen here.
     return NoFault;
diff --git a/cpu/o3/lsq_unit_impl.hh b/cpu/o3/lsq_unit_impl.hh
index dca808ac9..f0b4405ed 100644
--- a/cpu/o3/lsq_unit_impl.hh
+++ b/cpu/o3/lsq_unit_impl.hh
@@ -35,8 +35,8 @@ LSQUnit<Impl>::StoreCompletionEvent::StoreCompletionEvent(int store_idx,
                                                           Event *wb_event,
                                                           LSQUnit<Impl> *lsq_ptr)
     : Event(&mainEventQueue),
-      storeIdx(store_idx),
       wbEvent(wb_event),
+      storeIdx(store_idx),
       lsqPtr(lsq_ptr)
 {
     this->setFlags(Event::AutoDelete);
@@ -86,15 +86,13 @@ LSQUnit<Impl>::init(Params *params, unsigned maxLQEntries,
 
     lsqID = id;
 
-    LQEntries = maxLQEntries;
-    SQEntries = maxSQEntries;
+    // Add 1 for the sentinel entry (they are circular queues).
+    LQEntries = maxLQEntries + 1;
+    SQEntries = maxSQEntries + 1;
 
     loadQueue.resize(LQEntries);
     storeQueue.resize(SQEntries);
 
-
-    // May want to initialize these entries to NULL
-
     loadHead = loadTail = 0;
 
     storeHead = storeWBIdx = storeTail = 0;
@@ -104,7 +102,7 @@ LSQUnit<Impl>::init(Params *params, unsigned maxLQEntries,
 
     dcacheInterface = params->dcacheInterface;
 
-    loadFaultInst = storeFaultInst = memDepViolator = NULL;
+    memDepViolator = NULL;
 
     blockedLoadSeqNum = 0;
 }
@@ -152,6 +150,8 @@ LSQUnit<Impl>::switchOut()
     for (int i = 0; i < loadQueue.size(); ++i)
         loadQueue[i] = NULL;
 
+    assert(storesToWB == 0);
+
     while (storesToWB > 0 &&
            storeWBIdx != storeTail &&
            storeQueue[storeWBIdx].inst &&
@@ -218,7 +218,7 @@ LSQUnit<Impl>::takeOverFrom()
 
     usedPorts = 0;
 
-    loadFaultInst = storeFaultInst = memDepViolator = NULL;
+    memDepViolator = NULL;
 
     blockedLoadSeqNum = 0;
 
@@ -231,16 +231,17 @@ template<class Impl>
 void
 LSQUnit<Impl>::resizeLQ(unsigned size)
 {
-    assert( size >= LQEntries);
+    unsigned size_plus_sentinel = size + 1;
+    assert(size_plus_sentinel >= LQEntries);
 
-    if (size > LQEntries) {
-        while (size > loadQueue.size()) {
+    if (size_plus_sentinel > LQEntries) {
+        while (size_plus_sentinel > loadQueue.size()) {
             DynInstPtr dummy;
             loadQueue.push_back(dummy);
             LQEntries++;
         }
     } else {
-        LQEntries = size;
+        LQEntries = size_plus_sentinel;
     }
 
 }
@@ -249,14 +250,15 @@ template<class Impl>
 void
 LSQUnit<Impl>::resizeSQ(unsigned size)
 {
-    if (size > SQEntries) {
-        while (size > storeQueue.size()) {
+    unsigned size_plus_sentinel = size + 1;
+    if (size_plus_sentinel > SQEntries) {
+        while (size_plus_sentinel > storeQueue.size()) {
             SQEntry dummy;
             storeQueue.push_back(dummy);
             SQEntries++;
         }
     } else {
-        SQEntries = size;
+        SQEntries = size_plus_sentinel;
     }
 }
 
@@ -264,10 +266,8 @@ template <class Impl>
 void
 LSQUnit<Impl>::insert(DynInstPtr &inst)
 {
-    // Make sure we really have a memory reference.
     assert(inst->isMemRef());
 
-    // Make sure it's one of the two classes of memory references.
     assert(inst->isLoad() || inst->isStore());
 
     if (inst->isLoad()) {
@@ -283,7 +283,8 @@ template <class Impl>
 void
 LSQUnit<Impl>::insertLoad(DynInstPtr &load_inst)
 {
-    assert((loadTail + 1) % LQEntries != loadHead && loads < LQEntries);
+    assert((loadTail + 1) % LQEntries != loadHead);
+    assert(loads < LQEntries);
 
     DPRINTF(LSQUnit, "Inserting load PC %#x, idx:%i [sn:%lli]\n",
             load_inst->readPC(), loadTail, load_inst->seqNum);
@@ -322,7 +323,6 @@ LSQUnit<Impl>::insertStore(DynInstPtr &store_inst)
     incrStIdx(storeTail);
 
     ++stores;
-
 }
 
 template <class Impl>
@@ -370,39 +370,6 @@ LSQUnit<Impl>::numLoadsReady()
     return retval;
 }
 
-#if 0
-template <class Impl>
-Fault
-LSQUnit<Impl>::executeLoad()
-{
-    Fault load_fault = NoFault;
-    DynInstPtr load_inst;
-
-    assert(readyLoads.size() != 0);
-
-    // Execute a ready load.
-    LdMapIt ready_it = readyLoads.begin();
-
-    load_inst = (*ready_it).second;
-
-    // Execute the instruction, which is held in the data portion of the
-    // iterator.
-    load_fault = load_inst->execute();
-
-    // If it executed successfully, then switch it over to the executed
-    // loads list.
-    if (load_fault == NoFault) {
-        executedLoads[load_inst->seqNum] = load_inst;
-
-        readyLoads.erase(ready_it);
-    } else {
-        loadFaultInst = load_inst;
-    }
-
-    return load_fault;
-}
-#endif
-
 template <class Impl>
 Fault
 LSQUnit<Impl>::executeLoad(DynInstPtr &inst)
@@ -413,33 +380,14 @@ LSQUnit<Impl>::executeLoad(DynInstPtr &inst)
     DPRINTF(LSQUnit, "Executing load PC %#x, [sn:%lli]\n",
             inst->readPC(),inst->seqNum);
 
-    // Make sure it's really in the list.
-    // Normally it should always be in the list.  However,
-    /* due to a syscall it may not be the list.
-#ifdef DEBUG
-    int i = loadHead;
-    while (1) {
-        if (i == loadTail && !find(inst)) {
-            assert(0 && "Load not in the queue!");
-        } else if (loadQueue[i] == inst) {
-            break;
-        }
-
-        i = i + 1;
-        if (i >= LQEntries) {
-            i = 0;
-        }
-    }
-#endif // DEBUG*/
-
 //    load_fault = inst->initiateAcc();
     load_fault = inst->execute();
 
     // If the instruction faulted, then we need to send it along to commit
     // without the instruction completing.
     if (load_fault != NoFault) {
-        // Maybe just set it as can commit here, although that might cause
-        // some other problems with sending traps to the ROB too quickly.
+        // Send this instruction to commit, also make sure iew stage
+        // realizes there is activity.
         iewStage->instToCommit(inst);
         iewStage->activityThisCycle();
     }
@@ -447,20 +395,6 @@ LSQUnit<Impl>::executeLoad(DynInstPtr &inst)
     return load_fault;
 }
 
-template <class Impl>
-Fault
-LSQUnit<Impl>::executeLoad(int lq_idx)
-{
-    // Very hackish.  Not sure the best way to check that this
-    // instruction is at the head of the ROB.  I should have some sort
-    // of extra information here so that I'm not overloading the
-    // canCommit signal for 15 different things.
-    loadQueue[lq_idx]->setCanCommit();
-    Fault ret_fault = executeLoad(loadQueue[lq_idx]);
-    loadQueue[lq_idx]->clearCanCommit();
-    return ret_fault;
-}
-
 template <class Impl>
 Fault
 LSQUnit<Impl>::executeStore(DynInstPtr &store_inst)
@@ -481,11 +415,7 @@ LSQUnit<Impl>::executeStore(DynInstPtr &store_inst)
     Fault store_fault = store_inst->initiateAcc();
 //    Fault store_fault = store_inst->execute();
 
-    // Store size should now be available.  Use it to get proper offset for
-    // addr comparisons.
-    int size = storeQueue[store_idx].size;
-
-    if (size == 0) {
+    if (storeQueue[store_idx].size == 0) {
         DPRINTF(LSQUnit,"Fault on Store PC %#x, [sn:%lli],Size = 0\n",
                 store_inst->readPC(),store_inst->seqNum);
 
@@ -494,30 +424,25 @@ LSQUnit<Impl>::executeStore(DynInstPtr &store_inst)
 
     assert(store_fault == NoFault);
 
-    if (!storeFaultInst) {
-        if (store_fault != NoFault) {
-            panic("Fault in a store instruction!");
-            storeFaultInst = store_inst;
-        } else if (store_inst->isNonSpeculative()) {
-            // Nonspeculative accesses (namely store conditionals)
-            // need to set themselves as able to writeback if we
-            // haven't had a fault by here.
-            storeQueue[store_idx].canWB = true;
+    if (store_inst->isNonSpeculative()) {
+        // Nonspeculative accesses (namely store conditionals)
+        // need to set themselves as able to writeback if we
+        // haven't had a fault by here.
+        storeQueue[store_idx].canWB = true;
 
-            ++storesToWB;
-        }
+        ++storesToWB;
     }
 
     if (!memDepViolator) {
         while (load_idx != loadTail) {
-            // Actually should only check loads that have actually executed
-            // Might be safe because effAddr is set to InvalAddr when the
-            // dyn inst is created.
-
-            // Must actually check all addrs in the proper size range
-            // Which is more correct than needs to be.  What if for now we just
-            // assume all loads are quad-word loads, and do the addr based
-            // on that.
+            // Really only need to check loads that have actually executed
+            // It's safe to check all loads because effAddr is set to
+            // InvalAddr when the dyn inst is created.
+
+            // @todo: For now this is extra conservative, detecting a
+            // violation if the addresses match assuming all accesses
+            // are quad word accesses.
+
             // @todo: Fix this, magic number being used here
             if ((loadQueue[load_idx]->effAddr >> 8) ==
                 (store_inst->effAddr >> 8)) {
@@ -555,32 +480,6 @@ LSQUnit<Impl>::commitLoad()
     --loads;
 }
 
-template <class Impl>
-void
-LSQUnit<Impl>::commitLoad(InstSeqNum &inst)
-{
-    // Hopefully I don't use this function too much
-    panic("Don't use this function!");
-
-    int i = loadHead;
-    while (1) {
-        if (i == loadTail) {
-            assert(0 && "Load not in the queue!");
-        } else if (loadQueue[i]->seqNum == inst) {
-            break;
-        }
-
-        ++i;
-        if (i >= LQEntries) {
-            i = 0;
-        }
-    }
-
-    loadQueue[i]->removeInLSQ();
-    loadQueue[i] = NULL;
-    --loads;
-}
-
 template <class Impl>
 void
 LSQUnit<Impl>::commitLoads(InstSeqNum &youngest_inst)
@@ -602,6 +501,8 @@ LSQUnit<Impl>::commitStores(InstSeqNum &youngest_inst)
 
     while (store_idx != storeTail) {
         assert(storeQueue[store_idx].inst);
+        // Mark any stores that are now committed and have not yet
+        // been marked as able to write back.
         if (!storeQueue[store_idx].canWB) {
             if (storeQueue[store_idx].inst->seqNum > youngest_inst) {
                 break;
@@ -613,7 +514,6 @@ LSQUnit<Impl>::commitStores(InstSeqNum &youngest_inst)
 
             storeQueue[store_idx].canWB = true;
 
-//            --stores;
             ++storesToWB;
         }
 
@@ -631,6 +531,8 @@ LSQUnit<Impl>::writebackStores()
            storeQueue[storeWBIdx].canWB &&
            usedPorts < cachePorts) {
 
+        // Store didn't write any data so no need to write it back to
+        // memory.
         if (storeQueue[storeWBIdx].size == 0) {
             completeStore(storeWBIdx);
 
@@ -659,7 +561,6 @@ LSQUnit<Impl>::writebackStores()
         MemReqPtr req = storeQueue[storeWBIdx].req;
         storeQueue[storeWBIdx].committed = true;
 
-//	Fault fault = cpu->translateDataWriteReq(req);
         req->cmd = Write;
         req->completionEvent = NULL;
         req->time = curTick;
@@ -689,6 +590,12 @@ LSQUnit<Impl>::writebackStores()
           default:
             panic("Unexpected store size!\n");
         }
+
+        // Stores other than store conditionals are completed at this
+        // time.  Mark them as completed and, if we have a checker,
+        // tell it that the instruction is completed.
+        // @todo: Figure out what time I can say stores are complete in
+        // the timing memory.
         if (!(req->flags & LOCKED)) {
             storeQueue[storeWBIdx].inst->setCompleted();
             if (cpu->checker) {
@@ -714,57 +621,35 @@ LSQUnit<Impl>::writebackStores()
                 iewStage->replayMemInst(loadQueue[stallingLoadIdx]);
             }
 
-            if (result != MA_HIT && dcacheInterface->doEvents()) {
-                typename IEW::LdWritebackEvent *wb = NULL;
-                if (req->flags & LOCKED) {
-                    // Stx_C should not generate a system port transaction,
-                    // but that might be hard to accomplish.
-                    wb = new typename
-                        IEW::LdWritebackEvent(storeQueue[storeWBIdx].inst,
+            typename IEW::LdWritebackEvent *wb = NULL;
+            if (req->flags & LOCKED) {
+                // Stx_C should not generate a system port transaction
+                // if it misses in the cache, but that might be hard
+                // to accomplish without explicit cache support.
+                wb = new typename
+                    IEW::LdWritebackEvent(storeQueue[storeWBIdx].inst,
                                               iewStage);
-                    store_event->wbEvent = wb;
-                }
+                store_event->wbEvent = wb;
+            }
 
-                DPRINTF(LSQUnit,"D-Cache Write Miss!\n");
+            if (result != MA_HIT && dcacheInterface->doEvents()) {
+                DPRINTF(LSQUnit,"D-Cache Write Miss on idx:%i!\n",
+                        storeWBIdx);
 
                 DPRINTF(Activity, "Active st accessing mem miss [sn:%lli]\n",
                         storeQueue[storeWBIdx].inst->seqNum);
 
-                lastDcacheStall = curTick;
-
-//                _status = DcacheMissStall;
-
                 //mshrSeqNums.push_back(storeQueue[storeWBIdx].inst->seqNum);
 
                 //DPRINTF(LSQUnit, "Added MSHR. count = %i\n",mshrSeqNums.size());
 
-                // Increment stat here or something
+                // @todo: Increment stat here.
             } else {
                 DPRINTF(LSQUnit,"D-Cache: Write Hit on idx:%i !\n",
                         storeWBIdx);
 
                 DPRINTF(Activity, "Active st accessing mem hit [sn:%lli]\n",
                         storeQueue[storeWBIdx].inst->seqNum);
-
-
-                if (req->flags & LOCKED) {
-                    // Stx_C does not generate a system port transaction.
-/*
-                    if (req->flags & UNCACHEABLE) {
-                        req->result = 2;
-                    } else {
-                        if (cpu->lockFlag && cpu->lockAddr == req->paddr) {
-                            req->result=1;
-                        } else {
-                            req->result = 0;
-                        }
-                    }
-*/
-                    typename IEW::LdWritebackEvent *wb =
-                        new typename IEW::LdWritebackEvent(storeQueue[storeWBIdx].inst,
-                                                           iewStage);
-                    store_event->wbEvent = wb;
-                }
             }
 
             incrStIdx(storeWBIdx);
@@ -798,14 +683,12 @@ void
 LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
 {
     DPRINTF(LSQUnit, "Squashing until [sn:%lli]!"
-            "(Loads:%i Stores:%i)\n",squashed_num,loads,stores);
+            "(Loads:%i Stores:%i)\n", squashed_num, loads, stores);
 
     int load_idx = loadTail;
     decrLdIdx(load_idx);
 
     while (loads != 0 && loadQueue[load_idx]->seqNum > squashed_num) {
-
-        // Clear the smart pointer to make sure it is decremented.
         DPRINTF(LSQUnit,"Load Instruction PC %#x squashed, "
                 "[sn:%lli]\n",
                 loadQueue[load_idx]->readPC(),
@@ -817,6 +700,7 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
             stallingLoadIdx = 0;
         }
 
+        // Clear the smart pointer to make sure it is decremented.
         loadQueue[load_idx]->squashed = true;
         loadQueue[load_idx] = NULL;
         --loads;
@@ -840,19 +724,18 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
 
     while (stores != 0 &&
            storeQueue[store_idx].inst->seqNum > squashed_num) {
-
+        // Instructions marked as can WB are already committed.
         if (storeQueue[store_idx].canWB) {
             break;
         }
 
-        // Clear the smart pointer to make sure it is decremented.
         DPRINTF(LSQUnit,"Store Instruction PC %#x squashed, "
                 "idx:%i [sn:%lli]\n",
                 storeQueue[store_idx].inst->readPC(),
                 store_idx, storeQueue[store_idx].inst->seqNum);
 
-        // I don't think this can happen.  It should have been cleared by the
-        // stalling load.
+        // I don't think this can happen.  It should have been cleared
+        // by the stalling load.
         if (isStalled() &&
             storeQueue[store_idx].inst->seqNum == stallingStoreIsn) {
             panic("Is stalled should have been cleared by stalling load!\n");
@@ -860,13 +743,17 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
             stallingStoreIsn = 0;
         }
 
+        // Clear the smart pointer to make sure it is decremented.
         storeQueue[store_idx].inst->squashed = true;
         storeQueue[store_idx].inst = NULL;
         storeQueue[store_idx].canWB = 0;
 
         if (storeQueue[store_idx].req) {
+            // There should not be a completion event if the store has
+            // not yet committed.
             assert(!storeQueue[store_idx].req->completionEvent);
         }
+
         storeQueue[store_idx].req = NULL;
         --stores;
 
@@ -877,36 +764,6 @@ LSQUnit<Impl>::squash(const InstSeqNum &squashed_num)
     }
 }
 
-template <class Impl>
-void
-LSQUnit<Impl>::dumpInsts()
-{
-    cprintf("Load store queue: Dumping instructions.\n");
-    cprintf("Load queue size: %i\n", loads);
-    cprintf("Load queue: ");
-
-    int load_idx = loadHead;
-
-    while (load_idx != loadTail && loadQueue[load_idx]) {
-        cprintf("%#x ", loadQueue[load_idx]->readPC());
-
-        incrLdIdx(load_idx);
-    }
-
-    cprintf("Store queue size: %i\n", stores);
-    cprintf("Store queue: ");
-
-    int store_idx = storeHead;
-
-    while (store_idx != storeTail && storeQueue[store_idx].inst) {
-        cprintf("%#x ", storeQueue[store_idx].inst->readPC());
-
-        incrStIdx(store_idx);
-    }
-
-    cprintf("\n");
-}
-
 template <class Impl>
 void
 LSQUnit<Impl>::completeStore(int store_idx)
@@ -930,7 +787,9 @@ LSQUnit<Impl>::completeStore(int store_idx)
         iewStage->updateLSQNextCycle = true;
     }
 
-    DPRINTF(LSQUnit, "Store head idx:%i\n", storeHead);
+    DPRINTF(LSQUnit, "Completing store [sn:%lli], idx:%i, store head "
+            "idx:%i\n",
+            storeQueue[store_idx].inst->seqNum, store_idx, storeHead);
 
     if (isStalled() &&
         storeQueue[store_idx].inst->seqNum == stallingStoreIsn) {
@@ -943,6 +802,10 @@ LSQUnit<Impl>::completeStore(int store_idx)
     }
 
     storeQueue[store_idx].inst->setCompleted();
+
+    // Tell the checker we've completed this instruction.  Some stores
+    // may get reported twice to the checker, but the checker can
+    // handle that case.
     if (cpu->checker) {
         cpu->checker->tick(storeQueue[store_idx].inst);
     }
@@ -979,3 +842,33 @@ LSQUnit<Impl>::decrLdIdx(int &load_idx)
     if (--load_idx < 0)
         load_idx += LQEntries;
 }
+
+template <class Impl>
+void
+LSQUnit<Impl>::dumpInsts()
+{
+    cprintf("Load store queue: Dumping instructions.\n");
+    cprintf("Load queue size: %i\n", loads);
+    cprintf("Load queue: ");
+
+    int load_idx = loadHead;
+
+    while (load_idx != loadTail && loadQueue[load_idx]) {
+        cprintf("%#x ", loadQueue[load_idx]->readPC());
+
+        incrLdIdx(load_idx);
+    }
+
+    cprintf("Store queue size: %i\n", stores);
+    cprintf("Store queue: ");
+
+    int store_idx = storeHead;
+
+    while (store_idx != storeTail && storeQueue[store_idx].inst) {
+        cprintf("%#x ", storeQueue[store_idx].inst->readPC());
+
+        incrStIdx(store_idx);
+    }
+
+    cprintf("\n");
+}
diff --git a/cpu/o3/mem_dep_unit.hh b/cpu/o3/mem_dep_unit.hh
index 141e0fdc4..acbe08ec2 100644
--- a/cpu/o3/mem_dep_unit.hh
+++ b/cpu/o3/mem_dep_unit.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -201,13 +201,6 @@ class MemDepUnit {
         static int memdep_erase;
     };
 
-    struct ltMemDepEntry {
-        bool operator() (const MemDepEntryPtr &lhs, const MemDepEntryPtr &rhs)
-        {
-            return lhs->inst->seqNum < rhs->inst->seqNum;
-        }
-    };
-
     /** Finds the memory dependence entry in the hash map. */
     inline MemDepEntryPtr &findInHash(const DynInstPtr &inst);
 
diff --git a/cpu/o3/mem_dep_unit_impl.hh b/cpu/o3/mem_dep_unit_impl.hh
index 05a33685d..8b195baab 100644
--- a/cpu/o3/mem_dep_unit_impl.hh
+++ b/cpu/o3/mem_dep_unit_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -141,12 +141,12 @@ MemDepUnit<MemDepPred, Impl>::insert(DynInstPtr &inst)
         std::pair<InstSeqNum, MemDepEntryPtr>(inst->seqNum, inst_entry));
     MemDepEntry::memdep_insert++;
 
-    // Add the instruction to the instruction list.
     instList[tid].push_back(inst);
 
     inst_entry->listIt = --(instList[tid].end());
 
-    // Check the dependence predictor for any producing stores.
+    // Check any barriers and the dependence predictor for any
+    // producing stores.
     InstSeqNum producing_store;
     if (inst->isLoad() && loadBarrier) {
         producing_store = loadBarrierSN;
@@ -181,7 +181,7 @@ MemDepUnit<MemDepPred, Impl>::insert(DynInstPtr &inst)
             moveToReady(inst_entry);
         }
     } else {
-        // Otherwise make the instruction dependent on the store.
+        // Otherwise make the instruction dependent on the store/barrier.
         DPRINTF(MemDepUnit, "Adding to dependency list; "
                 "inst PC %#x is dependent on [sn:%lli].\n",
                 inst->readPC(), producing_store);
@@ -193,8 +193,6 @@ MemDepUnit<MemDepPred, Impl>::insert(DynInstPtr &inst)
         // Add this instruction to the list of dependents.
         store_entry->dependInsts.push_back(inst_entry);
 
-//        inst_entry->producingStore = store_entry;
-
         if (inst->isLoad()) {
             ++conflictingLoads;
         } else {
@@ -370,8 +368,6 @@ MemDepUnit<MemDepPred, Impl>::completed(DynInstPtr &inst)
 
     instList[tid].erase((*hash_it).second->listIt);
 
-//    (*hash_it).second->inst = NULL;
-
     (*hash_it).second = NULL;
 
     memDepHash.erase(hash_it);
@@ -416,7 +412,6 @@ MemDepUnit<MemDepPred, Impl>::wakeDependents(DynInstPtr &inst)
 
         if (!woken_inst->inst) {
             // Potentially removed mem dep entries could be on this list
-//            inst_entry->dependInsts[i] = NULL;
             continue;
         }
 
@@ -429,7 +424,6 @@ MemDepUnit<MemDepPred, Impl>::wakeDependents(DynInstPtr &inst)
         } else {
             woken_inst->memDepReady = true;
         }
-//        inst_entry->dependInsts[i] = NULL;
     }
 
     inst_entry->dependInsts.clear();
@@ -468,13 +462,7 @@ MemDepUnit<MemDepPred, Impl>::squash(const InstSeqNum &squashed_num,
         assert(hash_it != memDepHash.end());
 
         (*hash_it).second->squashed = true;
-/*
-        for (int i = 0; i < (*hash_it).second->dependInsts.size(); ++i) {
-            (*hash_it).second->dependInsts[i] = NULL;
-        }
 
-        (*hash_it).second->inst = NULL;
-*/
         (*hash_it).second = NULL;
 
         memDepHash.erase(hash_it);
diff --git a/cpu/o3/rename.hh b/cpu/o3/rename.hh
index dd2cb0c18..3f1a27bb5 100644
--- a/cpu/o3/rename.hh
+++ b/cpu/o3/rename.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -35,15 +35,16 @@
 #include "base/timebuf.hh"
 
 /**
- * DefaultRename handles both single threaded and SMT rename. Its width is
- * specified by the parameters; each cycle it tries to rename that many
- * instructions. It holds onto the rename history of all instructions with
- * destination registers, storing the arch. register, the new physical
- * register, and the old physical register, to allow for undoing of mappings
- * if squashing happens, or freeing up registers upon commit. Rename handles
- * blocking if the ROB, IQ, or LSQ is going to be full. Rename also handles
- * barriers, and does so by stalling on the instruction until the ROB is
- * empty and there are no instructions in flight to the ROB.
+ * DefaultRename handles both single threaded and SMT rename. Its
+ * width is specified by the parameters; each cycle it tries to rename
+ * that many instructions. It holds onto the rename history of all
+ * instructions with destination registers, storing the
+ * arch. register, the new physical register, and the old physical
+ * register, to allow for undoing of mappings if squashing happens, or
+ * freeing up registers upon commit. Rename handles blocking if the
+ * ROB, IQ, or LSQ is going to be full. Rename also handles barriers,
+ * and does so by stalling on the instruction until the ROB is empty
+ * and there are no instructions in flight to the ROB.
  */
 template<class Impl>
 class DefaultRename
@@ -68,14 +69,15 @@ class DefaultRename
     // Typedefs from the ISA.
     typedef TheISA::RegIndex RegIndex;
 
-    // A deque is used to queue the instructions.  Barrier insts must be
-    // added to the front of the deque, which is the only reason for using
-    // a deque instead of a queue. (Most other stages use a queue)
+    // A list is used to queue the instructions.  Barrier insts must
+    // be added to the front of the list, which is the only reason for
+    // using a list instead of a queue. (Most other stages use a
+    // queue)
     typedef std::list<DynInstPtr> InstQueue;
 
   public:
-    /** Overall rename status. Used to determine if the CPU can deschedule
-     * itself due to a lack of activity.
+    /** Overall rename status. Used to determine if the CPU can
+     * deschedule itself due to a lack of activity.
      */
     enum RenameStatus {
         Active,
diff --git a/cpu/o3/rename_impl.hh b/cpu/o3/rename_impl.hh
index db4bb2ffe..081581c92 100644
--- a/cpu/o3/rename_impl.hh
+++ b/cpu/o3/rename_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -209,17 +209,13 @@ template <class Impl>
 void
 DefaultRename<Impl>::initStage()
 {
+    // Grab the number of free entries directly from the stages.
     for (int tid=0; tid < numThreads; tid++) {
         freeEntries[tid].iqEntries = iew_ptr->instQueue.numFreeEntries(tid);
         freeEntries[tid].lsqEntries = iew_ptr->ldstQueue.numFreeEntries(tid);
         freeEntries[tid].robEntries = commit_ptr->numROBFreeEntries(tid);
         emptyROB[tid] = true;
     }
-
-    // Clear these pointers so they are not accidentally used in
-    // non-initialization code.
-//    iew_ptr = NULL;
-//    commit_ptr = NULL;
 }
 
 template<class Impl>
@@ -299,6 +295,7 @@ DefaultRename<Impl>::takeOverFrom()
     _status = Inactive;
     initStage();
 
+    // Reset all state prior to taking over from the other CPU.
     for (int i=0; i< numThreads; i++) {
         renameStatus[i] = Idle;
 
@@ -326,7 +323,7 @@ DefaultRename<Impl>::squash(unsigned tid)
     if (renameStatus[tid] == Blocked ||
         renameStatus[tid] == Unblocking ||
         renameStatus[tid] == SerializeStall) {
-#if !FULL_SYSTEM
+#if 0
         // In syscall emulation, we can have both a block and a squash due
         // to a syscall in the same cycle.  This would cause both signals to
         // be high.  This shouldn't happen in full system.
@@ -344,7 +341,7 @@ DefaultRename<Impl>::squash(unsigned tid)
     // Set the status to Squashing.
     renameStatus[tid] = Squashing;
 
-    // Clear the skid buffer in case it has any data in it.
+    // Squash any instructions from decode.
     unsigned squashCount = 0;
 
     for (int i=0; i<fromDecode->size; i++) {
@@ -367,9 +364,6 @@ template <class Impl>
 void
 DefaultRename<Impl>::tick()
 {
-    // Rename will need to try to rename as many instructions as it
-    // has bandwidth, unless it is blocked.
-
     wroteToTimeBuffer = false;
 
     blockThisCycle = false;
@@ -454,8 +448,6 @@ DefaultRename<Impl>::rename(bool &status_change, unsigned tid)
     } else if (renameStatus[tid] == Unblocking) {
         renameInsts(tid);
 
-//        ++renameUnblockCycles;
-
         if (validInsts()) {
             // Add the current inputs to the skid buffer so they can be
             // reprocessed when this stage unblocks.
@@ -575,7 +567,6 @@ DefaultRename<Impl>::renameInsts(unsigned tid)
 
         insts_to_rename.pop_front();
 
-        //Use skidBuffer with oldest instructions
         if (renameStatus[tid] == Unblocking) {
             DPRINTF(Rename,"[tid:%u]: Removing [sn:%lli] PC:%#x from rename "
                     "skidBuffer\n",
@@ -711,10 +702,10 @@ void
 DefaultRename<Impl>::sortInsts()
 {
     int insts_from_decode = fromDecode->size;
-
+#ifdef DEBUG
     for (int i=0; i < numThreads; i++)
         assert(insts[i].empty());
-
+#endif
     for (int i = 0; i < insts_from_decode; ++i) {
         DynInstPtr inst = fromDecode->insts[i];
         insts[inst->threadNumber].push_back(inst);
@@ -794,8 +785,8 @@ DefaultRename<Impl>::block(unsigned tid)
             wroteToTimeBuffer = true;
         }
 
-        // Rename can not go from SerializeStall to Blocked, otherwise it would
-        // not know to complete the serialize stall.
+        // Rename can not go from SerializeStall to Blocked, otherwise
+        // it would not know to complete the serialize stall.
         if (renameStatus[tid] != SerializeStall) {
             // Set status to Blocked.
             renameStatus[tid] = Blocked;
@@ -835,15 +826,11 @@ DefaultRename<Impl>::doSquash(unsigned tid)
 
     InstSeqNum squashed_seq_num = fromCommit->commitInfo[tid].doneSeqNum;
 
-//#if FULL_SYSTEM
-//    assert(!historyBuffer[tid].empty());
-//#else
     // After a syscall squashes everything, the history buffer may be empty
     // but the ROB may still be squashing instructions.
     if (historyBuffer[tid].empty()) {
         return;
     }
-//#endif // FULL_SYSTEM
 
     // Go through the most recent instructions, undoing the mappings
     // they did and freeing up the registers.
@@ -896,8 +883,8 @@ DefaultRename<Impl>::removeFromHistory(InstSeqNum inst_seq_num, unsigned tid)
            hb_it != historyBuffer[tid].end() &&
            (*hb_it).instSeqNum <= inst_seq_num) {
 
-        DPRINTF(Rename, "[tid:%u]: Freeing up older rename of reg %i, sequence"
-                " number %i.\n",
+        DPRINTF(Rename, "[tid:%u]: Freeing up older rename of reg %i, "
+                "[sn:%lli].\n",
                 tid, (*hb_it).prevPhysReg, (*hb_it).instSeqNum);
 
         freeList->addReg((*hb_it).prevPhysReg);
diff --git a/cpu/o3/rename_map.cc b/cpu/o3/rename_map.cc
index 8ba632e65..fc59058a1 100644
--- a/cpu/o3/rename_map.cc
+++ b/cpu/o3/rename_map.cc
@@ -32,18 +32,12 @@
 
 using namespace std;
 
-// Todo: Consider making functions inline.  Avoid having things that are
-// using the zero register or misc registers from adding on the registers
-// to the free list.  Possibly remove the direct communication between
-// this and the freelist.  Considering making inline bool functions that
-// determine if the register is a logical int, logical fp, physical int,
-// physical fp, etc.
+// @todo: Consider making inline bool functions that determine if the
+// register is a logical int, logical fp, physical int, physical fp,
+// etc.
 
 SimpleRenameMap::~SimpleRenameMap()
 {
-    // Delete the rename maps as they were allocated with new.
-    //delete [] intRenameMap;
-    //delete [] floatRenameMap;
 }
 
 void
@@ -105,7 +99,8 @@ SimpleRenameMap::init(unsigned _numLogicalIntRegs,
         // Although the index refers purely to architected registers, because
         // the floating reg indices come after the integer reg indices, they
         // may exceed the size of a normal RegIndex (short).
-        for (PhysRegIndex index = numLogicalIntRegs; index < numLogicalRegs; ++index)
+        for (PhysRegIndex index = numLogicalIntRegs;
+             index < numLogicalRegs; ++index)
         {
             floatRenameMap[index].physical_reg = freg_idx++;
         }
@@ -132,14 +127,10 @@ SimpleRenameMap::init(unsigned _numLogicalIntRegs,
 void
 SimpleRenameMap::setFreeList(SimpleFreeList *fl_ptr)
 {
-    //Setup the interface to the freelist.
     freeList = fl_ptr;
 }
 
 
-// Don't allow this stage to fault; force that check to the rename stage.
-// Simply ask to rename a logical register and get back a new physical
-// register index.
 SimpleRenameMap::RenameInfo
 SimpleRenameMap::rename(RegIndex arch_reg)
 {
@@ -152,13 +143,11 @@ SimpleRenameMap::rename(RegIndex arch_reg)
         // requested architected register.
         prev_reg = intRenameMap[arch_reg].physical_reg;
 
-        // If it's not referencing the zero register, then mark the register
-        // as not ready.
+        // If it's not referencing the zero register, then rename the
+        // register.
         if (arch_reg != intZeroReg) {
-            // Get a free physical register to rename to.
             renamed_reg = freeList->getIntReg();
 
-            // Update the integer rename map.
             intRenameMap[arch_reg].physical_reg = renamed_reg;
 
             assert(renamed_reg >= 0 && renamed_reg < numPhysicalIntRegs);
@@ -168,20 +157,15 @@ SimpleRenameMap::rename(RegIndex arch_reg)
             renamed_reg = intZeroReg;
         }
     } else if (arch_reg < numLogicalRegs) {
-        // Subtract off the base offset for floating point registers.
-//        arch_reg = arch_reg - numLogicalIntRegs;
-
         // Record the current physical register that is renamed to the
         // requested architected register.
         prev_reg = floatRenameMap[arch_reg].physical_reg;
 
-        // If it's not referencing the zero register, then mark the register
-        // as not ready.
+        // If it's not referencing the zero register, then rename the
+        // register.
         if (arch_reg != floatZeroReg) {
-            // Get a free floating point register to rename to.
             renamed_reg = freeList->getFloatReg();
 
-            // Update the floating point rename map.
             floatRenameMap[arch_reg].physical_reg = renamed_reg;
 
             assert(renamed_reg < numPhysicalRegs &&
@@ -194,10 +178,10 @@ SimpleRenameMap::rename(RegIndex arch_reg)
         // Subtract off the base offset for miscellaneous registers.
         arch_reg = arch_reg - numLogicalRegs;
 
-        // No renaming happens to the misc. registers.  They are simply the
-        // registers that come after all the  physical registers; thus
-        // take the base architected register and add the physical registers
-        // to it.
+        // No renaming happens to the misc. registers.  They are
+        // simply the registers that come after all the physical
+        // registers; thus take the base architected register and add
+        // the physical registers to it.
         renamed_reg = arch_reg + numPhysicalRegs;
 
         // Set the previous register to the same register; mainly it must be
@@ -211,17 +195,12 @@ SimpleRenameMap::rename(RegIndex arch_reg)
     return RenameInfo(renamed_reg, prev_reg);
 }
 
-//Perhaps give this a pair as a return value, of the physical register
-//and whether or not it's ready.
 PhysRegIndex
 SimpleRenameMap::lookup(RegIndex arch_reg)
 {
     if (arch_reg < numLogicalIntRegs) {
         return intRenameMap[arch_reg].physical_reg;
     } else if (arch_reg < numLogicalRegs) {
-        // Subtract off the base FP offset.
-//        arch_reg = arch_reg - numLogicalIntRegs;
-
         return floatRenameMap[arch_reg].physical_reg;
     } else {
         // Subtract off the misc registers offset.
@@ -233,51 +212,23 @@ SimpleRenameMap::lookup(RegIndex arch_reg)
     }
 }
 
-// In this implementation the miscellaneous registers do not actually rename,
-// so this function does not allow you to try to change their mappings.
 void
 SimpleRenameMap::setEntry(RegIndex arch_reg, PhysRegIndex renamed_reg)
 {
+    // In this implementation the miscellaneous registers do not
+    // actually rename, so this function does not allow you to try to
+    // change their mappings.
     if (arch_reg < numLogicalIntRegs) {
         DPRINTF(Rename, "Rename Map: Integer register %i being set to %i.\n",
                 (int)arch_reg, renamed_reg);
 
         intRenameMap[arch_reg].physical_reg = renamed_reg;
     } else if (arch_reg < numLogicalIntRegs + numLogicalFloatRegs) {
-
-
         DPRINTF(Rename, "Rename Map: Float register %i being set to %i.\n",
                 (int)arch_reg - numLogicalIntRegs, renamed_reg);
 
         floatRenameMap[arch_reg].physical_reg = renamed_reg;
     }
-
-    //assert(arch_reg < (numLogicalIntRegs + numLogicalFloatRegs));
-}
-
-void
-SimpleRenameMap::squash(vector<RegIndex> freed_regs,
-                        vector<UnmapInfo> unmaps)
-{
-    panic("Not sure this function should be called.");
-
-    // Not sure the rename map should be able to access the free list
-    // like this.
-    while (!freed_regs.empty()) {
-        RegIndex free_register = freed_regs.back();
-
-        if (free_register < numPhysicalIntRegs) {
-            freeList->addIntReg(free_register);
-        } else {
-            // Subtract off the base FP dependence tag.
-            free_register = free_register - numPhysicalIntRegs;
-            freeList->addFloatReg(free_register);
-        }
-
-        freed_regs.pop_back();
-    }
-
-    // Take unmap info and roll back the rename map.
 }
 
 int
diff --git a/cpu/o3/rename_map.hh b/cpu/o3/rename_map.hh
index 3ecbe45c3..d7e49ae83 100644
--- a/cpu/o3/rename_map.hh
+++ b/cpu/o3/rename_map.hh
@@ -101,9 +101,6 @@ class SimpleRenameMap
      */
     void setEntry(RegIndex arch_reg, PhysRegIndex renamed_reg);
 
-    void squash(std::vector<RegIndex> freed_regs,
-                std::vector<UnmapInfo> unmaps);
-
     int numFreeEntries();
 
   private:
@@ -153,7 +150,7 @@ class SimpleRenameMap
     };
 
     //Change this to private
-  public:
+  private:
     /** Integer rename map. */
     std::vector<RenameEntry> intRenameMap;
 
diff --git a/cpu/o3/rob.hh b/cpu/o3/rob.hh
index 0748850ea..e05eebe5a 100644
--- a/cpu/o3/rob.hh
+++ b/cpu/o3/rob.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -53,9 +53,7 @@ class ROB
     enum Status {
         Running,
         Idle,
-        ROBSquashing,
-        DcacheMissStall,
-        DcacheMissComplete
+        ROBSquashing
     };
 
     /** SMT ROB Sharing Policy */
@@ -112,7 +110,7 @@ class ROB
      *  no guarantee as to the return value if the ROB is empty.
      *  @retval Pointer to the DynInst that is at the head of the ROB.
      */
-    DynInstPtr readHeadInst();
+//    DynInstPtr readHeadInst();
 
     /** Returns a pointer to the head instruction of a specific thread within
      *  the ROB.
@@ -124,7 +122,7 @@ class ROB
      *  no guarantee as to the return value if the ROB is empty.
      *  @retval Pointer to the DynInst that is at the tail of the ROB.
      */
-    DynInstPtr readTailInst();
+//    DynInstPtr readTailInst();
 
     /** Returns a pointer to the tail instruction of a specific thread within
      *  the ROB.
@@ -133,7 +131,7 @@ class ROB
     DynInstPtr readTailInst(unsigned tid);
 
     /** Retires the head instruction, removing it from the ROB. */
-    void retireHead();
+//    void retireHead();
 
     /** Retires the head instruction of a specific thread, removing it from the
      *  ROB.
@@ -141,7 +139,7 @@ class ROB
     void retireHead(unsigned tid);
 
     /** Is the oldest instruction across all threads ready. */
-    bool isHeadReady();
+//    bool isHeadReady();
 
     /** Is the oldest instruction across a particular thread ready. */
     bool isHeadReady(unsigned tid);
@@ -200,35 +198,35 @@ class ROB
     void updateTail();
 
     /** Reads the PC of the oldest head instruction. */
-    uint64_t readHeadPC();
+//    uint64_t readHeadPC();
 
     /** Reads the PC of the head instruction of a specific thread. */
-    uint64_t readHeadPC(unsigned tid);
+//    uint64_t readHeadPC(unsigned tid);
 
     /** Reads the next PC of the oldest head instruction. */
-    uint64_t readHeadNextPC();
+//    uint64_t readHeadNextPC();
 
     /** Reads the next PC of the head instruction of a specific thread. */
-    uint64_t readHeadNextPC(unsigned tid);
+//    uint64_t readHeadNextPC(unsigned tid);
 
     /** Reads the sequence number of the oldest head instruction. */
-    InstSeqNum readHeadSeqNum();
+//    InstSeqNum readHeadSeqNum();
 
     /** Reads the sequence number of the head instruction of a specific thread.
      */
-    InstSeqNum readHeadSeqNum(unsigned tid);
+//    InstSeqNum readHeadSeqNum(unsigned tid);
 
     /** Reads the PC of the youngest tail instruction. */
-    uint64_t readTailPC();
+//    uint64_t readTailPC();
 
     /** Reads the PC of the tail instruction of a specific thread. */
-    uint64_t readTailPC(unsigned tid);
+//    uint64_t readTailPC(unsigned tid);
 
     /** Reads the sequence number of the youngest tail instruction. */
-    InstSeqNum readTailSeqNum();
+//    InstSeqNum readTailSeqNum();
 
     /** Reads the sequence number of tail instruction of a specific thread. */
-    InstSeqNum readTailSeqNum(unsigned tid);
+//    InstSeqNum readTailSeqNum(unsigned tid);
 
     /** Checks if the ROB is still in the process of squashing instructions.
      *  @retval Whether or not the ROB is done squashing.
diff --git a/cpu/o3/rob_impl.hh b/cpu/o3/rob_impl.hh
index 02a4bfbee..25e0c80fd 100644
--- a/cpu/o3/rob_impl.hh
+++ b/cpu/o3/rob_impl.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -201,20 +201,15 @@ template <class Impl>
 void
 ROB<Impl>::insertInst(DynInstPtr &inst)
 {
-    // Make sure we have the right number of instructions.
     //assert(numInstsInROB == countInsts());
-
-    // Make sure the instruction is valid.
     assert(inst);
 
     DPRINTF(ROB, "Adding inst PC %#x to the ROB.\n", inst->readPC());
 
-    // If the ROB is full then exit.
     assert(numInstsInROB != numEntries);
 
     int tid = inst->threadNumber;
 
-    // Place into ROB
     instList[tid].push_back(inst);
 
     //Set Up head iterator if this is the 1st instruction in the ROB
@@ -228,10 +223,8 @@ ROB<Impl>::insertInst(DynInstPtr &inst)
     tail = instList[tid].end();
     tail--;
 
-    // Mark as set in ROB
     inst->setInROB();
 
-    // Increment ROB count
     ++numInstsInROB;
     ++threadEntries[tid];
 
@@ -242,6 +235,7 @@ ROB<Impl>::insertInst(DynInstPtr &inst)
 
 // Whatever calls this function needs to ensure that it properly frees up
 // registers prior to this function.
+/*
 template <class Impl>
 void
 ROB<Impl>::retireHead()
@@ -249,7 +243,6 @@ ROB<Impl>::retireHead()
     //assert(numInstsInROB == countInsts());
     assert(numInstsInROB > 0);
 
-    // Get the head ROB instruction's TID.
     int tid = (*head)->threadNumber;
 
     retireHead(tid);
@@ -258,6 +251,7 @@ ROB<Impl>::retireHead()
         tail = instList[tid].end();
     }
 }
+*/
 
 template <class Impl>
 void
@@ -271,18 +265,15 @@ ROB<Impl>::retireHead(unsigned tid)
 
     DynInstPtr head_inst = (*head_it);
 
-    // Make certain this can retire.
     assert(head_inst->readyToCommit());
 
     DPRINTF(ROB, "[tid:%u]: Retiring head instruction, "
             "instruction PC %#x,[sn:%lli]\n", tid, head_inst->readPC(),
             head_inst->seqNum);
 
-    // Keep track of how many instructions are in the ROB.
     --numInstsInROB;
     --threadEntries[tid];
 
-    //Mark DynInstFlags
     head_inst->removeInROB();
     head_inst->setCommitted();
 
@@ -291,12 +282,12 @@ ROB<Impl>::retireHead(unsigned tid)
     //Update "Global" Head of ROB
     updateHead();
 
-    // A special case is needed if the instruction being retired is the
-    // only instruction in the ROB; otherwise the tail iterator will become
-    // invalidated.
+    // @todo: A special case is needed if the instruction being
+    // retired is the only instruction in the ROB; otherwise the tail
+    // iterator will become invalidated.
     cpu->removeFrontInst(head_inst);
 }
-
+/*
 template <class Impl>
 bool
 ROB<Impl>::isHeadReady()
@@ -307,7 +298,7 @@ ROB<Impl>::isHeadReady()
 
     return false;
 }
-
+*/
 template <class Impl>
 bool
 ROB<Impl>::isHeadReady(unsigned tid)
@@ -537,7 +528,7 @@ ROB<Impl>::squash(InstSeqNum squash_num,unsigned tid)
         doSquash(tid);
     }
 }
-
+/*
 template <class Impl>
 typename Impl::DynInstPtr
 ROB<Impl>::readHeadInst()
@@ -549,7 +540,7 @@ ROB<Impl>::readHeadInst()
         return dummyInst;
     }
 }
-
+*/
 template <class Impl>
 typename Impl::DynInstPtr
 ROB<Impl>::readHeadInst(unsigned tid)
@@ -564,7 +555,7 @@ ROB<Impl>::readHeadInst(unsigned tid)
         return dummyInst;
     }
 }
-
+/*
 template <class Impl>
 uint64_t
 ROB<Impl>::readHeadPC()
@@ -608,7 +599,6 @@ ROB<Impl>::readHeadNextPC(unsigned tid)
     return (*head_thread)->readNextPC();
 }
 
-
 template <class Impl>
 InstSeqNum
 ROB<Impl>::readHeadSeqNum()
@@ -637,7 +627,7 @@ ROB<Impl>::readTailInst()
 
     return (*tail);
 }
-
+*/
 template <class Impl>
 typename Impl::DynInstPtr
 ROB<Impl>::readTailInst(unsigned tid)
@@ -650,7 +640,7 @@ ROB<Impl>::readTailInst(unsigned tid)
     return *tail_thread;
 }
 
-
+/*
 template <class Impl>
 uint64_t
 ROB<Impl>::readTailPC()
@@ -698,4 +688,4 @@ ROB<Impl>::readTailSeqNum(unsigned tid)
 
     return (*tail_thread)->seqNum;
 }
-
+*/
diff --git a/cpu/o3/scoreboard.cc b/cpu/o3/scoreboard.cc
index 87b0aee94..b0e433620 100644
--- a/cpu/o3/scoreboard.cc
+++ b/cpu/o3/scoreboard.cc
@@ -99,6 +99,7 @@ Scoreboard::unsetReg(PhysRegIndex ready_reg)
     if (ready_reg == zeroRegIdx ||
         ready_reg == (zeroRegIdx + numPhysicalIntRegs)) {
         // Don't do anything if int or fp zero reg.
+        return;
     }
 
     regScoreBoard[ready_reg] = 0;
diff --git a/cpu/o3/store_set.cc b/cpu/o3/store_set.cc
index a685646f3..0c957c8c7 100644
--- a/cpu/o3/store_set.cc
+++ b/cpu/o3/store_set.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 The Regents of The University of Michigan
+ * Copyright (c) 2004-2006 The Regents of The University of Michigan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -278,11 +278,6 @@ StoreSet::issued(Addr issued_PC, InstSeqNum issued_seq_num, bool is_store)
 void
 StoreSet::squash(InstSeqNum squashed_num, unsigned tid)
 {
-    // Not really sure how to do this well.
-    // Generally this is small enough that it should be okay; short circuit
-    // evaluation should take care of invalid entries.
-    // Maybe keep a list of valid LFST's?  Really ugly either way...
-
     DPRINTF(StoreSet, "StoreSet: Squashing until inum %i\n",
             squashed_num);
 
diff --git a/cpu/o3/thread_state.hh b/cpu/o3/thread_state.hh
index 17719bdeb..2c9788e4b 100644
--- a/cpu/o3/thread_state.hh
+++ b/cpu/o3/thread_state.hh
@@ -1,3 +1,30 @@
+/*
+ * Copyright (c) 2006 The Regents of The University of Michigan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
 
 #ifndef __CPU_O3_THREAD_STATE_HH__
 #define __CPU_O3_THREAD_STATE_HH__
@@ -15,27 +42,17 @@ class EndQuiesceEvent;
 class FunctionProfile;
 class ProfileNode;
 #else
-class Process;
 class FunctionalMemory;
+class Process;
 #endif
 
-// In the new CPU case this may be quite small...It depends on what I define
-// ThreadState to be.  Currently it's only the state that exists within
-// ExecContext basically.  Leaves the interface and manipulation up to the
-// CPU.  Not sure this is useful/flexible...probably can be if I can avoid
-// including state here that parts of the pipeline can't modify directly,
-// or at least don't let them.  The only problem is for state that's needed
-// per thread, per structure.  I.e. rename table, memreqs.
-// On the other hand, it might be nice to not have to pay the extra pointer
-// lookup to get frequently used state such as a memreq (that isn't used much
-// elsewhere)...
-
-// Maybe this ozone thread state should only really have committed state?
-// I need to think about why I'm using this and what it's useful for.  Clearly
-// has benefits for SMT; basically serves same use as CPUExecContext.
-// Makes the ExecContext proxy easier.  Gives organization/central access point
-// to state of a thread that can be accessed normally (i.e. not in-flight
-// stuff within a OoO processor).  Does this need an XC proxy within it?
+/**
+ * Class that has various thread state, such as the status, the
+ * current instruction being processed, whether or not the thread has
+ * a trap pending or is being externally updated, the ExecContext
+ * proxy pointer, etc.  It also handles anything related to a specific
+ * thread's process, such as syscalls and checking valid addresses.
+ */
 template <class Impl>
 struct O3ThreadState : public ThreadState {
     typedef ExecContext::Status Status;
@@ -43,7 +60,7 @@ struct O3ThreadState : public ThreadState {
 
     Status _status;
 
-    // Current instruction?
+    // Current instruction
     TheISA::MachInst inst;
   private:
     FullCPU *cpu;
@@ -80,51 +97,11 @@ struct O3ThreadState : public ThreadState {
     void setStatus(Status new_status) { _status = new_status; }
 
 #if !FULL_SYSTEM
-
-    Fault dummyTranslation(MemReqPtr &req)
-    {
-#if 0
-        assert((req->vaddr >> 48 & 0xffff) == 0);
-#endif
-
-        // put the asid in the upper 16 bits of the paddr
-        req->paddr = req->vaddr & ~((Addr)0xffff << sizeof(Addr) * 8 - 16);
-        req->paddr = req->paddr | (Addr)req->asid << sizeof(Addr) * 8 - 16;
-        return NoFault;
-    }
-    Fault translateInstReq(MemReqPtr &req)
-    {
-        return dummyTranslation(req);
-    }
-    Fault translateDataReadReq(MemReqPtr &req)
-    {
-        return dummyTranslation(req);
-    }
-    Fault translateDataWriteReq(MemReqPtr &req)
-    {
-        return dummyTranslation(req);
-    }
-
     bool validInstAddr(Addr addr)
     { return process->validInstAddr(addr); }
 
     bool validDataAddr(Addr addr)
     { return process->validDataAddr(addr); }
-#else
-    Fault translateInstReq(MemReqPtr &req)
-    {
-        return cpu->itb->translate(req);
-    }
-
-    Fault translateDataReadReq(MemReqPtr &req)
-    {
-        return cpu->dtb->translate(req, false);
-    }
-
-    Fault translateDataWriteReq(MemReqPtr &req)
-    {
-        return cpu->dtb->translate(req, true);
-    }
 #endif
 
     bool misspeculating() { return false; }
-- 
2.30.2