From e3fb9afa79e37cb8c60a48b9ff3976665c2c7675 Mon Sep 17 00:00:00 2001
From: Kevin Lim <ktlim@umich.edu>
Date: Thu, 23 Sep 2004 14:06:03 -0400
Subject: [PATCH] Update to make multiple instruction issue and different
 latencies work. Also change to ref counted DynInst.

SConscript:
    Add branch predictor, BTB, load store queue, and storesets.
arch/isa_parser.py:
    Specify the template parameter for AlphaDynInst
base/traceflags.py:
    Add load store queue, store set, and mem dependence unit to the
    list of trace flags.
cpu/base_dyn_inst.cc:
    Change formatting, add in debug statement.
cpu/base_dyn_inst.hh:
    Change DynInst to be RefCounted, add function to clear the flag that marks
    this instruction as able to commit.  This is likely to be removed in the future.
cpu/beta_cpu/alpha_dyn_inst.cc:
    AlphaDynInst has been changed to be templated, so now this CC file
    is just used to force instantiations of AlphaDynInst.
cpu/beta_cpu/alpha_dyn_inst.hh:
    Changed AlphaDynInst to be templated on Impl.  Removed some unnecessary
    functions.
cpu/beta_cpu/alpha_full_cpu.cc:
    AlphaFullCPU has been changed to be templated, so this CC file is now
    just used to force instantiation of AlphaFullCPU.
cpu/beta_cpu/alpha_full_cpu.hh:
    Change AlphaFullCPU to be templated on Impl.
cpu/beta_cpu/alpha_impl.hh:
    Update it to reflect AlphaDynInst and AlphaFullCPU being templated
    on Impl.  Also removed time buffers from here, as they are really
    a part of the CPU and are thus in the CPU policy now.
cpu/beta_cpu/alpha_params.hh:
    Make AlphaSimpleParams inherit from the BaseFullCPU so that it doesn't
    need to specifically declare any parameters that are already in the
    BaseFullCPU.
cpu/beta_cpu/comm.hh:
    Changed the structure of the time buffer communication structs.  Now
    they include the size of the packet of instructions it is sending.
    Added some parameters to the backwards communication struct, mainly
    for squashing.
cpu/beta_cpu/commit.hh:
    Update typenames to reflect change in location of time buffer structs.
    Update DynInst to DynInstPtr (it is refcounted now).
cpu/beta_cpu/commit_impl.hh:
    Formatting changes mainly.  Also sends back proper information
    on branch mispredicts so that the bpred unit can update itself.
    Updated behavior for non-speculative instructions (stores, any
    other non-spec instructions): once they reach the head of the ROB,
    the ROB signals back to the IQ that it can go ahead and issue the
    non-speculative instruction.  The instruction itself is updated so that
    commit won't try to commit it again until it is done executing.
cpu/beta_cpu/cpu_policy.hh:
    Added branch prediction unit, mem dependence prediction unit, load
    store queue.  Moved time buffer structs from AlphaSimpleImpl to here.
cpu/beta_cpu/decode.hh:
    Changed typedefs to reflect change in location of time buffer structs
    and also the change from DynInst to ref counted DynInstPtr.
cpu/beta_cpu/decode_impl.hh:
    Continues to buffer instructions even while unblocking now.  Changed
    how it loops through groups of instructions so it can properly block
    during the middle of a group of instructions.
cpu/beta_cpu/fetch.hh:
    Changed typedefs to reflect change in location of time buffer structs
    and the change to ref counted DynInsts.  Also added in branch
    prediction unit.
cpu/beta_cpu/fetch_impl.hh:
    Add in branch prediction.  Changed how fetch checks inputs and its
    current state to make for easier logic.
cpu/beta_cpu/free_list.cc:
    Changed int regs and float regs to logically use one flat namespace.
    Future change will be moving them to a single scoreboard to conserve
    space.
cpu/beta_cpu/free_list.hh:
    Mostly debugging statements.  Might be removed for performance in future.
cpu/beta_cpu/full_cpu.cc:
    Added in some debugging statements.  Updated BaseFullCPU to take
    a params object.
cpu/beta_cpu/full_cpu.hh:
    Added params class within BaseCPU that other param classes will be
    able to inherit from.  Updated typedefs to reflect change in location
    of time buffer structs and ref counted DynInst.
cpu/beta_cpu/iew.hh:
    Updated typedefs to reflect change in location of time buffer structs
    and use of ref counted DynInsts.
cpu/beta_cpu/iew_impl.hh:
    Added in load store queue, updated iew to be able to execute non-
    speculative instructions, instead of having them execute in commit.
cpu/beta_cpu/inst_queue.hh:
    Updated change to ref counted DynInsts.  Changed inst queue to hold
    non-speculative instructions as well, which are issued only when
    commit signals backwards that a nonspeculative instruction is at
    the head of the ROB.
cpu/beta_cpu/inst_queue_impl.hh:
    Updated to allow for non-speculative instructions to be in the inst
    queue.  Also added some debug functions.
cpu/beta_cpu/regfile.hh:
    Added debugging statements, changed formatting.
cpu/beta_cpu/rename.hh:
    Updated typedefs, added some functions to clean up code.
cpu/beta_cpu/rename_impl.hh:
    Moved some code into functions to make it easier to read.
cpu/beta_cpu/rename_map.cc:
    Changed int and float reg behavior to use a single flat namespace.  In
    the future, the rename maps can be combined to a single rename map to
    save space.
cpu/beta_cpu/rename_map.hh:
    Added destructor.
cpu/beta_cpu/rob.hh:
    Updated it with change from DynInst to ref counted DynInst.
cpu/beta_cpu/rob_impl.hh:
    Formatting, updated to use ref counted DynInst.
cpu/static_inst.hh:
    Updated forward declaration for AlphaDynInst now that it is templated.

--HG--
extra : convert_revision : 1045f240ee9b6a4bd368e1806aca029ebbdc6dd3
---
 SConscript                             |   7 +
 arch/isa_parser.py                     |   2 +-
 base/traceflags.py                     |   5 +-
 cpu/base_dyn_inst.cc                   |  29 +-
 cpu/base_dyn_inst.hh                   |  13 +-
 cpu/beta_cpu/2bit_local_pred.cc        | 110 +++
 cpu/beta_cpu/2bit_local_pred.hh        |  99 +++
 cpu/beta_cpu/alpha_dyn_inst.cc         | 105 +--
 cpu/beta_cpu/alpha_dyn_inst.hh         |  67 +-
 cpu/beta_cpu/alpha_dyn_inst_impl.hh    | 109 +++
 cpu/beta_cpu/alpha_full_cpu.cc         | 918 +------------------------
 cpu/beta_cpu/alpha_full_cpu.hh         |  13 +-
 cpu/beta_cpu/alpha_full_cpu_builder.cc | 306 +++++++++
 cpu/beta_cpu/alpha_full_cpu_impl.hh    | 690 +++++++++++++++++++
 cpu/beta_cpu/alpha_impl.hh             |  48 +-
 cpu/beta_cpu/alpha_params.hh           |  49 +-
 cpu/beta_cpu/bpred_unit.cc             |   5 +
 cpu/beta_cpu/bpred_unit.hh             |  51 ++
 cpu/beta_cpu/bpred_unit_impl.hh        |  13 +
 cpu/beta_cpu/btb.cc                    |  85 +++
 cpu/beta_cpu/btb.hh                    |  52 ++
 cpu/beta_cpu/comm.hh                   |  65 +-
 cpu/beta_cpu/commit.hh                 |  28 +-
 cpu/beta_cpu/commit_impl.hh            | 120 ++--
 cpu/beta_cpu/cpu_policy.hh             |  38 +-
 cpu/beta_cpu/decode.hh                 |  24 +-
 cpu/beta_cpu/decode_impl.hh            |  79 ++-
 cpu/beta_cpu/fetch.hh                  |  29 +-
 cpu/beta_cpu/fetch_impl.hh             | 259 ++++---
 cpu/beta_cpu/free_list.cc              |  23 +-
 cpu/beta_cpu/free_list.hh              |  35 +-
 cpu/beta_cpu/full_cpu.cc               |  85 +--
 cpu/beta_cpu/full_cpu.hh               |  56 +-
 cpu/beta_cpu/iew.hh                    |  29 +-
 cpu/beta_cpu/iew_impl.hh               | 156 +++--
 cpu/beta_cpu/inst_queue.hh             | 133 ++--
 cpu/beta_cpu/inst_queue_impl.hh        | 509 ++++++++++----
 cpu/beta_cpu/mem_dep_unit.cc           |   9 +
 cpu/beta_cpu/mem_dep_unit.hh           |  70 ++
 cpu/beta_cpu/mem_dep_unit_impl.hh      | 166 +++++
 cpu/beta_cpu/regfile.hh                |  42 +-
 cpu/beta_cpu/rename.hh                 |  45 +-
 cpu/beta_cpu/rename_impl.hh            | 397 ++++++-----
 cpu/beta_cpu/rename_map.cc             |  76 +-
 cpu/beta_cpu/rename_map.hh             |  15 +-
 cpu/beta_cpu/rob.hh                    |  43 +-
 cpu/beta_cpu/rob_impl.hh               |  78 ++-
 cpu/beta_cpu/store_set.cc              | 192 ++++++
 cpu/beta_cpu/store_set.hh              |  58 ++
 cpu/static_inst.hh                     |   5 +-
 50 files changed, 3715 insertions(+), 1925 deletions(-)
 create mode 100644 cpu/beta_cpu/2bit_local_pred.cc
 create mode 100644 cpu/beta_cpu/2bit_local_pred.hh
 create mode 100644 cpu/beta_cpu/alpha_dyn_inst_impl.hh
 create mode 100644 cpu/beta_cpu/alpha_full_cpu_builder.cc
 create mode 100644 cpu/beta_cpu/alpha_full_cpu_impl.hh
 create mode 100644 cpu/beta_cpu/bpred_unit.cc
 create mode 100644 cpu/beta_cpu/bpred_unit.hh
 create mode 100644 cpu/beta_cpu/bpred_unit_impl.hh
 create mode 100644 cpu/beta_cpu/btb.cc
 create mode 100644 cpu/beta_cpu/btb.hh
 create mode 100644 cpu/beta_cpu/mem_dep_unit.cc
 create mode 100644 cpu/beta_cpu/mem_dep_unit.hh
 create mode 100644 cpu/beta_cpu/mem_dep_unit_impl.hh
 create mode 100644 cpu/beta_cpu/store_set.cc
 create mode 100644 cpu/beta_cpu/store_set.hh

diff --git a/SConscript b/SConscript
index 07cdcfdee..fb2b40325 100644
--- a/SConscript
+++ b/SConscript
@@ -91,8 +91,12 @@ base_sources = Split('''
 	cpu/exetrace.cc
 	cpu/pc_event.cc
 	cpu/static_inst.cc
+        cpu/beta_cpu/2bit_local_pred.cc
         cpu/beta_cpu/alpha_dyn_inst.cc
         cpu/beta_cpu/alpha_full_cpu.cc
+        cpu/beta_cpu/alpha_full_cpu_builder.cc
+        cpu/beta_cpu/bpred_unit.cc
+        cpu/beta_cpu/btb.cc
         cpu/beta_cpu/commit.cc
         cpu/beta_cpu/decode.cc
         cpu/beta_cpu/fetch.cc
@@ -100,9 +104,12 @@ base_sources = Split('''
         cpu/beta_cpu/full_cpu.cc
         cpu/beta_cpu/iew.cc
         cpu/beta_cpu/inst_queue.cc
+        cpu/beta_cpu/ldstq.cc
+        cpu/beta_cpu/mem_dep_unit.cc
         cpu/beta_cpu/rename.cc
         cpu/beta_cpu/rename_map.cc
         cpu/beta_cpu/rob.cc
+        cpu/beta_cpu/store_set.cc
 	cpu/fast_cpu/fast_cpu.cc
 	cpu/full_cpu/bpred.cc
 	cpu/full_cpu/commit.cc
diff --git a/arch/isa_parser.py b/arch/isa_parser.py
index f7278628b..f86e6193d 100755
--- a/arch/isa_parser.py
+++ b/arch/isa_parser.py
@@ -638,7 +638,7 @@ CpuModel('FullCPU', 'full_cpu_exec.cc',
          { 'CPU_exec_context': 'DynInst' })
 CpuModel('AlphaFullCPU', 'alpha_full_cpu_exec.cc',
          '#include "cpu/beta_cpu/alpha_dyn_inst.hh"',
-         { 'CPU_exec_context': 'AlphaDynInst' })
+         { 'CPU_exec_context': 'AlphaDynInst<AlphaSimpleImpl>' })
 
 # Expand template with CPU-specific references into a dictionary with
 # an entry for each CPU model name.  The entry key is the model name
diff --git a/base/traceflags.py b/base/traceflags.py
index 8b4208660..a1fb45177 100644
--- a/base/traceflags.py
+++ b/base/traceflags.py
@@ -132,6 +132,9 @@ baseFlags = [
     'ROB',
     'FreeList',
     'RenameMap',
+    'LDSTQ',
+    'StoreSet',
+    'MemDepUnit',
     'DynInst',
     'FullCPU'
     ]
@@ -150,7 +153,7 @@ compoundFlagMap = {
     'DiskImageAll' : [ 'DiskImage', 'DiskImageRead', 'DiskImageWrite' ],
     'EthernetAll' : [ 'Ethernet', 'EthernetPIO', 'EthernetDMA', 'EthernetData' , 'EthernetDesc', 'EthernetIntr', 'EthernetSM', 'EthernetCksum' ],
     'IdeAll' : [ 'IdeCtrl', 'IdeDisk' ],
-    'FullCPUAll' : [ 'Fetch', 'Decode', 'Rename', 'IEW', 'Commit', 'IQ', 'ROB', 'FreeList', 'RenameMap', 'DynInst', 'FullCPU']
+    'FullCPUAll' : [ 'Fetch', 'Decode', 'Rename', 'IEW', 'Commit', 'IQ', 'ROB', 'FreeList', 'RenameMap', 'LDSTQ', 'StoreSet', 'MemDepUnit', 'DynInst', 'FullCPU']
 }
 
 #############################################################
diff --git a/cpu/base_dyn_inst.cc b/cpu/base_dyn_inst.cc
index bd681e1dc..c527eb08b 100644
--- a/cpu/base_dyn_inst.cc
+++ b/cpu/base_dyn_inst.cc
@@ -34,6 +34,7 @@
 #include <sstream>
 
 #include "base/cprintf.hh"
+#include "base/trace.hh"
 
 #include "arch/alpha/faults.hh"
 #include "cpu/exetrace.hh"
@@ -67,12 +68,14 @@ my_hash_t thishash;
 
 //int break_inst = -1;
 
-template<class Impl>
+template <class Impl>
 BaseDynInst<Impl>::BaseDynInst(MachInst machInst, Addr inst_PC,
                                Addr pred_PC, InstSeqNum seq_num,
                                FullCPU *cpu)
     : staticInst(machInst), traceData(NULL), cpu(cpu), xc(cpu->xcBase())
 {
+    DPRINTF(FullCPU, "DynInst: Creating new DynInst.\n");
+
     effAddr = MemReq::inval_addr;
     physEffAddr = MemReq::inval_addr;
 
@@ -123,11 +126,13 @@ BaseDynInst<Impl>::BaseDynInst(MachInst machInst, Addr inst_PC,
 
     ++instcount;
 
+//    assert(instcount < 50);
+
     DPRINTF(FullCPU, "DynInst: Instruction created.  Instcount=%i\n",
             instcount);
 }
 
-template<class Impl>
+template <class Impl>
 BaseDynInst<Impl>::BaseDynInst(StaticInstPtr<ISA> &_staticInst)
     : staticInst(_staticInst), traceData(NULL)
 {
@@ -155,7 +160,7 @@ BaseDynInst<Impl>::BaseDynInst(StaticInstPtr<ISA> &_staticInst)
     }
 }
 
-template<class Impl>
+template <class Impl>
 BaseDynInst<Impl>::~BaseDynInst()
 {
 /*
@@ -169,21 +174,21 @@ BaseDynInst<Impl>::~BaseDynInst()
             instcount);
 }
 
-template<class Impl>
+template <class Impl>
 FunctionalMemory *
 BaseDynInst<Impl>::getMemory(void)
 {
     return xc->mem;
 }
 /*
-template<class Impl>
+template <class Impl>
 IntReg *
 BaseDynInst<Impl>::getIntegerRegs(void)
 {
     return (spec_mode ? xc->specIntRegFile : xc->regs.intRegFile);
 }
 */
-template<class Impl>
+template <class Impl>
 void
 BaseDynInst<Impl>::prefetch(Addr addr, unsigned flags)
 {
@@ -229,7 +234,7 @@ BaseDynInst<Impl>::prefetch(Addr addr, unsigned flags)
     }
 }
 
-template<class Impl>
+template <class Impl>
 void
 BaseDynInst<Impl>::writeHint(Addr addr, int size, unsigned flags)
 {
@@ -261,7 +266,7 @@ BaseDynInst<Impl>::writeHint(Addr addr, int size, unsigned flags)
 /**
  * @todo Need to find a way to get the cache block size here.
  */
-template<class Impl>
+template <class Impl>
 Fault
 BaseDynInst<Impl>::copySrcTranslate(Addr src)
 {
@@ -284,7 +289,7 @@ BaseDynInst<Impl>::copySrcTranslate(Addr src)
 /**
  * @todo Need to find a way to get the cache block size here.
  */
-template<class Impl>
+template <class Impl>
 Fault
 BaseDynInst<Impl>::copy(Addr dest)
 {
@@ -308,7 +313,7 @@ BaseDynInst<Impl>::copy(Addr dest)
     return fault;
 }
 
-template<class Impl>
+template <class Impl>
 void
 BaseDynInst<Impl>::dump()
 {
@@ -317,7 +322,7 @@ BaseDynInst<Impl>::dump()
     cprintf("'\n");
 }
 
-template<class Impl>
+template <class Impl>
 void
 BaseDynInst<Impl>::dump(std::string &outstring)
 {
@@ -330,7 +335,7 @@ BaseDynInst<Impl>::dump(std::string &outstring)
 
 
 #if 0
-template<class Impl>
+template <class Impl>
 Fault
 BaseDynInst<Impl>::mem_access(mem_cmd cmd, Addr addr, void *p, int nbytes)
 {
diff --git a/cpu/base_dyn_inst.hh b/cpu/base_dyn_inst.hh
index 7651b517e..fe30b5195 100644
--- a/cpu/base_dyn_inst.hh
+++ b/cpu/base_dyn_inst.hh
@@ -53,12 +53,12 @@ namespace Trace {
     class InstRecord;
 };
 
-class BaseInst
-{
-};
+// Forward declaration.
+template <class blah>
+class StaticInstPtr;
 
 template <class Impl>
-class BaseDynInst : public FastAlloc
+class BaseDynInst : public FastAlloc, public RefCounted
 {
   public:
     // Typedef for the CPU.
@@ -74,7 +74,7 @@ class BaseDynInst : public FastAlloc
     /// Logical register index type.
     typedef typename ISA::RegIndex RegIndex;
     /// Integer register index type.
-    typedef typename ISA::IntReg IntReg;
+    typedef typename ISA::IntReg   IntReg;
 
     enum {
         MaxInstSrcRegs = ISA::MaxInstSrcRegs,	//< Max source regs
@@ -430,6 +430,9 @@ class BaseDynInst : public FastAlloc
     /** Sets this instruction as ready to commit. */
     void setCanCommit() { canCommit = true; }
 
+    /** Clears this instruction as being ready to commit. */
+    void clearCanCommit() { canCommit = false; }
+
     /** Returns whether or not this instruction is ready to commit. */
     bool readyToCommit() const { return canCommit; }
 
diff --git a/cpu/beta_cpu/2bit_local_pred.cc b/cpu/beta_cpu/2bit_local_pred.cc
new file mode 100644
index 000000000..88c39a9b0
--- /dev/null
+++ b/cpu/beta_cpu/2bit_local_pred.cc
@@ -0,0 +1,110 @@
+#include "base/trace.hh"
+#include "cpu/beta_cpu/2bit_local_pred.hh"
+
+DefaultBP::SatCounter::SatCounter(unsigned bits)
+    : maxVal((1 << bits) - 1), counter(0)
+{
+}
+
+DefaultBP::SatCounter::SatCounter(unsigned bits, unsigned initial_val)
+    : maxVal((1 << bits) - 1), counter(initial_val)
+{
+    // Check to make sure initial value doesn't exceed the max counter value.
+    if (initial_val > maxVal) {
+        panic("BP: Initial counter value exceeds max size.");
+    }
+}
+
+void
+DefaultBP::SatCounter::increment()
+{
+    if(counter < maxVal) {
+        ++counter;
+    }
+}
+
+void
+DefaultBP::SatCounter::decrement()
+{
+    if(counter > 0) {
+        --counter;
+    }
+}
+
+DefaultBP::DefaultBP(unsigned _localPredictorSize,
+                     unsigned _localCtrBits,
+                     unsigned _instShiftAmt)
+    : localPredictorSize(_localPredictorSize),
+      localCtrBits(_localCtrBits),
+      instShiftAmt(_instShiftAmt)
+{
+    // Should do checks here to make sure sizes are correct (powers of 2).
+
+    // Setup the index mask.
+    indexMask = localPredictorSize - 1;
+
+    DPRINTF(Fetch, "Branch predictor: index mask: %#x\n", indexMask);
+
+    // Setup the array of counters for the local predictor.
+    localCtrs = new SatCounter[localPredictorSize](localCtrBits);
+
+    DPRINTF(Fetch, "Branch predictor: local predictor size: %i\n",
+            localPredictorSize);
+
+    DPRINTF(Fetch, "Branch predictor: local counter bits: %i\n", localCtrBits);
+
+    DPRINTF(Fetch, "Branch predictor: instruction shift amount: %i\n",
+            instShiftAmt);
+}
+
+inline
+bool
+DefaultBP::getPrediction(uint8_t &count)
+{
+    // Get the MSB of the count
+    return (count >> (localCtrBits - 1));
+}
+
+inline
+unsigned
+DefaultBP::getLocalIndex(Addr &branch_addr)
+{
+    return (branch_addr >> instShiftAmt) & indexMask;
+}
+
+bool
+DefaultBP::lookup(Addr &branch_addr)
+{
+    uint8_t local_prediction;
+    unsigned local_predictor_idx = getLocalIndex(branch_addr);
+
+    DPRINTF(Fetch, "Branch predictor: Looking up index %#x\n",
+            local_predictor_idx);
+
+    local_prediction = localCtrs[local_predictor_idx].read();
+
+    DPRINTF(Fetch, "Branch predictor: prediction is %i.\n",
+            (int)local_prediction);
+
+    return getPrediction(local_prediction);
+}
+
+void
+DefaultBP::update(Addr &branch_addr, bool taken)
+{
+    unsigned local_predictor_idx;
+
+    // Update the local predictor.
+    local_predictor_idx = getLocalIndex(branch_addr);
+
+    DPRINTF(Fetch, "Branch predictor: Looking up index %#x\n",
+            local_predictor_idx);
+
+    if (taken) {
+        DPRINTF(Fetch, "Branch predictor: Branch updated as taken.\n");
+        localCtrs[local_predictor_idx].increment();
+    } else {
+        DPRINTF(Fetch, "Branch predictor: Branch updated as not taken.\n");
+        localCtrs[local_predictor_idx].decrement();
+    }
+}
diff --git a/cpu/beta_cpu/2bit_local_pred.hh b/cpu/beta_cpu/2bit_local_pred.hh
new file mode 100644
index 000000000..32a7972d0
--- /dev/null
+++ b/cpu/beta_cpu/2bit_local_pred.hh
@@ -0,0 +1,99 @@
+#ifndef __2BIT_LOCAL_PRED_HH__
+#define __2BIT_LOCAL_PRED_HH__
+
+// For Addr type.
+#include "arch/alpha/isa_traits.hh"
+
+class DefaultBP
+{
+  public:
+    /**
+     * Default branch predictor constructor.
+     */
+    DefaultBP(unsigned localPredictorSize, unsigned localCtrBits,
+              unsigned instShiftAmt);
+
+    /**
+     * Looks up the given address in the branch predictor and returns
+     * a true/false value as to whether it is taken.
+     * @param branch_addr The address of the branch to look up.
+     * @return Whether or not the branch is taken.
+     */
+    bool lookup(Addr &branch_addr);
+
+    /**
+     * Updates the branch predictor with the actual result of a branch.
+     * @param branch_addr The address of the branch to update.
+     * @param taken Whether or not the branch was taken.
+     */
+    void update(Addr &branch_addr, bool taken);
+
+  private:
+
+    inline bool getPrediction(uint8_t &count);
+
+    inline unsigned getLocalIndex(Addr &PC);
+
+    /**
+     * Private counter class for the internal saturating counters.
+     * Implements an n bit saturating counter and provides methods to
+     * increment, decrement, and read it.
+     * @todo Consider making this something that more closely mimics a
+     * built in class so you can use ++ or --.
+     */
+    class SatCounter
+    {
+      public:
+        /**
+         * Constructor for the counter.
+         * @param bits How many bits the counter will have.
+         */
+        SatCounter(unsigned bits);
+
+        /**
+         * Constructor for the counter.
+         * @param bits How many bits the counter will have.
+         * @param initial_val Starting value for each counter.
+         */
+        SatCounter(unsigned bits, unsigned initial_val);
+
+        /**
+         * Increments the counter's current value.
+         */
+        void increment();
+
+        /**
+         * Decrements the counter's current value.
+         */
+        void decrement();
+
+        /**
+         * Read the counter's value.
+         */
+        uint8_t read()
+        {
+            return counter;
+        }
+
+      private:
+        uint8_t maxVal;
+        uint8_t counter;
+    };
+
+    /** Array of counters that make up the local predictor. */
+    SatCounter *localCtrs;
+
+    /** Size of the local predictor. */
+    unsigned localPredictorSize;
+
+    /** Number of bits of the local predictor's counters. */
+    unsigned localCtrBits;
+
+    /** Number of bits to shift the PC when calculating index. */
+    unsigned instShiftAmt;
+
+    /** Mask to get index bits. */
+    unsigned indexMask;
+};
+
+#endif // __2BIT_LOCAL_PRED_HH__
diff --git a/cpu/beta_cpu/alpha_dyn_inst.cc b/cpu/beta_cpu/alpha_dyn_inst.cc
index a79d3082c..1bfcb8420 100644
--- a/cpu/beta_cpu/alpha_dyn_inst.cc
+++ b/cpu/beta_cpu/alpha_dyn_inst.cc
@@ -1,102 +1,7 @@
-#ifndef __ALPHA_DYN_INST_CC__
-#define __ALPHA_DYN_INST_CC__
 
-#include "cpu/beta_cpu/alpha_dyn_inst.hh"
+#include "cpu/beta_cpu/alpha_dyn_inst_impl.hh"
+#include "cpu/beta_cpu/alpha_impl.hh"
 
-// Force instantiation of BaseDynInst
-template BaseDynInst<AlphaSimpleImpl>;
-
-AlphaDynInst::AlphaDynInst(MachInst inst, Addr PC, Addr Pred_PC,
-                           InstSeqNum seq_num, FullCPU *cpu)
-    : BaseDynInst<AlphaSimpleImpl>(inst, PC, Pred_PC, seq_num, cpu)
-{
-    // Initialize these to illegal values.
-    robIdx = -1;
-    iqIdx = -1;
-}
-
-AlphaDynInst::AlphaDynInst(StaticInstPtr<AlphaISA> &_staticInst)
-    : BaseDynInst<AlphaSimpleImpl>(_staticInst)
-{
-}
-
-uint64_t
-AlphaDynInst::readUniq()
-{
-    return cpu->readUniq();
-}
-
-void
-AlphaDynInst::setUniq(uint64_t val)
-{
-    cpu->setUniq(val);
-}
-
-uint64_t
-AlphaDynInst::readFpcr()
-{
-    return cpu->readFpcr();
-}
-
-void
-AlphaDynInst::setFpcr(uint64_t val)
-{
-    cpu->setFpcr(val);
-}
-
-#ifdef FULL_SYSTEM
-uint64_t
-AlphaDynInst::readIpr(int idx, Fault &fault)
-{
-    return cpu->readIpr(idx, fault);
-}
-Fault
-AlphaDynInst::setIpr(int idx, uint64_t val)
-{
-    return cpu->setIpr(idx, val);
-}
-
-Fault
-AlphaDynInst::hwrei()
-{
-    return cpu->hwrei();
-}
-
-int
-AlphaDynInst::readIntrFlag()
-{
-return cpu->readIntrFlag();
-}
-
-void
-AlphaDynInst::setIntrFlag(int val)
-{
-    cpu->setIntrFlag(val);
-}
-
-bool
-AlphaDynInst::inPalMode()
-{
-    return cpu->inPalMode();
-}
-
-void
-AlphaDynInst::trap(Fault fault)
-{
-    cpu->trap(fault);
-}
-
-bool
-AlphaDynInst::simPalCheck(int palFunc)
-{
-    return cpu->simPalCheck(palFunc);
-}
-#else
-void
-AlphaDynInst::syscall()
-{
-    cpu->syscall();
-}
-#endif
-
-#endif // __ALPHA_DYN_INST_CC__
+// Force instantiation of AlphaDynInst for all the implementations that
+// are needed.
+template class AlphaDynInst<AlphaSimpleImpl>;
diff --git a/cpu/beta_cpu/alpha_dyn_inst.hh b/cpu/beta_cpu/alpha_dyn_inst.hh
index 69d145355..4e1cebd11 100644
--- a/cpu/beta_cpu/alpha_dyn_inst.hh
+++ b/cpu/beta_cpu/alpha_dyn_inst.hh
@@ -8,10 +8,37 @@
 #include "cpu/beta_cpu/alpha_impl.hh"
 #include "cpu/inst_seq.hh"
 
-using namespace std;
-
-class AlphaDynInst : public BaseDynInst<AlphaSimpleImpl>
+/**
+ * Mostly implementation specific AlphaDynInst.  It is templated in case there
+ * are other implementations that are similar enough to be able to use this
+ * class without changes.  This is mainly useful if there are multiple similar
+ * CPU implementations of the same ISA.
+ */
+
+template <class Impl>
+class AlphaDynInst : public BaseDynInst<Impl>
 {
+  public:
+    // Typedef for the CPU.
+    typedef typename Impl::FullCPU FullCPU;
+
+    //Typedef to get the ISA.
+    typedef typename Impl::ISA ISA;
+
+    /// Binary machine instruction type.
+    typedef typename ISA::MachInst MachInst;
+    /// Memory address type.
+    typedef typename ISA::Addr	   Addr;
+    /// Logical register index type.
+    typedef typename ISA::RegIndex RegIndex;
+    /// Integer register index type.
+    typedef typename ISA::IntReg   IntReg;
+
+    enum {
+        MaxInstSrcRegs = ISA::MaxInstSrcRegs,	//< Max source regs
+        MaxInstDestRegs = ISA::MaxInstDestRegs,	//< Max dest regs
+    };
+
   public:
     /** BaseDynInst constructor given a binary instruction. */
     AlphaDynInst(MachInst inst, Addr PC, Addr Pred_PC, InstSeqNum seq_num,
@@ -27,40 +54,6 @@ class AlphaDynInst : public BaseDynInst<AlphaSimpleImpl>
         return fault;
     }
 
-    /** Location of this instruction within the ROB.  Might be somewhat
-     *  implementation specific.
-     *  Might not want this data in the inst as it may be deleted prior to
-     *  execution of the stage that needs it.
-     */
-    int robIdx;
-
-    int getROBEntry()
-    {
-        return robIdx;
-    }
-
-    void setROBEntry(int rob_idx)
-    {
-        robIdx = rob_idx;
-    }
-
-    /** Location of this instruction within the IQ.  Might be somewhat
-     *  implementation specific.
-     *  Might not want this data in the inst as it may be deleted prior to
-     *  execution of the stage that needs it.
-     */
-    int iqIdx;
-
-    int getIQEntry()
-    {
-        return iqIdx;
-    }
-
-    void setIQEntry(int iq_idx)
-    {
-        iqIdx = iq_idx;
-    }
-
     uint64_t readUniq();
     void setUniq(uint64_t val);
 
diff --git a/cpu/beta_cpu/alpha_dyn_inst_impl.hh b/cpu/beta_cpu/alpha_dyn_inst_impl.hh
new file mode 100644
index 000000000..8311067db
--- /dev/null
+++ b/cpu/beta_cpu/alpha_dyn_inst_impl.hh
@@ -0,0 +1,109 @@
+
+#include "cpu/beta_cpu/alpha_dyn_inst.hh"
+
+template <class Impl>
+AlphaDynInst<Impl>::AlphaDynInst(MachInst inst, Addr PC, Addr Pred_PC,
+                                 InstSeqNum seq_num, FullCPU *cpu)
+    : BaseDynInst<Impl>(inst, PC, Pred_PC, seq_num, cpu) // our real base
+{
+}
+
+template <class Impl>
+AlphaDynInst<Impl>::AlphaDynInst(StaticInstPtr<AlphaISA> &_staticInst)
+    : BaseDynInst<Impl>(_staticInst) // initialize the base we derive from
+{
+}
+
+template <class Impl>
+uint64_t
+AlphaDynInst<Impl>::readUniq()
+{
+    return cpu->readUniq();
+}
+
+template <class Impl>
+void
+AlphaDynInst<Impl>::setUniq(uint64_t val)
+{
+    cpu->setUniq(val);
+}
+
+template <class Impl>
+uint64_t
+AlphaDynInst<Impl>::readFpcr()
+{
+    return cpu->readFpcr();
+}
+
+template <class Impl>
+void
+AlphaDynInst<Impl>::setFpcr(uint64_t val)
+{
+    cpu->setFpcr(val);
+}
+
+#ifdef FULL_SYSTEM
+template <class Impl>
+uint64_t
+AlphaDynInst<Impl>::readIpr(int idx, Fault &fault)
+{
+    return cpu->readIpr(idx, fault);
+}
+
+template <class Impl>
+Fault
+AlphaDynInst<Impl>::setIpr(int idx, uint64_t val)
+{
+    return cpu->setIpr(idx, val);
+}
+
+template <class Impl>
+Fault
+AlphaDynInst<Impl>::hwrei()
+{
+    return cpu->hwrei();
+}
+
+template <class Impl>
+int
+AlphaDynInst<Impl>::readIntrFlag()
+{
+    return cpu->readIntrFlag(); // forwarded to the CPU object
+}
+
+template <class Impl>
+void
+AlphaDynInst<Impl>::setIntrFlag(int val)
+{
+    cpu->setIntrFlag(val);
+}
+
+template <class Impl>
+bool
+AlphaDynInst<Impl>::inPalMode()
+{
+    return cpu->inPalMode();
+}
+
+template <class Impl>
+void
+AlphaDynInst<Impl>::trap(Fault fault)
+{
+    cpu->trap(fault);
+}
+
+template <class Impl>
+bool
+AlphaDynInst<Impl>::simPalCheck(int palFunc)
+{
+    return cpu->simPalCheck(palFunc);
+}
+#else
+template <class Impl>
+void
+AlphaDynInst<Impl>::syscall()
+{
+    cpu->syscall();
+}
+#endif
+
diff --git a/cpu/beta_cpu/alpha_full_cpu.cc b/cpu/beta_cpu/alpha_full_cpu.cc
index 880418146..80c4bdec8 100644
--- a/cpu/beta_cpu/alpha_full_cpu.cc
+++ b/cpu/beta_cpu/alpha_full_cpu.cc
@@ -1,911 +1,9 @@
 
-#include "base/cprintf.hh"
-#include "base/statistics.hh"
-#include "base/timebuf.hh"
-#include "cpu/full_cpu/dd_queue.hh"
-#include "cpu/full_cpu/full_cpu.hh"
-#include "cpu/full_cpu/rob_station.hh"
-#include "mem/cache/cache.hh" // for dynamic cast
-#include "mem/mem_interface.hh"
-#include "sim/builder.hh"
-#include "sim/sim_events.hh"
-#include "sim/stats.hh"
-
-#include "cpu/beta_cpu/alpha_full_cpu.hh"
-#include "cpu/beta_cpu/alpha_params.hh"
-#include "cpu/beta_cpu/comm.hh"
-
-AlphaFullCPU::AlphaFullCPU(Params &params)
-    : FullBetaCPU<AlphaSimpleImpl>(params)
-{
-
-    fetch.setCPU(this);
-    decode.setCPU(this);
-    rename.setCPU(this);
-    iew.setCPU(this);
-    commit.setCPU(this);
-
-    rob.setCPU(this);
-}
-
-#ifndef FULL_SYSTEM
-
-void
-AlphaFullCPU::syscall()
-{
-    DPRINTF(FullCPU, "AlphaFullCPU: Syscall() called.\n\n");
-
-    squashStages();
-
-    // Copy over all important state to xc once all the unrolling is done.
-    copyToXC();
-
-    process->syscall(xc);
-
-    // Copy over all important state back to normal.
-    copyFromXC();
-}
-
-// This is not a pretty function, and should only be used if it is necessary
-// to fake having everything squash all at once (ie for non-full system
-// syscalls).
-void
-AlphaFullCPU::squashStages()
-{
-    InstSeqNum rob_head = rob.readHeadSeqNum();
-
-    // Now hack the time buffer to put this sequence number in the places
-    // where the stages might read it.
-    for (int i = 0; i < 10; ++i)
-    {
-        timeBuffer.access(-i)->commitInfo.doneSeqNum = rob_head;
-    }
-
-    fetch.squash(rob.readHeadNextPC());
-    fetchQueue.advance();
-
-    decode.squash();
-    decodeQueue.advance();
-
-    rename.squash();
-    renameQueue.advance();
-    renameQueue.advance();
-
-    iew.squash();
-    iewQueue.advance();
-    iewQueue.advance();
-
-    rob.squash(rob_head);
-    commit.setSquashing();
-}
-
-#endif // FULL_SYSTEM
-
-void
-AlphaFullCPU::copyToXC()
-{
-    PhysRegIndex renamed_reg;
-
-    // First loop through the integer registers.
-    for (int i = 0; i < AlphaISA::NumIntRegs; ++i)
-    {
-        renamed_reg = renameMap.lookup(i);
-        xc->regs.intRegFile[i] = regFile.intRegFile[renamed_reg];
-        DPRINTF(FullCPU, "FullCPU: Copying register %i, has data %lli.\n",
-                renamed_reg, regFile.intRegFile[renamed_reg]);
-    }
-
-    // Then loop through the floating point registers.
-    for (int i = 0; i < AlphaISA::NumFloatRegs; ++i)
-    {
-        renamed_reg = renameMap.lookup(i + AlphaISA::FP_Base_DepTag);
-        xc->regs.floatRegFile.d[i] = regFile.floatRegFile[renamed_reg].d;
-        xc->regs.floatRegFile.q[i] = regFile.floatRegFile[renamed_reg].q;
-    }
-
-    xc->regs.miscRegs.fpcr = regFile.miscRegs.fpcr;
-    xc->regs.miscRegs.uniq = regFile.miscRegs.uniq;
-    xc->regs.miscRegs.lock_flag = regFile.miscRegs.lock_flag;
-    xc->regs.miscRegs.lock_addr = regFile.miscRegs.lock_addr;
-
-    xc->regs.pc = rob.readHeadPC();
-    xc->regs.npc = xc->regs.pc+4;
-
-    xc->func_exe_inst = funcExeInst;
-}
-
-// This function will probably mess things up unless the ROB is empty and
-// there are no instructions in the pipeline.
-void
-AlphaFullCPU::copyFromXC()
-{
-    PhysRegIndex renamed_reg;
-
-    // First loop through the integer registers.
-    for (int i = 0; i < AlphaISA::NumIntRegs; ++i)
-    {
-        renamed_reg = renameMap.lookup(i);
-
-        DPRINTF(FullCPU, "FullCPU: Copying over register %i, had data %lli, "
-                "now has data %lli.\n",
-                renamed_reg, regFile.intRegFile[renamed_reg],
-                xc->regs.intRegFile[i]);
-
-        regFile.intRegFile[renamed_reg] = xc->regs.intRegFile[i];
-    }
-
-    // Then loop through the floating point registers.
-    for (int i = 0; i < AlphaISA::NumFloatRegs; ++i)
-    {
-        renamed_reg = renameMap.lookup(i + AlphaISA::FP_Base_DepTag);
-        regFile.floatRegFile[renamed_reg].d = xc->regs.floatRegFile.d[i];
-        regFile.floatRegFile[renamed_reg].q = xc->regs.floatRegFile.q[i] ;
-    }
-
-    // Then loop through the misc registers.
-    regFile.miscRegs.fpcr = xc->regs.miscRegs.fpcr;
-    regFile.miscRegs.uniq = xc->regs.miscRegs.uniq;
-    regFile.miscRegs.lock_flag = xc->regs.miscRegs.lock_flag;
-    regFile.miscRegs.lock_addr = xc->regs.miscRegs.lock_addr;
-
-    // Then finally set the PC and the next PC.
-//    regFile.pc = xc->regs.pc;
-//    regFile.npc = xc->regs.npc;
-
-    funcExeInst = xc->func_exe_inst;
-}
-
-#ifdef FULL_SYSTEM
-
-uint64_t *
-AlphaFullCPU::getIpr()
-{
-    return regs.ipr;
-}
-
-uint64_t
-AlphaFullCPU::readIpr(int idx, Fault &fault)
-{
-    uint64_t *ipr = getIpr();
-    uint64_t retval = 0;	// return value, default 0
-
-    switch (idx) {
-      case AlphaISA::IPR_PALtemp0:
-      case AlphaISA::IPR_PALtemp1:
-      case AlphaISA::IPR_PALtemp2:
-      case AlphaISA::IPR_PALtemp3:
-      case AlphaISA::IPR_PALtemp4:
-      case AlphaISA::IPR_PALtemp5:
-      case AlphaISA::IPR_PALtemp6:
-      case AlphaISA::IPR_PALtemp7:
-      case AlphaISA::IPR_PALtemp8:
-      case AlphaISA::IPR_PALtemp9:
-      case AlphaISA::IPR_PALtemp10:
-      case AlphaISA::IPR_PALtemp11:
-      case AlphaISA::IPR_PALtemp12:
-      case AlphaISA::IPR_PALtemp13:
-      case AlphaISA::IPR_PALtemp14:
-      case AlphaISA::IPR_PALtemp15:
-      case AlphaISA::IPR_PALtemp16:
-      case AlphaISA::IPR_PALtemp17:
-      case AlphaISA::IPR_PALtemp18:
-      case AlphaISA::IPR_PALtemp19:
-      case AlphaISA::IPR_PALtemp20:
-      case AlphaISA::IPR_PALtemp21:
-      case AlphaISA::IPR_PALtemp22:
-      case AlphaISA::IPR_PALtemp23:
-      case AlphaISA::IPR_PAL_BASE:
-
-      case AlphaISA::IPR_IVPTBR:
-      case AlphaISA::IPR_DC_MODE:
-      case AlphaISA::IPR_MAF_MODE:
-      case AlphaISA::IPR_ISR:
-      case AlphaISA::IPR_EXC_ADDR:
-      case AlphaISA::IPR_IC_PERR_STAT:
-      case AlphaISA::IPR_DC_PERR_STAT:
-      case AlphaISA::IPR_MCSR:
-      case AlphaISA::IPR_ASTRR:
-      case AlphaISA::IPR_ASTER:
-      case AlphaISA::IPR_SIRR:
-      case AlphaISA::IPR_ICSR:
-      case AlphaISA::IPR_ICM:
-      case AlphaISA::IPR_DTB_CM:
-      case AlphaISA::IPR_IPLR:
-      case AlphaISA::IPR_INTID:
-      case AlphaISA::IPR_PMCTR:
-        // no side-effect
-        retval = ipr[idx];
-        break;
-
-      case AlphaISA::IPR_CC:
-        retval |= ipr[idx] & ULL(0xffffffff00000000);
-        retval |= curTick  & ULL(0x00000000ffffffff);
-        break;
-
-      case AlphaISA::IPR_VA:
-        retval = ipr[idx];
-        break;
-
-      case AlphaISA::IPR_VA_FORM:
-      case AlphaISA::IPR_MM_STAT:
-      case AlphaISA::IPR_IFAULT_VA_FORM:
-      case AlphaISA::IPR_EXC_MASK:
-      case AlphaISA::IPR_EXC_SUM:
-        retval = ipr[idx];
-        break;
-
-      case AlphaISA::IPR_DTB_PTE:
-        {
-            AlphaISA::PTE &pte = dtb->index(!misspeculating());
-
-            retval |= ((u_int64_t)pte.ppn & ULL(0x7ffffff)) << 32;
-            retval |= ((u_int64_t)pte.xre & ULL(0xf)) << 8;
-            retval |= ((u_int64_t)pte.xwe & ULL(0xf)) << 12;
-            retval |= ((u_int64_t)pte.fonr & ULL(0x1)) << 1;
-            retval |= ((u_int64_t)pte.fonw & ULL(0x1))<< 2;
-            retval |= ((u_int64_t)pte.asma & ULL(0x1)) << 4;
-            retval |= ((u_int64_t)pte.asn & ULL(0x7f)) << 57;
-        }
-        break;
-
-        // write only registers
-      case AlphaISA::IPR_HWINT_CLR:
-      case AlphaISA::IPR_SL_XMIT:
-      case AlphaISA::IPR_DC_FLUSH:
-      case AlphaISA::IPR_IC_FLUSH:
-      case AlphaISA::IPR_ALT_MODE:
-      case AlphaISA::IPR_DTB_IA:
-      case AlphaISA::IPR_DTB_IAP:
-      case AlphaISA::IPR_ITB_IA:
-      case AlphaISA::IPR_ITB_IAP:
-        fault = Unimplemented_Opcode_Fault;
-        break;
-
-      default:
-        // invalid IPR
-        fault = Unimplemented_Opcode_Fault;
-        break;
-    }
-
-    return retval;
-}
-
-Fault
-AlphaFullCPU::setIpr(int idx, uint64_t val)
-{
-    uint64_t *ipr = getIpr();
-    uint64_t old;
-
-    if (misspeculating())
-        return No_Fault;
-
-    switch (idx) {
-      case AlphaISA::IPR_PALtemp0:
-      case AlphaISA::IPR_PALtemp1:
-      case AlphaISA::IPR_PALtemp2:
-      case AlphaISA::IPR_PALtemp3:
-      case AlphaISA::IPR_PALtemp4:
-      case AlphaISA::IPR_PALtemp5:
-      case AlphaISA::IPR_PALtemp6:
-      case AlphaISA::IPR_PALtemp7:
-      case AlphaISA::IPR_PALtemp8:
-      case AlphaISA::IPR_PALtemp9:
-      case AlphaISA::IPR_PALtemp10:
-      case AlphaISA::IPR_PALtemp11:
-      case AlphaISA::IPR_PALtemp12:
-      case AlphaISA::IPR_PALtemp13:
-      case AlphaISA::IPR_PALtemp14:
-      case AlphaISA::IPR_PALtemp15:
-      case AlphaISA::IPR_PALtemp16:
-      case AlphaISA::IPR_PALtemp17:
-      case AlphaISA::IPR_PALtemp18:
-      case AlphaISA::IPR_PALtemp19:
-      case AlphaISA::IPR_PALtemp20:
-      case AlphaISA::IPR_PALtemp21:
-      case AlphaISA::IPR_PALtemp22:
-      case AlphaISA::IPR_PAL_BASE:
-      case AlphaISA::IPR_IC_PERR_STAT:
-      case AlphaISA::IPR_DC_PERR_STAT:
-      case AlphaISA::IPR_PMCTR:
-        // write entire quad w/ no side-effect
-        ipr[idx] = val;
-        break;
-
-      case AlphaISA::IPR_CC_CTL:
-        // This IPR resets the cycle counter.  We assume this only
-        // happens once... let's verify that.
-        assert(ipr[idx] == 0);
-        ipr[idx] = 1;
-        break;
-
-      case AlphaISA::IPR_CC:
-        // This IPR only writes the upper 64 bits.  It's ok to write
-        // all 64 here since we mask out the lower 32 in rpcc (see
-        // isa_desc).
-        ipr[idx] = val;
-        break;
-
-      case AlphaISA::IPR_PALtemp23:
-        // write entire quad w/ no side-effect
-        old = ipr[idx];
-        ipr[idx] = val;
-        kernelStats.context(old, val);
-        break;
-
-      case AlphaISA::IPR_DTB_PTE:
-        // write entire quad w/ no side-effect, tag is forthcoming
-        ipr[idx] = val;
-        break;
-
-      case AlphaISA::IPR_EXC_ADDR:
-        // second least significant bit in PC is always zero
-        ipr[idx] = val & ~2;
-        break;
-
-      case AlphaISA::IPR_ASTRR:
-      case AlphaISA::IPR_ASTER:
-        // only write least significant four bits - privilege mask
-        ipr[idx] = val & 0xf;
-        break;
-
-      case AlphaISA::IPR_IPLR:
-#ifdef DEBUG
-        if (break_ipl != -1 && break_ipl == (val & 0x1f))
-            debug_break();
-#endif
-
-        // only write least significant five bits - interrupt level
-        ipr[idx] = val & 0x1f;
-        kernelStats.swpipl(ipr[idx]);
-        break;
-
-      case AlphaISA::IPR_DTB_CM:
-        kernelStats.mode((val & 0x18) != 0);
-
-      case AlphaISA::IPR_ICM:
-        // only write two mode bits - processor mode
-        ipr[idx] = val & 0x18;
-        break;
-
-      case AlphaISA::IPR_ALT_MODE:
-        // only write two mode bits - processor mode
-        ipr[idx] = val & 0x18;
-        break;
-
-      case AlphaISA::IPR_MCSR:
-        // more here after optimization...
-        ipr[idx] = val;
-        break;
-
-      case AlphaISA::IPR_SIRR:
-        // only write software interrupt mask
-        ipr[idx] = val & 0x7fff0;
-        break;
-
-      case AlphaISA::IPR_ICSR:
-        ipr[idx] = val & ULL(0xffffff0300);
-        break;
-
-      case AlphaISA::IPR_IVPTBR:
-      case AlphaISA::IPR_MVPTBR:
-        ipr[idx] = val & ULL(0xffffffffc0000000);
-        break;
-
-      case AlphaISA::IPR_DC_TEST_CTL:
-        ipr[idx] = val & 0x1ffb;
-        break;
-
-      case AlphaISA::IPR_DC_MODE:
-      case AlphaISA::IPR_MAF_MODE:
-        ipr[idx] = val & 0x3f;
-        break;
-
-      case AlphaISA::IPR_ITB_ASN:
-        ipr[idx] = val & 0x7f0;
-        break;
-
-      case AlphaISA::IPR_DTB_ASN:
-        ipr[idx] = val & ULL(0xfe00000000000000);
-        break;
-
-      case AlphaISA::IPR_EXC_SUM:
-      case AlphaISA::IPR_EXC_MASK:
-        // any write to this register clears it
-        ipr[idx] = 0;
-        break;
-
-      case AlphaISA::IPR_INTID:
-      case AlphaISA::IPR_SL_RCV:
-      case AlphaISA::IPR_MM_STAT:
-      case AlphaISA::IPR_ITB_PTE_TEMP:
-      case AlphaISA::IPR_DTB_PTE_TEMP:
-        // read-only registers
-        return Unimplemented_Opcode_Fault;
-
-      case AlphaISA::IPR_HWINT_CLR:
-      case AlphaISA::IPR_SL_XMIT:
-      case AlphaISA::IPR_DC_FLUSH:
-      case AlphaISA::IPR_IC_FLUSH:
-        // the following are write only
-        ipr[idx] = val;
-        break;
-
-      case AlphaISA::IPR_DTB_IA:
-        // really a control write
-        ipr[idx] = 0;
-
-        dtb->flushAll();
-        break;
-
-      case AlphaISA::IPR_DTB_IAP:
-        // really a control write
-        ipr[idx] = 0;
-
-        dtb->flushProcesses();
-        break;
-
-      case AlphaISA::IPR_DTB_IS:
-        // really a control write
-        ipr[idx] = val;
-
-        dtb->flushAddr(val, DTB_ASN_ASN(ipr[AlphaISA::IPR_DTB_ASN]));
-        break;
-
-      case AlphaISA::IPR_DTB_TAG: {
-          struct AlphaISA::PTE pte;
-
-          // FIXME: granularity hints NYI...
-          if (DTB_PTE_GH(ipr[AlphaISA::IPR_DTB_PTE]) != 0)
-              panic("PTE GH field != 0");
-
-          // write entire quad
-          ipr[idx] = val;
-
-          // construct PTE for new entry
-          pte.ppn = DTB_PTE_PPN(ipr[AlphaISA::IPR_DTB_PTE]);
-          pte.xre = DTB_PTE_XRE(ipr[AlphaISA::IPR_DTB_PTE]);
-          pte.xwe = DTB_PTE_XWE(ipr[AlphaISA::IPR_DTB_PTE]);
-          pte.fonr = DTB_PTE_FONR(ipr[AlphaISA::IPR_DTB_PTE]);
-          pte.fonw = DTB_PTE_FONW(ipr[AlphaISA::IPR_DTB_PTE]);
-          pte.asma = DTB_PTE_ASMA(ipr[AlphaISA::IPR_DTB_PTE]);
-          pte.asn = DTB_ASN_ASN(ipr[AlphaISA::IPR_DTB_ASN]);
-
-          // insert new TAG/PTE value into data TLB
-          dtb->insert(val, pte);
-      }
-        break;
-
-      case AlphaISA::IPR_ITB_PTE: {
-          struct AlphaISA::PTE pte;
-
-          // FIXME: granularity hints NYI...
-          if (ITB_PTE_GH(val) != 0)
-              panic("PTE GH field != 0");
-
-          // write entire quad
-          ipr[idx] = val;
-
-          // construct PTE for new entry
-          pte.ppn = ITB_PTE_PPN(val);
-          pte.xre = ITB_PTE_XRE(val);
-          pte.xwe = 0;
-          pte.fonr = ITB_PTE_FONR(val);
-          pte.fonw = ITB_PTE_FONW(val);
-          pte.asma = ITB_PTE_ASMA(val);
-          pte.asn = ITB_ASN_ASN(ipr[AlphaISA::IPR_ITB_ASN]);
-
-          // insert new TAG/PTE value into data TLB
-          itb->insert(ipr[AlphaISA::IPR_ITB_TAG], pte);
-      }
-        break;
-
-      case AlphaISA::IPR_ITB_IA:
-        // really a control write
-        ipr[idx] = 0;
-
-        itb->flushAll();
-        break;
-
-      case AlphaISA::IPR_ITB_IAP:
-        // really a control write
-        ipr[idx] = 0;
-
-        itb->flushProcesses();
-        break;
-
-      case AlphaISA::IPR_ITB_IS:
-        // really a control write
-        ipr[idx] = val;
-
-        itb->flushAddr(val, ITB_ASN_ASN(ipr[AlphaISA::IPR_ITB_ASN]));
-        break;
-
-      default:
-        // invalid IPR
-        return Unimplemented_Opcode_Fault;
-    }
-
-    // no error...
-    return No_Fault;
-
-}
-
-int
-AlphaFullCPU::readIntrFlag()
-{
-    return regs.intrflag;
-}
-
-void
-AlphaFullCPU::setIntrFlag(int val)
-{
-    regs.intrflag = val;
-}
-
-// Maybe have this send back from IEW stage to squash and update PC.
-Fault
-AlphaFullCPU::hwrei()
-{
-    uint64_t *ipr = getIpr();
-
-    if (!PC_PAL(regs.pc))
-        return Unimplemented_Opcode_Fault;
-
-    setNextPC(ipr[AlphaISA::IPR_EXC_ADDR]);
-
-    if (!misspeculating()) {
-        kernelStats.hwrei();
-
-        if ((ipr[AlphaISA::IPR_EXC_ADDR] & 1) == 0)
-            AlphaISA::swap_palshadow(&regs, false);
-
-        AlphaISA::check_interrupts = true;
-    }
-
-    // FIXME: XXX check for interrupts? XXX
-    return No_Fault;
-}
-
-bool
-AlphaFullCPU::inPalMode()
-{
-    return PC_PAL(readPC());
-}
-
-bool
-AlphaFullCPU::simPalCheck(int palFunc)
-{
-    kernelStats.callpal(palFunc);
-
-    switch (palFunc) {
-      case PAL::halt:
-        halt();
-        if (--System::numSystemsRunning == 0)
-            new SimExitEvent("all cpus halted");
-        break;
-
-      case PAL::bpt:
-      case PAL::bugchk:
-        if (system->breakpoint())
-            return false;
-        break;
-    }
-
-    return true;
-}
-
-// Probably shouldn't be able to switch to the trap handler as quickly as
-// this.  Also needs to get the exception restart address from the commit
-// stage.
-void
-AlphaFullCPU::trap(Fault fault)
-{
-    uint64_t PC = commit.readPC();
-
-    DPRINTF(Fault, "Fault %s\n", FaultName(fault));
-    Stats::recordEvent(csprintf("Fault %s", FaultName(fault)));
-
-    assert(!misspeculating());
-    kernelStats.fault(fault);
-
-    if (fault == Arithmetic_Fault)
-        panic("Arithmetic traps are unimplemented!");
-
-    AlphaISA::InternalProcReg *ipr = getIpr();
-
-    // exception restart address - Get the commit PC
-    if (fault != Interrupt_Fault || !PC_PAL(PC))
-        ipr[AlphaISA::IPR_EXC_ADDR] = PC;
-
-    if (fault == Pal_Fault || fault == Arithmetic_Fault /* ||
-        fault == Interrupt_Fault && !PC_PAL(regs.pc) */) {
-        // traps...  skip faulting instruction
-        ipr[AlphaISA::IPR_EXC_ADDR] += 4;
-    }
-
-    if (!PC_PAL(PC))
-        AlphaISA::swap_palshadow(&regs, true);
-
-    setPC( ipr[AlphaISA::IPR_PAL_BASE] + AlphaISA::fault_addr[fault] );
-    setNextPC(PC + sizeof(MachInst));
-}
-
-void
-AlphaFullCPU::processInterrupts()
-{
-    // Check for interrupts here.  For now can copy the code that exists
-    // within isa_fullsys_traits.hh.
-}
-
-// swap_palshadow swaps in the values of the shadow registers and
-// swaps them with the values of the physical registers that map to the
-// same logical index.
-void
-AlphaFullCPU::swap_palshadow(RegFile *regs, bool use_shadow)
-{
-    if (palShadowEnabled == use_shadow)
-        panic("swap_palshadow: wrong PAL shadow state");
-
-    palShadowEnabled = use_shadow;
-
-    // Will have to lookup in rename map to get physical registers, then
-    // swap.
-    for (int i = 0; i < AlphaISA::NumIntRegs; i++) {
-        if (reg_redir[i]) {
-            AlphaISA::IntReg temp = regs->intRegFile[i];
-            regs->intRegFile[i] = regs->palregs[i];
-            regs->palregs[i] = temp;
-        }
-    }
-}
-
-#endif // FULL_SYSTEM
-
-BEGIN_DECLARE_SIM_OBJECT_PARAMS(AlphaFullCPU)
-
-    Param<int> numThreads;
-
-#ifdef FULL_SYSTEM
-SimObjectParam<System *> system;
-SimObjectParam<AlphaITB *> itb;
-SimObjectParam<AlphaDTB *> dtb;
-Param<int> mult;
-#else
-SimObjectVectorParam<Process *> workload;
-SimObjectParam<Process *> process;
-Param<short> asid;
-#endif // FULL_SYSTEM
-SimObjectParam<FunctionalMemory *> mem;
-
-Param<Counter> max_insts_any_thread;
-Param<Counter> max_insts_all_threads;
-Param<Counter> max_loads_any_thread;
-Param<Counter> max_loads_all_threads;
-
-SimObjectParam<BaseCache *> icache;
-SimObjectParam<BaseCache *> dcache;
-
-Param<unsigned> decodeToFetchDelay;
-Param<unsigned> renameToFetchDelay;
-Param<unsigned> iewToFetchDelay;
-Param<unsigned> commitToFetchDelay;
-Param<unsigned> fetchWidth;
-
-Param<unsigned> renameToDecodeDelay;
-Param<unsigned> iewToDecodeDelay;
-Param<unsigned> commitToDecodeDelay;
-Param<unsigned> fetchToDecodeDelay;
-Param<unsigned> decodeWidth;
-
-Param<unsigned> iewToRenameDelay;
-Param<unsigned> commitToRenameDelay;
-Param<unsigned> decodeToRenameDelay;
-Param<unsigned> renameWidth;
-
-Param<unsigned> commitToIEWDelay;
-Param<unsigned> renameToIEWDelay;
-Param<unsigned> issueToExecuteDelay;
-Param<unsigned> issueWidth;
-Param<unsigned> executeWidth;
-Param<unsigned> executeIntWidth;
-Param<unsigned> executeFloatWidth;
-
-Param<unsigned> iewToCommitDelay;
-Param<unsigned> renameToROBDelay;
-Param<unsigned> commitWidth;
-Param<unsigned> squashWidth;
-
-Param<unsigned> numPhysIntRegs;
-Param<unsigned> numPhysFloatRegs;
-Param<unsigned> numIQEntries;
-Param<unsigned> numROBEntries;
-
-Param<bool> defReg;
-
-END_DECLARE_SIM_OBJECT_PARAMS(AlphaFullCPU)
-
-BEGIN_INIT_SIM_OBJECT_PARAMS(AlphaFullCPU)
-
-    INIT_PARAM(numThreads, "number of HW thread contexts"),
-
-#ifdef FULL_SYSTEM
-    INIT_PARAM(system, "System object"),
-    INIT_PARAM(itb, "Instruction translation buffer"),
-    INIT_PARAM(dtb, "Data translation buffer"),
-    INIT_PARAM_DFLT(mult, "System clock multiplier", 1),
-#else
-    INIT_PARAM(workload, "Processes to run"),
-    INIT_PARAM_DFLT(process, "Process to run", NULL),
-    INIT_PARAM(asid, "Address space ID"),
-#endif // FULL_SYSTEM
-
-    INIT_PARAM_DFLT(mem, "Memory", NULL),
-
-    INIT_PARAM_DFLT(max_insts_any_thread,
-                    "Terminate when any thread reaches this inst count",
-                    0),
-    INIT_PARAM_DFLT(max_insts_all_threads,
-                    "Terminate when all threads have reached"
-                    "this inst count",
-                    0),
-    INIT_PARAM_DFLT(max_loads_any_thread,
-                    "Terminate when any thread reaches this load count",
-                    0),
-    INIT_PARAM_DFLT(max_loads_all_threads,
-                    "Terminate when all threads have reached this load"
-                    "count",
-                    0),
-
-    INIT_PARAM_DFLT(icache, "L1 instruction cache", NULL),
-    INIT_PARAM_DFLT(dcache, "L1 data cache", NULL),
-
-    INIT_PARAM(decodeToFetchDelay, "Decode to fetch delay"),
-    INIT_PARAM(renameToFetchDelay, "Rename to fetch delay"),
-    INIT_PARAM(iewToFetchDelay, "Issue/Execute/Writeback to fetch"
-               "delay"),
-    INIT_PARAM(commitToFetchDelay, "Commit to fetch delay"),
-    INIT_PARAM(fetchWidth, "Fetch width"),
-
-    INIT_PARAM(renameToDecodeDelay, "Rename to decode delay"),
-    INIT_PARAM(iewToDecodeDelay, "Issue/Execute/Writeback to decode"
-               "delay"),
-    INIT_PARAM(commitToDecodeDelay, "Commit to decode delay"),
-    INIT_PARAM(fetchToDecodeDelay, "Fetch to decode delay"),
-    INIT_PARAM(decodeWidth, "Decode width"),
-
-    INIT_PARAM(iewToRenameDelay, "Issue/Execute/Writeback to rename"
-               "delay"),
-    INIT_PARAM(commitToRenameDelay, "Commit to rename delay"),
-    INIT_PARAM(decodeToRenameDelay, "Decode to rename delay"),
-    INIT_PARAM(renameWidth, "Rename width"),
-
-    INIT_PARAM(commitToIEWDelay, "Commit to "
-               "Issue/Execute/Writeback delay"),
-    INIT_PARAM(renameToIEWDelay, "Rename to "
-               "Issue/Execute/Writeback delay"),
-    INIT_PARAM(issueToExecuteDelay, "Issue to execute delay (internal"
-               "to the IEW stage)"),
-    INIT_PARAM(issueWidth, "Issue width"),
-    INIT_PARAM(executeWidth, "Execute width"),
-    INIT_PARAM(executeIntWidth, "Integer execute width"),
-    INIT_PARAM(executeFloatWidth, "Floating point execute width"),
-
-    INIT_PARAM(iewToCommitDelay, "Issue/Execute/Writeback to commit "
-               "delay"),
-    INIT_PARAM(renameToROBDelay, "Rename to reorder buffer delay"),
-    INIT_PARAM(commitWidth, "Commit width"),
-    INIT_PARAM(squashWidth, "Squash width"),
-
-    INIT_PARAM(numPhysIntRegs, "Number of physical integer registers"),
-    INIT_PARAM(numPhysFloatRegs, "Number of physical floating point "
-               "registers"),
-    INIT_PARAM(numIQEntries, "Number of instruction queue entries"),
-    INIT_PARAM(numROBEntries, "Number of reorder buffer entries"),
-
-    INIT_PARAM(defReg, "Defer registration")
-
-END_INIT_SIM_OBJECT_PARAMS(AlphaFullCPU)
-
-CREATE_SIM_OBJECT(AlphaFullCPU)
-{
-    AlphaFullCPU *cpu;
-
-#ifdef FULL_SYSTEM
-    if (mult != 1)
-        panic("Processor clock multiplier must be 1?\n");
-
-    // Full-system only supports a single thread for the moment.
-    int actual_num_threads = 1;
-#else
-    // In non-full-system mode, we infer the number of threads from
-    // the workload if it's not explicitly specified.
-    int actual_num_threads =
-        numThreads.isValid() ? numThreads : workload.size();
-
-    if (workload.size() == 0) {
-        fatal("Must specify at least one workload!");
-    }
-
-    Process *actual_process;
-
-    if (process == NULL) {
-        actual_process = workload[0];
-    } else {
-        actual_process = process;
-    }
-
-#endif
-
-    AlphaSimpleParams params;
-
-    params.name = getInstanceName();
-    params.numberOfThreads = actual_num_threads;
-
-#ifdef FULL_SYSTEM
-    params._system = system;
-    params.itb = itb;
-    params.dtb = dtb;
-    params.freq = ticksPerSecond * mult;
-#else
-    params.workload = workload;
-    params.process = actual_process;
-    params.asid = asid;
-#endif // FULL_SYSTEM
-
-    params.mem = mem;
-
-    params.maxInstsAnyThread = max_insts_any_thread;
-    params.maxInstsAllThreads = max_insts_all_threads;
-    params.maxLoadsAnyThread = max_loads_any_thread;
-    params.maxLoadsAllThreads = max_loads_all_threads;
-
-    //
-    // Caches
-    //
-    params.icacheInterface = icache ? icache->getInterface() : NULL;
-    params.dcacheInterface = dcache ? dcache->getInterface() : NULL;
-
-    params.decodeToFetchDelay = decodeToFetchDelay;
-    params.renameToFetchDelay = renameToFetchDelay;
-    params.iewToFetchDelay = iewToFetchDelay;
-    params.commitToFetchDelay = commitToFetchDelay;
-    params.fetchWidth = fetchWidth;
-
-    params.renameToDecodeDelay = renameToDecodeDelay;
-    params.iewToDecodeDelay = iewToDecodeDelay;
-    params.commitToDecodeDelay = commitToDecodeDelay;
-    params.fetchToDecodeDelay = fetchToDecodeDelay;
-    params.decodeWidth = decodeWidth;
-
-    params.iewToRenameDelay = iewToRenameDelay;
-    params.commitToRenameDelay = commitToRenameDelay;
-    params.decodeToRenameDelay = decodeToRenameDelay;
-    params.renameWidth = renameWidth;
-
-    params.commitToIEWDelay = commitToIEWDelay;
-    params.renameToIEWDelay = renameToIEWDelay;
-    params.issueToExecuteDelay = issueToExecuteDelay;
-    params.issueWidth = issueWidth;
-    params.executeWidth = executeWidth;
-    params.executeIntWidth = executeIntWidth;
-    params.executeFloatWidth = executeFloatWidth;
-
-    params.iewToCommitDelay = iewToCommitDelay;
-    params.renameToROBDelay = renameToROBDelay;
-    params.commitWidth = commitWidth;
-    params.squashWidth = squashWidth;
-
-    params.numPhysIntRegs = numPhysIntRegs;
-    params.numPhysFloatRegs = numPhysFloatRegs;
-    params.numIQEntries = numIQEntries;
-    params.numROBEntries = numROBEntries;
-
-    params.defReg = defReg;
-
-    cpu = new AlphaFullCPU(params);
-
-    return cpu;
-}
-
-REGISTER_SIM_OBJECT("AlphaFullCPU", AlphaFullCPU)
-
+#include "cpu/beta_cpu/alpha_impl.hh"
+#include "cpu/beta_cpu/alpha_full_cpu_impl.hh"
+#include "cpu/beta_cpu/alpha_dyn_inst.hh"
+
+// Force instantiation of AlphaFullCPU for all the implementations that are
+// needed.  Consider merging this and alpha_dyn_inst.cc, and maybe all
+// classes that depend on a certain impl, into one file (alpha_impl.cc?).
+template class AlphaFullCPU<AlphaSimpleImpl>;
diff --git a/cpu/beta_cpu/alpha_full_cpu.hh b/cpu/beta_cpu/alpha_full_cpu.hh
index b098aaac1..0e094b122 100644
--- a/cpu/beta_cpu/alpha_full_cpu.hh
+++ b/cpu/beta_cpu/alpha_full_cpu.hh
@@ -6,18 +6,19 @@
 #ifndef __ALPHA_FULL_CPU_HH__
 #define __ALPHA_FULL_CPU_HH__
 
-// To include: comm, impl, full cpu, ITB/DTB if full sys,
-#include "cpu/beta_cpu/comm.hh"
-#include "cpu/beta_cpu/alpha_impl.hh"
+// To include: comm, full cpu, ITB/DTB if full sys,
+//#include "cpu/beta_cpu/comm.hh"
+//#include "cpu/beta_cpu/alpha_impl.hh"
 #include "cpu/beta_cpu/full_cpu.hh"
 
 using namespace std;
 
-class AlphaFullCPU : public FullBetaCPU<AlphaSimpleImpl>
+template <class Impl>
+class AlphaFullCPU : public FullBetaCPU<Impl>
 {
   public:
-    typedef AlphaSimpleImpl::ISA AlphaISA;
-    typedef AlphaSimpleImpl::Params Params;
+    typedef typename Impl::ISA AlphaISA;
+    typedef typename Impl::Params Params;
 
   public:
     AlphaFullCPU(Params &params);
diff --git a/cpu/beta_cpu/alpha_full_cpu_builder.cc b/cpu/beta_cpu/alpha_full_cpu_builder.cc
new file mode 100644
index 000000000..5fe96d656
--- /dev/null
+++ b/cpu/beta_cpu/alpha_full_cpu_builder.cc
@@ -0,0 +1,306 @@
+#include "cpu/beta_cpu/alpha_impl.hh"
+#include "cpu/beta_cpu/alpha_full_cpu.hh"
+
+#include "mem/cache/base_cache.hh"
+
+#include "base/inifile.hh"
+#include "base/loader/symtab.hh"
+#include "base/misc.hh"
+#include "cpu/base_cpu.hh"
+#include "cpu/exec_context.hh"
+#include "cpu/exetrace.hh"
+#include "mem/base_mem.hh"
+#include "mem/mem_interface.hh"
+#include "sim/builder.hh"
+#include "sim/debug.hh"
+#include "sim/host.hh"
+#include "sim/process.hh"
+#include "sim/sim_events.hh"
+#include "sim/sim_object.hh"
+#include "sim/stats.hh"
+
+#ifdef FULL_SYSTEM
+#include "base/remote_gdb.hh"
+#include "dev/alpha_access.h"
+#include "dev/pciareg.h"
+#include "mem/functional_mem/memory_control.hh"
+#include "mem/functional_mem/physical_memory.hh"
+#include "sim/system.hh"
+#include "targetarch/alpha_memory.hh"
+#include "targetarch/vtophys.hh"
+#else // !FULL_SYSTEM
+#include "eio/eio.hh"
+#include "mem/functional_mem/functional_memory.hh"
+#endif // FULL_SYSTEM
+
+BEGIN_DECLARE_SIM_OBJECT_PARAMS(BaseFullCPU)  // params of the sim object registered as "AlphaFullCPU"
+
+    Param<int> numThreads;  // optional in non-full-system mode; create() falls back to workload.size()
+
+#ifdef FULL_SYSTEM
+SimObjectParam<System *> system;
+SimObjectParam<AlphaITB *> itb;
+SimObjectParam<AlphaDTB *> dtb;
+Param<int> mult;  // system clock multiplier; create() panics unless it is 1
+#else
+SimObjectVectorParam<Process *> workload;  // must be non-empty, create() calls fatal() otherwise
+SimObjectParam<Process *> process;  // optional; create() uses workload[0] when this is NULL
+Param<short> asid;
+#endif // FULL_SYSTEM
+SimObjectParam<FunctionalMemory *> mem;
+
+Param<Counter> max_insts_any_thread;  // termination limits (all default to 0)
+Param<Counter> max_insts_all_threads;
+Param<Counter> max_loads_any_thread;
+Param<Counter> max_loads_all_threads;
+
+SimObjectParam<BaseCache *> icache;  // L1 caches; both may be NULL
+SimObjectParam<BaseCache *> dcache;
+
+Param<unsigned> decodeToFetchDelay;  // --- fetch ---
+Param<unsigned> renameToFetchDelay;
+Param<unsigned> iewToFetchDelay;
+Param<unsigned> commitToFetchDelay;
+Param<unsigned> fetchWidth;
+
+Param<unsigned> renameToDecodeDelay;  // --- decode ---
+Param<unsigned> iewToDecodeDelay;
+Param<unsigned> commitToDecodeDelay;
+Param<unsigned> fetchToDecodeDelay;
+Param<unsigned> decodeWidth;
+
+Param<unsigned> iewToRenameDelay;  // --- rename ---
+Param<unsigned> commitToRenameDelay;
+Param<unsigned> decodeToRenameDelay;
+Param<unsigned> renameWidth;
+
+Param<unsigned> commitToIEWDelay;  // --- issue/execute/writeback (IEW) ---
+Param<unsigned> renameToIEWDelay;
+Param<unsigned> issueToExecuteDelay;
+Param<unsigned> issueWidth;
+Param<unsigned> executeWidth;
+Param<unsigned> executeIntWidth;
+Param<unsigned> executeFloatWidth;
+
+Param<unsigned> iewToCommitDelay;  // --- commit ---
+Param<unsigned> renameToROBDelay;
+Param<unsigned> commitWidth;
+Param<unsigned> squashWidth;
+
+Param<unsigned> localPredictorSize;  // --- branch predictor and BTB ---
+Param<unsigned> localPredictorCtrBits;
+Param<unsigned> BTBEntries;
+Param<unsigned> BTBTagSize;
+
+Param<unsigned> numPhysIntRegs;  // --- physical register file, IQ and ROB sizes ---
+Param<unsigned> numPhysFloatRegs;
+Param<unsigned> numIQEntries;
+Param<unsigned> numROBEntries;
+
+Param<unsigned> instShiftAmt;  // NOTE(review): declared here but create() hard-codes 2 instead
+
+Param<bool> defReg;  // "Defer registration"
+
+END_DECLARE_SIM_OBJECT_PARAMS(BaseFullCPU)
+
+BEGIN_INIT_SIM_OBJECT_PARAMS(BaseFullCPU)  // help text/defaults; split literals are concatenated, so each piece ends in a space
+
+    INIT_PARAM(numThreads, "number of HW thread contexts"),
+
+#ifdef FULL_SYSTEM
+    INIT_PARAM(system, "System object"),
+    INIT_PARAM(itb, "Instruction translation buffer"),
+    INIT_PARAM(dtb, "Data translation buffer"),
+    INIT_PARAM_DFLT(mult, "System clock multiplier", 1),
+#else
+    INIT_PARAM(workload, "Processes to run"),
+    INIT_PARAM_DFLT(process, "Process to run", NULL),
+    INIT_PARAM(asid, "Address space ID"),
+#endif // FULL_SYSTEM
+
+    INIT_PARAM_DFLT(mem, "Memory", NULL),
+
+    INIT_PARAM_DFLT(max_insts_any_thread,
+                    "Terminate when any thread reaches this inst count",
+                    0),
+    INIT_PARAM_DFLT(max_insts_all_threads,
+                    "Terminate when all threads have reached "
+                    "this inst count",
+                    0),
+    INIT_PARAM_DFLT(max_loads_any_thread,
+                    "Terminate when any thread reaches this load count",
+                    0),
+    INIT_PARAM_DFLT(max_loads_all_threads,
+                    "Terminate when all threads have reached this load "
+                    "count",
+                    0),
+
+    INIT_PARAM_DFLT(icache, "L1 instruction cache", NULL),
+    INIT_PARAM_DFLT(dcache, "L1 data cache", NULL),
+
+    INIT_PARAM(decodeToFetchDelay, "Decode to fetch delay"),
+    INIT_PARAM(renameToFetchDelay, "Rename to fetch delay"),
+    INIT_PARAM(iewToFetchDelay, "Issue/Execute/Writeback to fetch "
+               "delay"),
+    INIT_PARAM(commitToFetchDelay, "Commit to fetch delay"),
+    INIT_PARAM(fetchWidth, "Fetch width"),
+
+    INIT_PARAM(renameToDecodeDelay, "Rename to decode delay"),
+    INIT_PARAM(iewToDecodeDelay, "Issue/Execute/Writeback to decode "
+               "delay"),
+    INIT_PARAM(commitToDecodeDelay, "Commit to decode delay"),
+    INIT_PARAM(fetchToDecodeDelay, "Fetch to decode delay"),
+    INIT_PARAM(decodeWidth, "Decode width"),
+
+    INIT_PARAM(iewToRenameDelay, "Issue/Execute/Writeback to rename "
+               "delay"),
+    INIT_PARAM(commitToRenameDelay, "Commit to rename delay"),
+    INIT_PARAM(decodeToRenameDelay, "Decode to rename delay"),
+    INIT_PARAM(renameWidth, "Rename width"),
+
+    INIT_PARAM(commitToIEWDelay, "Commit to "
+               "Issue/Execute/Writeback delay"),
+    INIT_PARAM(renameToIEWDelay, "Rename to "
+               "Issue/Execute/Writeback delay"),
+    INIT_PARAM(issueToExecuteDelay, "Issue to execute delay (internal "
+               "to the IEW stage)"),
+    INIT_PARAM(issueWidth, "Issue width"),
+    INIT_PARAM(executeWidth, "Execute width"),
+    INIT_PARAM(executeIntWidth, "Integer execute width"),
+    INIT_PARAM(executeFloatWidth, "Floating point execute width"),
+
+    INIT_PARAM(iewToCommitDelay, "Issue/Execute/Writeback to commit "
+               "delay"),
+    INIT_PARAM(renameToROBDelay, "Rename to reorder buffer delay"),
+    INIT_PARAM(commitWidth, "Commit width"),
+    INIT_PARAM(squashWidth, "Squash width"),
+
+    INIT_PARAM(localPredictorSize, "Size of the local predictor in entries. "
+               "Must be a power of 2."),
+    INIT_PARAM(localPredictorCtrBits, "Number of bits per counter for bpred"),
+    INIT_PARAM(BTBEntries, "Number of BTB entries"),
+    INIT_PARAM(BTBTagSize, "Size of the BTB tags, in bits"),
+
+
+    INIT_PARAM(numPhysIntRegs, "Number of physical integer registers"),
+    INIT_PARAM(numPhysFloatRegs, "Number of physical floating point "
+               "registers"),
+    INIT_PARAM(numIQEntries, "Number of instruction queue entries"),
+    INIT_PARAM(numROBEntries, "Number of reorder buffer entries"),
+
+    INIT_PARAM(instShiftAmt, "Number of bits to shift instructions by"),
+
+    INIT_PARAM(defReg, "Defer registration")
+
+END_INIT_SIM_OBJECT_PARAMS(BaseFullCPU)
+
+CREATE_SIM_OBJECT(BaseFullCPU)  // copies the builder params into AlphaSimpleParams and constructs the CPU
+{
+    AlphaFullCPU<AlphaSimpleImpl> *cpu;
+
+#ifdef FULL_SYSTEM
+    if (mult != 1)
+        panic("Processor clock multiplier must be 1?\n");
+
+    // Full-system only supports a single thread for the moment.
+    int actual_num_threads = 1;
+#else
+    // In non-full-system mode, we infer the number of threads from
+    // the workload if it's not explicitly specified.
+    int actual_num_threads =
+        numThreads.isValid() ? numThreads : workload.size();
+
+    if (workload.size() == 0) {
+        fatal("Must specify at least one workload!");
+    }
+
+    Process *actual_process;  // the explicit 'process' if given, else the first workload
+
+    if (process == NULL) {
+        actual_process = workload[0];
+    } else {
+        actual_process = process;
+    }
+
+#endif
+
+    AlphaSimpleParams params;
+
+    params.name = getInstanceName();
+    params.numberOfThreads = actual_num_threads;
+
+#ifdef FULL_SYSTEM
+    params._system = system;
+    params.itb = itb;
+    params.dtb = dtb;
+    params.freq = ticksPerSecond * mult;  // mult is known to be 1 here (checked above)
+#else
+    params.workload = workload;
+    params.process = actual_process;
+    params.asid = asid;
+#endif // FULL_SYSTEM
+
+    params.mem = mem;
+
+    params.maxInstsAnyThread = max_insts_any_thread;
+    params.maxInstsAllThreads = max_insts_all_threads;
+    params.maxLoadsAnyThread = max_loads_any_thread;
+    params.maxLoadsAllThreads = max_loads_all_threads;
+
+    //
+    // Caches: the interface pointer is NULL when no cache object was given.
+    //
+    params.icacheInterface = icache ? icache->getInterface() : NULL;
+    params.dcacheInterface = dcache ? dcache->getInterface() : NULL;
+
+    params.decodeToFetchDelay = decodeToFetchDelay;  // fetch
+    params.renameToFetchDelay = renameToFetchDelay;
+    params.iewToFetchDelay = iewToFetchDelay;
+    params.commitToFetchDelay = commitToFetchDelay;
+    params.fetchWidth = fetchWidth;
+
+    params.renameToDecodeDelay = renameToDecodeDelay;  // decode
+    params.iewToDecodeDelay = iewToDecodeDelay;
+    params.commitToDecodeDelay = commitToDecodeDelay;
+    params.fetchToDecodeDelay = fetchToDecodeDelay;
+    params.decodeWidth = decodeWidth;
+
+    params.iewToRenameDelay = iewToRenameDelay;  // rename
+    params.commitToRenameDelay = commitToRenameDelay;
+    params.decodeToRenameDelay = decodeToRenameDelay;
+    params.renameWidth = renameWidth;
+
+    params.commitToIEWDelay = commitToIEWDelay;  // IEW
+    params.renameToIEWDelay = renameToIEWDelay;
+    params.issueToExecuteDelay = issueToExecuteDelay;
+    params.issueWidth = issueWidth;
+    params.executeWidth = executeWidth;
+    params.executeIntWidth = executeIntWidth;
+    params.executeFloatWidth = executeFloatWidth;
+
+    params.iewToCommitDelay = iewToCommitDelay;  // commit
+    params.renameToROBDelay = renameToROBDelay;
+    params.commitWidth = commitWidth;
+    params.squashWidth = squashWidth;
+
+    params.localPredictorSize = localPredictorSize;  // branch predictor and BTB
+    params.localPredictorCtrBits = localPredictorCtrBits;
+    params.BTBEntries = BTBEntries;
+    params.BTBTagSize = BTBTagSize;
+
+    params.numPhysIntRegs = numPhysIntRegs;
+    params.numPhysFloatRegs = numPhysFloatRegs;
+    params.numIQEntries = numIQEntries;
+    params.numROBEntries = numROBEntries;
+
+    params.instShiftAmt = 2;  // NOTE(review): hard-coded; the instShiftAmt param is never read - confirm intended
+
+    params.defReg = defReg;
+
+    cpu = new AlphaFullCPU<AlphaSimpleImpl>(params);
+
+    return cpu;
+}
+
+REGISTER_SIM_OBJECT("AlphaFullCPU", BaseFullCPU)
+
diff --git a/cpu/beta_cpu/alpha_full_cpu_impl.hh b/cpu/beta_cpu/alpha_full_cpu_impl.hh
new file mode 100644
index 000000000..8bfc0777e
--- /dev/null
+++ b/cpu/beta_cpu/alpha_full_cpu_impl.hh
@@ -0,0 +1,690 @@
+
+#include "base/cprintf.hh"
+#include "base/statistics.hh"
+#include "base/timebuf.hh"
+#include "mem/cache/cache.hh" // for dynamic cast
+#include "mem/mem_interface.hh"
+#include "sim/builder.hh"
+#include "sim/sim_events.hh"
+#include "sim/stats.hh"
+
+#include "cpu/beta_cpu/alpha_full_cpu.hh"
+#include "cpu/beta_cpu/alpha_params.hh"
+#include "cpu/beta_cpu/comm.hh"
+
+template <class Impl>
+AlphaFullCPU<Impl>::AlphaFullCPU(Params &params)
+    : FullBetaCPU<Impl>(params)  // must name the real base, FullBetaCPU<Impl>, not one fixed Impl
+{
+    DPRINTF(FullCPU, "AlphaFullCPU: Creating AlphaFullCPU object.\n");
+    // Give every pipeline stage, and then the ROB, a pointer back to this CPU.
+    fetch.setCPU(this);
+    decode.setCPU(this);
+    rename.setCPU(this);
+    iew.setCPU(this);
+    commit.setCPU(this);
+
+    rob.setCPU(this);
+}
+
+#ifndef FULL_SYSTEM
+
+template <class Impl>
+void
+AlphaFullCPU<Impl>::syscall()  // non-full-system only: runs a syscall through process->syscall()
+{
+    DPRINTF(FullCPU, "AlphaFullCPU: Syscall() called.\n\n");
+
+    // Commit stage needs to run as well.
+    commit.tick();
+
+    squashStages();  // squash fetch, decode, rename, IEW and the ROB in one go
+
+    // Temporarily increase this by one to account for the syscall
+    // instruction.
+    ++funcExeInst;
+
+    // Copy over all important state to xc once all the unrolling is done.
+    copyToXC();
+
+    process->syscall(xc);  // the syscall itself operates on the execution context
+
+    // Copy over all important state back to CPU.
+    copyFromXC();
+
+    // Decrease funcExeInst by one as the normal commit will handle
+    // incrementing it.
+    --funcExeInst;
+}
+
+// This is not a pretty function, and should only be used if it is necessary
+// to fake having everything squash all at once (ie for non-full system
+// syscalls).  Maybe put this at the FullCPU level?
+template <class Impl>
+void
+AlphaFullCPU<Impl>::squashStages()
+{
+    InstSeqNum rob_head = rob.readHeadSeqNum();
+
+    // Now hack the time buffer to put this sequence number in the places
+    // where the stages might read it (slots 0 through -4; the 5 is hard-coded).
+    for (int i = 0; i < 5; ++i)
+    {
+        timeBuffer.access(-i)->commitInfo.doneSeqNum = rob_head;
+    }
+
+    fetch.squash(rob.readHeadNextPC());  // fetch is redirected to the ROB head's next PC
+    fetchQueue.advance();
+
+    decode.squash();
+    decodeQueue.advance();
+
+    rename.squash();
+    renameQueue.advance();
+    renameQueue.advance();  // NOTE(review): advanced twice, unlike fetch/decode - reason not stated here
+
+    // Be sure to advance the IEW queues so that the commit stage doesn't
+    // try to set an instruction as completed at the same time that it
+    // might be deleting it.
+    iew.squash();
+    iewQueue.advance();
+    iewQueue.advance();
+
+    rob.squash(rob_head);
+    commit.setSquashing();
+}
+
+#endif // FULL_SYSTEM
+
+template <class Impl>
+void
+AlphaFullCPU<Impl>::copyToXC()  // copies the renamed register state and the ROB head PC into xc
+{
+    PhysRegIndex renamed_reg;
+
+    // First loop through the integer registers.
+    for (int i = 0; i < AlphaISA::NumIntRegs; ++i)
+    {
+        renamed_reg = renameMap.lookup(i);  // physical register currently mapped to int reg i
+        xc->regs.intRegFile[i] = regFile.intRegFile[renamed_reg];
+        DPRINTF(FullCPU, "FullCPU: Copying register %i, has data %lli.\n",
+                renamed_reg, regFile.intRegFile[renamed_reg]);  // prints the physical index, not i
+    }
+
+    // Then loop through the floating point registers.
+    for (int i = 0; i < AlphaISA::NumFloatRegs; ++i)
+    {
+        renamed_reg = renameMap.lookup(i + AlphaISA::FP_Base_DepTag);  // FP regs are looked up at an offset
+        xc->regs.floatRegFile.d[i] = regFile.floatRegFile[renamed_reg].d;
+        xc->regs.floatRegFile.q[i] = regFile.floatRegFile[renamed_reg].q;
+    }
+
+    xc->regs.miscRegs.fpcr = regFile.miscRegs.fpcr;  // misc registers are copied field by field
+    xc->regs.miscRegs.uniq = regFile.miscRegs.uniq;
+    xc->regs.miscRegs.lock_flag = regFile.miscRegs.lock_flag;
+    xc->regs.miscRegs.lock_addr = regFile.miscRegs.lock_addr;
+
+    xc->regs.pc = rob.readHeadPC();
+    xc->regs.npc = xc->regs.pc+4;  // next PC is always assumed to be pc + 4 here
+
+    xc->func_exe_inst = funcExeInst;
+}
+
+// This function will probably mess things up unless the ROB is empty and
+// there are no instructions in the pipeline.
+template <class Impl>
+void
+AlphaFullCPU<Impl>::copyFromXC()
+{
+    PhysRegIndex renamed_reg;
+
+    // First loop through the integer registers.
+    for (int i = 0; i < AlphaISA::NumIntRegs; ++i)
+    {
+        renamed_reg = renameMap.lookup(i);  // physical register currently mapped to int reg i
+
+        DPRINTF(FullCPU, "FullCPU: Copying over register %i, had data %lli, "
+                "now has data %lli.\n",
+                renamed_reg, regFile.intRegFile[renamed_reg],
+                xc->regs.intRegFile[i]);
+
+        regFile.intRegFile[renamed_reg] = xc->regs.intRegFile[i];
+    }
+
+    // Then loop through the floating point registers.
+    for (int i = 0; i < AlphaISA::NumFloatRegs; ++i)
+    {
+        renamed_reg = renameMap.lookup(i + AlphaISA::FP_Base_DepTag);
+        regFile.floatRegFile[renamed_reg].d = xc->regs.floatRegFile.d[i];
+        regFile.floatRegFile[renamed_reg].q = xc->regs.floatRegFile.q[i] ;
+    }
+
+    // Then copy the misc registers back, field by field.
+    regFile.miscRegs.fpcr = xc->regs.miscRegs.fpcr;
+    regFile.miscRegs.uniq = xc->regs.miscRegs.uniq;
+    regFile.miscRegs.lock_flag = xc->regs.miscRegs.lock_flag;
+    regFile.miscRegs.lock_addr = xc->regs.miscRegs.lock_addr;
+
+    // Restoring the PC and the next PC is currently disabled (left commented out).
+//    regFile.pc = xc->regs.pc;
+//    regFile.npc = xc->regs.npc;
+
+    funcExeInst = xc->func_exe_inst;
+}
+
+#ifdef FULL_SYSTEM
+
+template <class Impl>
+uint64_t *
+AlphaFullCPU<Impl>::getIpr()
+{
+    return regs.ipr;  // the IPR array that readIpr()/setIpr() index by IPR number
+}
+
+template <class Impl>
+uint64_t
+AlphaFullCPU<Impl>::readIpr(int idx, Fault &fault)  // 'fault' is written only on failure, untouched on success
+{
+    uint64_t *ipr = getIpr();
+    uint64_t retval = 0;	// return value, default 0 (also what is returned on a fault)
+
+    switch (idx) {
+      case AlphaISA::IPR_PALtemp0:
+      case AlphaISA::IPR_PALtemp1:
+      case AlphaISA::IPR_PALtemp2:
+      case AlphaISA::IPR_PALtemp3:
+      case AlphaISA::IPR_PALtemp4:
+      case AlphaISA::IPR_PALtemp5:
+      case AlphaISA::IPR_PALtemp6:
+      case AlphaISA::IPR_PALtemp7:
+      case AlphaISA::IPR_PALtemp8:
+      case AlphaISA::IPR_PALtemp9:
+      case AlphaISA::IPR_PALtemp10:
+      case AlphaISA::IPR_PALtemp11:
+      case AlphaISA::IPR_PALtemp12:
+      case AlphaISA::IPR_PALtemp13:
+      case AlphaISA::IPR_PALtemp14:
+      case AlphaISA::IPR_PALtemp15:
+      case AlphaISA::IPR_PALtemp16:
+      case AlphaISA::IPR_PALtemp17:
+      case AlphaISA::IPR_PALtemp18:
+      case AlphaISA::IPR_PALtemp19:
+      case AlphaISA::IPR_PALtemp20:
+      case AlphaISA::IPR_PALtemp21:
+      case AlphaISA::IPR_PALtemp22:
+      case AlphaISA::IPR_PALtemp23:
+      case AlphaISA::IPR_PAL_BASE:
+
+      case AlphaISA::IPR_IVPTBR:
+      case AlphaISA::IPR_DC_MODE:
+      case AlphaISA::IPR_MAF_MODE:
+      case AlphaISA::IPR_ISR:
+      case AlphaISA::IPR_EXC_ADDR:
+      case AlphaISA::IPR_IC_PERR_STAT:
+      case AlphaISA::IPR_DC_PERR_STAT:
+      case AlphaISA::IPR_MCSR:
+      case AlphaISA::IPR_ASTRR:
+      case AlphaISA::IPR_ASTER:
+      case AlphaISA::IPR_SIRR:
+      case AlphaISA::IPR_ICSR:
+      case AlphaISA::IPR_ICM:
+      case AlphaISA::IPR_DTB_CM:
+      case AlphaISA::IPR_IPLR:
+      case AlphaISA::IPR_INTID:
+      case AlphaISA::IPR_PMCTR:
+        // no side-effect: return the stored value as is
+        retval = ipr[idx];
+        break;
+
+      case AlphaISA::IPR_CC:
+        retval |= ipr[idx] & ULL(0xffffffff00000000);  // upper 32 bits come from the stored IPR
+        retval |= curTick  & ULL(0x00000000ffffffff);  // lower 32 bits come from the current tick
+        break;
+
+      case AlphaISA::IPR_VA:
+        retval = ipr[idx];
+        break;
+
+      case AlphaISA::IPR_VA_FORM:
+      case AlphaISA::IPR_MM_STAT:
+      case AlphaISA::IPR_IFAULT_VA_FORM:
+      case AlphaISA::IPR_EXC_MASK:
+      case AlphaISA::IPR_EXC_SUM:
+        retval = ipr[idx];
+        break;
+
+      case AlphaISA::IPR_DTB_PTE:
+        {
+            AlphaISA::PTE &pte = dtb->index(!misspeculating());  // entry comes from the DTB, not from ipr[]
+
+            retval |= ((u_int64_t)pte.ppn & ULL(0x7ffffff)) << 32;
+            retval |= ((u_int64_t)pte.xre & ULL(0xf)) << 8;
+            retval |= ((u_int64_t)pte.xwe & ULL(0xf)) << 12;
+            retval |= ((u_int64_t)pte.fonr & ULL(0x1)) << 1;
+            retval |= ((u_int64_t)pte.fonw & ULL(0x1))<< 2;
+            retval |= ((u_int64_t)pte.asma & ULL(0x1)) << 4;
+            retval |= ((u_int64_t)pte.asn & ULL(0x7f)) << 57;
+        }
+        break;
+
+        // write only registers: reading them is reported as a fault
+      case AlphaISA::IPR_HWINT_CLR:
+      case AlphaISA::IPR_SL_XMIT:
+      case AlphaISA::IPR_DC_FLUSH:
+      case AlphaISA::IPR_IC_FLUSH:
+      case AlphaISA::IPR_ALT_MODE:
+      case AlphaISA::IPR_DTB_IA:
+      case AlphaISA::IPR_DTB_IAP:
+      case AlphaISA::IPR_ITB_IA:
+      case AlphaISA::IPR_ITB_IAP:
+        fault = Unimplemented_Opcode_Fault;
+        break;
+
+      default:
+        // invalid IPR
+        fault = Unimplemented_Opcode_Fault;
+        break;
+    }
+
+    return retval;
+}
+
+template <class Impl>
+Fault
+AlphaFullCPU<Impl>::setIpr(int idx, uint64_t val)  // returns No_Fault, or Unimplemented_Opcode_Fault for read-only/unknown IPRs
+{
+    uint64_t *ipr = getIpr();
+    uint64_t old;
+
+    if (misspeculating())  // while misspeculating the write is dropped and reported as success
+        return No_Fault;
+
+    switch (idx) {
+      case AlphaISA::IPR_PALtemp0:
+      case AlphaISA::IPR_PALtemp1:
+      case AlphaISA::IPR_PALtemp2:
+      case AlphaISA::IPR_PALtemp3:
+      case AlphaISA::IPR_PALtemp4:
+      case AlphaISA::IPR_PALtemp5:
+      case AlphaISA::IPR_PALtemp6:
+      case AlphaISA::IPR_PALtemp7:
+      case AlphaISA::IPR_PALtemp8:
+      case AlphaISA::IPR_PALtemp9:
+      case AlphaISA::IPR_PALtemp10:
+      case AlphaISA::IPR_PALtemp11:
+      case AlphaISA::IPR_PALtemp12:
+      case AlphaISA::IPR_PALtemp13:
+      case AlphaISA::IPR_PALtemp14:
+      case AlphaISA::IPR_PALtemp15:
+      case AlphaISA::IPR_PALtemp16:
+      case AlphaISA::IPR_PALtemp17:
+      case AlphaISA::IPR_PALtemp18:
+      case AlphaISA::IPR_PALtemp19:
+      case AlphaISA::IPR_PALtemp20:
+      case AlphaISA::IPR_PALtemp21:
+      case AlphaISA::IPR_PALtemp22:
+      case AlphaISA::IPR_PAL_BASE:
+      case AlphaISA::IPR_IC_PERR_STAT:
+      case AlphaISA::IPR_DC_PERR_STAT:
+      case AlphaISA::IPR_PMCTR:
+        // write entire quad w/ no side-effect
+        ipr[idx] = val;
+        break;
+
+      case AlphaISA::IPR_CC_CTL:
+        // This IPR resets the cycle counter.  We assume this only
+        // happens once... let's verify that.
+        assert(ipr[idx] == 0);
+        ipr[idx] = 1;
+        break;
+
+      case AlphaISA::IPR_CC:
+        // This IPR only writes the upper 32 bits.  It's ok to write
+        // all 64 here since we mask out the lower 32 in rpcc (see
+        // isa_desc).
+        ipr[idx] = val;
+        break;
+
+      case AlphaISA::IPR_PALtemp23:
+        // write entire quad, then report the old and new values to kernelStats
+        old = ipr[idx];
+        ipr[idx] = val;
+        kernelStats.context(old, val);
+        break;
+
+      case AlphaISA::IPR_DTB_PTE:
+        // write entire quad w/ no side-effect, tag is forthcoming
+        ipr[idx] = val;
+        break;
+
+      case AlphaISA::IPR_EXC_ADDR:
+        // second least significant bit in PC is always zero
+        ipr[idx] = val & ~2;
+        break;
+
+      case AlphaISA::IPR_ASTRR:
+      case AlphaISA::IPR_ASTER:
+        // only write least significant four bits - privilege mask
+        ipr[idx] = val & 0xf;
+        break;
+
+      case AlphaISA::IPR_IPLR:
+#ifdef DEBUG
+        if (break_ipl != -1 && break_ipl == (val & 0x1f))
+            debug_break();
+#endif
+
+        // only write least significant five bits - interrupt level
+        ipr[idx] = val & 0x1f;
+        kernelStats.swpipl(ipr[idx]);
+        break;
+
+      case AlphaISA::IPR_DTB_CM:
+        kernelStats.mode((val & 0x18) != 0);  // no break: falls through to the IPR_ICM store below
+
+      case AlphaISA::IPR_ICM:
+        // only write two mode bits - processor mode
+        ipr[idx] = val & 0x18;
+        break;
+
+      case AlphaISA::IPR_ALT_MODE:
+        // only write two mode bits - processor mode
+        ipr[idx] = val & 0x18;
+        break;
+
+      case AlphaISA::IPR_MCSR:
+        // more here after optimization...
+        ipr[idx] = val;
+        break;
+
+      case AlphaISA::IPR_SIRR:
+        // only write software interrupt mask
+        ipr[idx] = val & 0x7fff0;
+        break;
+
+      case AlphaISA::IPR_ICSR:
+        ipr[idx] = val & ULL(0xffffff0300);
+        break;
+
+      case AlphaISA::IPR_IVPTBR:
+      case AlphaISA::IPR_MVPTBR:
+        ipr[idx] = val & ULL(0xffffffffc0000000);
+        break;
+
+      case AlphaISA::IPR_DC_TEST_CTL:
+        ipr[idx] = val & 0x1ffb;
+        break;
+
+      case AlphaISA::IPR_DC_MODE:
+      case AlphaISA::IPR_MAF_MODE:
+        ipr[idx] = val & 0x3f;
+        break;
+
+      case AlphaISA::IPR_ITB_ASN:
+        ipr[idx] = val & 0x7f0;
+        break;
+
+      case AlphaISA::IPR_DTB_ASN:
+        ipr[idx] = val & ULL(0xfe00000000000000);
+        break;
+
+      case AlphaISA::IPR_EXC_SUM:
+      case AlphaISA::IPR_EXC_MASK:
+        // any write to this register clears it
+        ipr[idx] = 0;
+        break;
+
+      case AlphaISA::IPR_INTID:
+      case AlphaISA::IPR_SL_RCV:
+      case AlphaISA::IPR_MM_STAT:
+      case AlphaISA::IPR_ITB_PTE_TEMP:
+      case AlphaISA::IPR_DTB_PTE_TEMP:
+        // read-only registers
+        return Unimplemented_Opcode_Fault;
+
+      case AlphaISA::IPR_HWINT_CLR:
+      case AlphaISA::IPR_SL_XMIT:
+      case AlphaISA::IPR_DC_FLUSH:
+      case AlphaISA::IPR_IC_FLUSH:
+        // the following are write only
+        ipr[idx] = val;
+        break;
+
+      case AlphaISA::IPR_DTB_IA:
+        // really a control write
+        ipr[idx] = 0;
+
+        dtb->flushAll();
+        break;
+
+      case AlphaISA::IPR_DTB_IAP:
+        // really a control write
+        ipr[idx] = 0;
+
+        dtb->flushProcesses();
+        break;
+
+      case AlphaISA::IPR_DTB_IS:
+        // really a control write
+        ipr[idx] = val;
+
+        dtb->flushAddr(val, DTB_ASN_ASN(ipr[AlphaISA::IPR_DTB_ASN]));
+        break;
+
+      case AlphaISA::IPR_DTB_TAG: {
+          struct AlphaISA::PTE pte;
+
+          // FIXME: granularity hints NYI...
+          if (DTB_PTE_GH(ipr[AlphaISA::IPR_DTB_PTE]) != 0)
+              panic("PTE GH field != 0");
+
+          // write entire quad
+          ipr[idx] = val;
+
+          // construct PTE for new entry from the previously written IPR_DTB_PTE
+          pte.ppn = DTB_PTE_PPN(ipr[AlphaISA::IPR_DTB_PTE]);
+          pte.xre = DTB_PTE_XRE(ipr[AlphaISA::IPR_DTB_PTE]);
+          pte.xwe = DTB_PTE_XWE(ipr[AlphaISA::IPR_DTB_PTE]);
+          pte.fonr = DTB_PTE_FONR(ipr[AlphaISA::IPR_DTB_PTE]);
+          pte.fonw = DTB_PTE_FONW(ipr[AlphaISA::IPR_DTB_PTE]);
+          pte.asma = DTB_PTE_ASMA(ipr[AlphaISA::IPR_DTB_PTE]);
+          pte.asn = DTB_ASN_ASN(ipr[AlphaISA::IPR_DTB_ASN]);
+
+          // insert new TAG/PTE value into data TLB
+          dtb->insert(val, pte);
+      }
+        break;
+
+      case AlphaISA::IPR_ITB_PTE: {
+          struct AlphaISA::PTE pte;
+
+          // FIXME: granularity hints NYI...
+          if (ITB_PTE_GH(val) != 0)
+              panic("PTE GH field != 0");
+
+          // write entire quad
+          ipr[idx] = val;
+
+          // construct PTE for new entry directly from the value being written
+          pte.ppn = ITB_PTE_PPN(val);
+          pte.xre = ITB_PTE_XRE(val);
+          pte.xwe = 0;
+          pte.fonr = ITB_PTE_FONR(val);
+          pte.fonw = ITB_PTE_FONW(val);
+          pte.asma = ITB_PTE_ASMA(val);
+          pte.asn = ITB_ASN_ASN(ipr[AlphaISA::IPR_ITB_ASN]);
+
+          // insert new TAG/PTE value into instruction TLB
+          itb->insert(ipr[AlphaISA::IPR_ITB_TAG], pte);
+      }
+        break;
+
+      case AlphaISA::IPR_ITB_IA:
+        // really a control write
+        ipr[idx] = 0;
+
+        itb->flushAll();
+        break;
+
+      case AlphaISA::IPR_ITB_IAP:
+        // really a control write
+        ipr[idx] = 0;
+
+        itb->flushProcesses();
+        break;
+
+      case AlphaISA::IPR_ITB_IS:
+        // really a control write
+        ipr[idx] = val;
+
+        itb->flushAddr(val, ITB_ASN_ASN(ipr[AlphaISA::IPR_ITB_ASN]));
+        break;
+
+      default:
+        // invalid IPR
+        return Unimplemented_Opcode_Fault;
+    }
+
+    // no error...
+    return No_Fault;
+
+}
+
+template <class Impl>
+int
+AlphaFullCPU<Impl>::readIntrFlag()
+{
+    return regs.intrflag;  // plain read of the value stored by setIntrFlag()
+}
+
+template <class Impl>
+void
+AlphaFullCPU<Impl>::setIntrFlag(int val)
+{
+    regs.intrflag = val;  // plain store; read back through readIntrFlag()
+}
+
+// Maybe have this send back from IEW stage to squash and update PC.
+template <class Impl>
+Fault
+AlphaFullCPU<Impl>::hwrei()
+{
+    uint64_t *ipr = getIpr();
+
+    if (!PC_PAL(regs.pc))  // rejected unless the PC passes the PC_PAL() check
+        return Unimplemented_Opcode_Fault;
+
+    setNextPC(ipr[AlphaISA::IPR_EXC_ADDR]);  // execution resumes at the saved exception address
+
+    if (!misspeculating()) {  // stats and shadow-register swap are skipped while misspeculating
+        kernelStats.hwrei();
+
+        if ((ipr[AlphaISA::IPR_EXC_ADDR] & 1) == 0)  // bit 0 clear: swap the PAL shadow registers back out
+            AlphaISA::swap_palshadow(&regs, false);
+
+        AlphaISA::check_interrupts = true;
+    }
+
+    // FIXME: XXX check for interrupts? XXX
+    return No_Fault;
+}
+
+template <class Impl>
+bool
+AlphaFullCPU<Impl>::inPalMode()
+{
+    return PC_PAL(readPC());  // result of the PC_PAL() check on the PC from readPC()
+}
+
+template <class Impl>
+bool
+AlphaFullCPU<Impl>::simPalCheck(int palFunc)  // returns false only when system->breakpoint() does for bpt/bugchk
+{
+    kernelStats.callpal(palFunc);  // every PAL call is recorded, whatever its function code
+
+    switch (palFunc) {
+      case PAL::halt:
+        halt();
+        if (--System::numSystemsRunning == 0)  // last running system: create the exit event
+            new SimExitEvent("all cpus halted");
+        break;
+
+      case PAL::bpt:
+      case PAL::bugchk:
+        if (system->breakpoint())
+            return false;
+        break;
+    }
+
+    return true;  // all other PAL functions fall out of the switch to here
+}
+
+// Probably shouldn't be able to switch to the trap handler as quickly as
+// this.  Also needs to get the exception restart address from the commit
+// stage.
+template <class Impl>
+void
+AlphaFullCPU<Impl>::trap(Fault fault)
+{
+    uint64_t PC = commit.readPC();  // PC reported by the commit stage at the time of the fault
+
+    DPRINTF(Fault, "Fault %s\n", FaultName(fault));
+    Stats::recordEvent(csprintf("Fault %s", FaultName(fault)));
+
+    assert(!misspeculating());
+    kernelStats.fault(fault);
+
+    if (fault == Arithmetic_Fault)
+        panic("Arithmetic traps are unimplemented!");
+
+    AlphaISA::InternalProcReg *ipr = getIpr();
+
+    // exception restart address - Get the commit PC
+    if (fault != Interrupt_Fault || !PC_PAL(PC))
+        ipr[AlphaISA::IPR_EXC_ADDR] = PC;
+
+    if (fault == Pal_Fault || fault == Arithmetic_Fault /* ||
+        fault == Interrupt_Fault && !PC_PAL(regs.pc) */) {
+        // traps...  skip faulting instruction
+        ipr[AlphaISA::IPR_EXC_ADDR] += 4;
+    }
+
+    if (!PC_PAL(PC))  // PC fails the PC_PAL() check: swap the PAL shadow registers in
+        AlphaISA::swap_palshadow(&regs, true);
+
+    setPC( ipr[AlphaISA::IPR_PAL_BASE] + AlphaISA::fault_addr[fault] );  // jump to the handler for this fault
+    setNextPC(PC + sizeof(MachInst));  // NOTE(review): uses the old commit PC, not the new handler PC - confirm
+}
+
+template <class Impl>
+void
+AlphaFullCPU<Impl>::processInterrupts()
+{
+    // TODO: not implemented yet; this body is empty.  Check for interrupts
+    // here.  For now can copy the code that exists in isa_fullsys_traits.hh.
+}
+
+// swap_palshadow swaps in the values of the shadow registers and
+// swaps them with the values of the physical registers that map to the
+// same logical index.
+template <class Impl>
+void
+AlphaFullCPU<Impl>::swap_palshadow(RegFile *regs, bool use_shadow)
+{
+    if (palShadowEnabled == use_shadow)  // the call must actually change the state, else panic
+        panic("swap_palshadow: wrong PAL shadow state");
+
+    palShadowEnabled = use_shadow;
+
+    // TODO: will have to lookup in rename map to get physical registers, then
+    // swap.  For now the swap below indexes regs->intRegFile directly by i.
+    for (int i = 0; i < AlphaISA::NumIntRegs; i++) {
+        if (reg_redir[i]) {  // only registers flagged in reg_redir are exchanged
+            AlphaISA::IntReg temp = regs->intRegFile[i];
+            regs->intRegFile[i] = regs->palregs[i];
+            regs->palregs[i] = temp;
+        }
+    }
+}
+
+#endif // FULL_SYSTEM
diff --git a/cpu/beta_cpu/alpha_impl.hh b/cpu/beta_cpu/alpha_impl.hh
index a80b116a8..fc86dacd7 100644
--- a/cpu/beta_cpu/alpha_impl.hh
+++ b/cpu/beta_cpu/alpha_impl.hh
@@ -3,23 +3,14 @@
 
 #include "arch/alpha/isa_traits.hh"
 
-#include "cpu/beta_cpu/comm.hh"
 #include "cpu/beta_cpu/cpu_policy.hh"
 #include "cpu/beta_cpu/alpha_params.hh"
 
-#include "cpu/beta_cpu/commit.hh"
-#include "cpu/beta_cpu/decode.hh"
-#include "cpu/beta_cpu/fetch.hh"
-#include "cpu/beta_cpu/free_list.hh"
-#include "cpu/beta_cpu/iew.hh"
-
-#include "cpu/beta_cpu/inst_queue.hh"
-#include "cpu/beta_cpu/regfile.hh"
-#include "cpu/beta_cpu/rename.hh"
-#include "cpu/beta_cpu/rename_map.hh"
-#include "cpu/beta_cpu/rob.hh"
-
+// Forward declarations.
+template <class Impl>
 class AlphaDynInst;
+
+template <class Impl>
 class AlphaFullCPU;
 
 /** Implementation specific struct that defines several key things to the
@@ -42,33 +33,22 @@ struct AlphaSimpleImpl
     typedef SimpleCPUPolicy<AlphaSimpleImpl> CPUPol;
 
     /** The DynInst to be used. */
-    typedef AlphaDynInst DynInst;
+    typedef AlphaDynInst<AlphaSimpleImpl> DynInst;
+
+    /** The refcounted DynInst pointer to be used.  In most cases this is
+     *  what should be used, and not DynInst *.
+     */
+    typedef RefCountingPtr<DynInst> DynInstPtr;
 
     /** The FullCPU to be used. */
-    typedef AlphaFullCPU FullCPU;
+    typedef AlphaFullCPU<AlphaSimpleImpl> FullCPU;
 
     /** The Params to be passed to each stage. */
     typedef AlphaSimpleParams Params;
 
-    /** The struct for communication between fetch and decode. */
-    typedef SimpleFetchSimpleDecode<AlphaSimpleImpl> FetchStruct;
-
-    /** The struct for communication between decode and rename. */
-    typedef SimpleDecodeSimpleRename<AlphaSimpleImpl> DecodeStruct;
-
-    /** The struct for communication between rename and IEW. */
-    typedef SimpleRenameSimpleIEW<AlphaSimpleImpl> RenameStruct;
-
-    /** The struct for communication between IEW and commit. */
-    typedef SimpleIEWSimpleCommit<AlphaSimpleImpl> IEWStruct;
-
-    /** The struct for communication within the IEW stage. */
-    typedef IssueStruct<AlphaSimpleImpl> IssueStruct;
-
-    /** The struct for all backwards communication. */
-    typedef TimeBufStruct TimeStruct;
+    enum {
+        MaxWidth = 8
+    };
 };
 
-
-
 #endif // __ALPHA_IMPL_HH__
diff --git a/cpu/beta_cpu/alpha_params.hh b/cpu/beta_cpu/alpha_params.hh
index b217ef8e3..92dfd35f5 100644
--- a/cpu/beta_cpu/alpha_params.hh
+++ b/cpu/beta_cpu/alpha_params.hh
@@ -1,6 +1,8 @@
 #ifndef __ALPHA_SIMPLE_PARAMS_HH__
 #define __ALPHA_SIMPLE_PARAMS_HH__
 
+#include "cpu/beta_cpu/full_cpu.hh"
+
 //Forward declarations
 class System;
 class AlphaITB;
@@ -15,16 +17,11 @@ class MemInterface;
  * defined that it can pass to all of the individual stages.
  */
 
-class AlphaSimpleParams
+class AlphaSimpleParams : public BaseFullCPU::Params
 {
   public:
-    std::string name;
-    int numberOfThreads;
-
 #ifdef FULL_SYSTEM
-    System *_system;
     AlphaITB *itb; AlphaDTB *dtb;
-    Tick freq;
 #else
     std::vector<Process *> workload;
     Process *process;
@@ -33,34 +30,41 @@ class AlphaSimpleParams
 
     FunctionalMemory *mem;
 
-    Counter maxInstsAnyThread;
-    Counter maxInstsAllThreads;
-    Counter maxLoadsAnyThread;
-    Counter maxLoadsAllThreads;
-
     //
     // Caches
     //
     MemInterface *icacheInterface;
     MemInterface *dcacheInterface;
 
+    //
+    // Fetch
+    //
     unsigned decodeToFetchDelay;
     unsigned renameToFetchDelay;
     unsigned iewToFetchDelay;
     unsigned commitToFetchDelay;
     unsigned fetchWidth;
 
+    //
+    // Decode
+    //
     unsigned renameToDecodeDelay;
     unsigned iewToDecodeDelay;
     unsigned commitToDecodeDelay;
     unsigned fetchToDecodeDelay;
     unsigned decodeWidth;
 
+    //
+    // Rename
+    //
     unsigned iewToRenameDelay;
     unsigned commitToRenameDelay;
     unsigned decodeToRenameDelay;
     unsigned renameWidth;
 
+    //
+    // IEW
+    //
     unsigned commitToIEWDelay;
     unsigned renameToIEWDelay;
     unsigned issueToExecuteDelay;
@@ -69,16 +73,39 @@ class AlphaSimpleParams
     unsigned executeIntWidth;
     unsigned executeFloatWidth;
 
+    //
+    // Commit
+    //
     unsigned iewToCommitDelay;
     unsigned renameToROBDelay;
     unsigned commitWidth;
     unsigned squashWidth;
 
+    //
+    // Branch predictor (BP & BTB)
+    //
+    unsigned localPredictorSize;
+    unsigned localPredictorCtrBits;
+    unsigned BTBEntries;
+    unsigned BTBTagSize;
+
+    //
+    // Load store queue
+    //
+    unsigned LQEntries;
+    unsigned SQEntries;
+
+    //
+    // Miscellaneous
+    //
     unsigned numPhysIntRegs;
     unsigned numPhysFloatRegs;
     unsigned numIQEntries;
     unsigned numROBEntries;
 
+    // Probably can get this from somewhere.
+    unsigned instShiftAmt;
+
     bool defReg;
 };
 
diff --git a/cpu/beta_cpu/bpred_unit.cc b/cpu/beta_cpu/bpred_unit.cc
new file mode 100644
index 000000000..6de2def44
--- /dev/null
+++ b/cpu/beta_cpu/bpred_unit.cc
@@ -0,0 +1,5 @@
+
+#include "cpu/beta_cpu/bpred_unit_impl.hh"
+#include "cpu/beta_cpu/alpha_impl.hh"
+
+template class DefaultBPredUnit<AlphaSimpleImpl>; // explicit instantiation
diff --git a/cpu/beta_cpu/bpred_unit.hh b/cpu/beta_cpu/bpred_unit.hh
new file mode 100644
index 000000000..71191f5b7
--- /dev/null
+++ b/cpu/beta_cpu/bpred_unit.hh
@@ -0,0 +1,51 @@
+
+#ifndef __BPRED_UNIT_HH__
+#define __BPRED_UNIT_HH__
+
+// For Addr type.
+#include "arch/alpha/isa_traits.hh"
+
+#include "cpu/beta_cpu/2bit_local_pred.hh"
+#include "cpu/beta_cpu/btb.hh"
+
+/**
+ * Basically a wrapper class to hold both the branch predictor
+ * and the BTB.  Right now I'm unsure of the implementation; it would
+ * be nicer to have something closer to the CPUPolicy or the Impl where
+ * this is just typedefs, but it forces the upper level stages to be
+ * aware of the constructors of the BP and the BTB.  The nicer thing
+ * to do is have this templated on the Impl, accept the usual Params
+ * object, and be able to call the constructors on the BP and BTB.
+ */
+template<class Impl>
+class DefaultBPredUnit
+{
+  public:
+    typedef typename Impl::Params Params;
+    // Builds BP and BTB from the sizing fields carried in params.
+    DefaultBPredUnit(Params &params);
+    // Forwards to BP.lookup(); presumably true means "predicted taken".
+    bool BPLookup(Addr &inst_PC)
+    { return BP.lookup(inst_PC); }
+    // True if the BTB holds a valid entry whose tag matches inst_PC.
+    bool BTBValid(Addr &inst_PC)
+    { return BTB.valid(inst_PC); }
+    // Target stored in the BTB for inst_PC, or 0 if there is no match.
+    Addr BTBLookup(Addr &inst_PC)
+    { return BTB.lookup(inst_PC); }
+    // Passes inst_PC and its taken/not-taken flag through to BP.update().
+    void BPUpdate(Addr &inst_PC, bool taken)
+    { BP.update(inst_PC, taken); }
+    // Records target_PC as the BTB target for inst_PC.
+    void BTBUpdate(Addr &inst_PC, Addr &target_PC)
+    { BTB.update(inst_PC, target_PC); }
+
+  private:
+    // The branch predictor consulted by BPLookup() and BPUpdate().
+    DefaultBP BP;
+    // The branch target buffer consulted by the BTB*() methods.
+    DefaultBTB BTB;
+
+};
+
+#endif // __BPRED_UNIT_HH__
diff --git a/cpu/beta_cpu/bpred_unit_impl.hh b/cpu/beta_cpu/bpred_unit_impl.hh
new file mode 100644
index 000000000..47415ce9b
--- /dev/null
+++ b/cpu/beta_cpu/bpred_unit_impl.hh
@@ -0,0 +1,13 @@
+
+#include "cpu/beta_cpu/bpred_unit.hh"
+
+template<class Impl>
+DefaultBPredUnit<Impl>::DefaultBPredUnit(Params &params)
+  : BP(params.localPredictorSize,      // predictor sized from Params
+       params.localPredictorCtrBits,
+       params.instShiftAmt),           // same shift amount is given to BTB
+    BTB(params.BTBEntries,             // BTB sized from Params
+        params.BTBTagSize,
+        params.instShiftAmt)
+{ // No further setup: all work happens in the initializer list.
+}
diff --git a/cpu/beta_cpu/btb.cc b/cpu/beta_cpu/btb.cc
new file mode 100644
index 000000000..b49f30482
--- /dev/null
+++ b/cpu/beta_cpu/btb.cc
@@ -0,0 +1,85 @@
+#include <math.h>
+
+#include "cpu/beta_cpu/btb.hh"
+#include "base/trace.hh"
+
+DefaultBTB::DefaultBTB(unsigned _numEntries,
+                       unsigned _tagBits,
+                       unsigned _instShiftAmt)
+    : numEntries(_numEntries),
+      tagBits(_tagBits),
+      instShiftAmt(_instShiftAmt)
+{
+    // @todo Check to make sure numEntries is valid (a power of 2); the
+    // index mask and tag shift below are only meaningful in that case.
+    DPRINTF(Fetch, "BTB: Creating BTB object.\n");
+
+    // BTBEntry's default constructor already marks every entry invalid.
+    btb = new BTBEntry[numEntries];
+
+    idxMask = numEntries - 1;
+
+    // Avoid the undefined shift when the tag is as wide as the mask type.
+    tagMask = (tagBits >= 8 * sizeof(tagMask)) ? ~0u : (1u << tagBits) - 1;
+
+    // Integer floor(log2(numEntries)); avoids floating point rounding.
+    unsigned idx_bits = 0;
+    while ((numEntries >> idx_bits) > 1) ++idx_bits;
+    tagShiftAmt = instShiftAmt + idx_bits;
+}
+
+inline unsigned
+DefaultBTB::getIndex(const Addr &inst_PC)
+{
+    // The PC is first shifted over by the word offset; the bits that
+    // remain under idxMask select the table slot.
+    return idxMask & (inst_PC >> instShiftAmt);
+}
+
+inline Addr
+DefaultBTB::getTag(const Addr &inst_PC)
+{
+    // Shift the PC down by tagShiftAmt and keep only the tagMask bits.
+    return tagMask & (inst_PC >> tagShiftAmt);
+}
+
+bool
+DefaultBTB::valid(const Addr &inst_PC)
+{
+    // Fetch the slot selected by this PC's index bits.
+    const BTBEntry &entry = btb[getIndex(inst_PC)];
+
+    // An invalid slot is a miss no matter what tag it holds.
+    if (!entry.valid) {
+        return false;
+    }
+
+    return entry.tag == getTag(inst_PC);
+}
+
+// @todo Create some sort of return struct that has both whether or not the
+// address is valid, and also the address.  For now will just use addr = 0 to
+// represent invalid entry.
+Addr
+DefaultBTB::lookup(const Addr &inst_PC)
+{
+    const BTBEntry &entry = btb[getIndex(inst_PC)];
+
+    // A hit needs a valid entry whose stored tag equals this PC's tag.
+    const bool hit = entry.valid && entry.tag == getTag(inst_PC);
+    // Address 0 stands in for "no entry" on a miss.
+    if (!hit) {
+        return 0;
+    }
+    return entry.target;
+}
+
+void
+DefaultBTB::update(const Addr &inst_PC, const Addr &target)
+{
+    // Overwrite whatever entry currently occupies this PC's slot.
+    BTBEntry &entry = btb[getIndex(inst_PC)];
+    entry.valid = true;
+    entry.target = target;
+    entry.tag = getTag(inst_PC);
+}
diff --git a/cpu/beta_cpu/btb.hh b/cpu/beta_cpu/btb.hh
new file mode 100644
index 000000000..81069eabe
--- /dev/null
+++ b/cpu/beta_cpu/btb.hh
@@ -0,0 +1,52 @@
+#ifndef __BTB_HH__
+#define __BTB_HH__
+
+// For Addr type.
+#include "arch/alpha/isa_traits.hh"
+
+class DefaultBTB
+{ // Tagged table that maps an instruction PC to a recorded target address.
+  private:
+    struct BTBEntry
+    {
+        BTBEntry()
+            : tag(0), target(0), valid(false)
+        {
+        }
+        // tag: tag bits of the PC that wrote this entry; target: its target.
+        Addr tag;
+        Addr target;
+        bool valid;
+    };
+
+  public:
+    DefaultBTB(unsigned numEntries, unsigned tagBits,
+               unsigned instShiftAmt);
+    // Returns the stored target for inst_PC, or 0 when there is no match.
+    Addr lookup(const Addr &inst_PC);
+    // True when the indexed entry is valid and its tag matches inst_PC.
+    bool valid(const Addr &inst_PC);
+    // Installs target_PC for inst_PC, replacing the indexed entry.
+    void update(const Addr &inst_PC, const Addr &target_PC);
+
+  private:
+    inline unsigned getIndex(const Addr &inst_PC); // table slot for a PC
+    // Tag field of a PC: (PC >> tagShiftAmt) & tagMask.
+    inline Addr getTag(const Addr &inst_PC);
+    // Array of numEntries entries, allocated in the constructor.
+    BTBEntry *btb;
+    // Number of entries in btb.
+    unsigned numEntries;
+    // numEntries - 1; selects index bits (assumes a power-of-two size).
+    unsigned idxMask;
+    // Width of the tag, in bits.
+    unsigned tagBits;
+    // Mask with the low tagBits bits set.
+    unsigned tagMask;
+    // Shift applied to a PC before indexing (skips the word offset).
+    unsigned instShiftAmt;
+    // instShiftAmt + log2(numEntries); shift applied before tag masking.
+    unsigned tagShiftAmt;
+};
+
+#endif // __BTB_HH__
diff --git a/cpu/beta_cpu/comm.hh b/cpu/beta_cpu/comm.hh
index 21a530ecf..849a6c797 100644
--- a/cpu/beta_cpu/comm.hh
+++ b/cpu/beta_cpu/comm.hh
@@ -2,6 +2,7 @@
 #define __COMM_HH__
 
 #include <stdint.h>
+#include <vector>
 #include "arch/alpha/isa_traits.hh"
 #include "cpu/inst_seq.hh"
 
@@ -10,34 +11,49 @@ using namespace std;
 // Find better place to put this typedef.
 typedef short int PhysRegIndex;
 
-// Might want to put constructors/destructors here.
 template<class Impl>
 struct SimpleFetchSimpleDecode {
-    // Consider having a field of how many ready instructions.
-    typename Impl::DynInst *insts[1];
+    typedef typename Impl::DynInstPtr DynInstPtr;
+
+    int size;
+
+    DynInstPtr insts[Impl::MaxWidth + 1];
 };
 
 template<class Impl>
 struct SimpleDecodeSimpleRename {
-    // Consider having a field of how many ready instructions.
-    typename Impl::DynInst *insts[1];
+    typedef typename Impl::DynInstPtr DynInstPtr;
+
+    int size;
+
+    DynInstPtr insts[Impl::MaxWidth + 1];
 };
 
 template<class Impl>
 struct SimpleRenameSimpleIEW {
-    // Consider having a field of how many ready instructions.
-    typename Impl::DynInst *insts[1];
+    typedef typename Impl::DynInstPtr DynInstPtr;
+
+    int size;
+
+    DynInstPtr insts[Impl::MaxWidth + 1];
 };
 
 template<class Impl>
 struct SimpleIEWSimpleCommit {
-    // Consider having a field of how many ready instructions.
-    typename Impl::DynInst *insts[1];
+    typedef typename Impl::DynInstPtr DynInstPtr;
+
+    int size;
+
+    DynInstPtr insts[Impl::MaxWidth + 1];
 };
 
 template<class Impl>
 struct IssueStruct {
-    typename Impl::DynInst *insts[1];
+    typedef typename Impl::DynInstPtr DynInstPtr;
+
+    int size;
+
+    DynInstPtr insts[Impl::MaxWidth + 1];
 };
 
 struct TimeBufStruct {
@@ -47,11 +63,9 @@ struct TimeBufStruct {
         bool predIncorrect;
         uint64_t branchAddr;
 
-        //Question, is it worthwhile to have this Addr passed along
-        //by each stage, or just have Fetch look it up in the proper
-        //amount of cycles in the time buffer?
-        //Both might actually be needed because decode can send a different
-        //nextPC if the bpred was wrong.
+        bool branchMispredict;
+        bool branchTaken;
+        uint64_t mispredPC;
         uint64_t nextPC;
     };
 
@@ -72,14 +86,14 @@ struct TimeBufStruct {
     struct iewComm {
         bool squash;
         bool stall;
-        bool predIncorrect;
 
         // Also eventually include skid buffer space.
         unsigned freeIQEntries;
 
+        bool branchMispredict;
+        bool branchTaken;
+        uint64_t mispredPC;
         uint64_t nextPC;
-        // For now hardcode the type.
-        // Change this to sequence number eventually.
         InstSeqNum squashedSeqNum;
     };
 
@@ -90,18 +104,31 @@ struct TimeBufStruct {
         bool stall;
         unsigned freeROBEntries;
 
+        bool branchMispredict;
+        bool branchTaken;
+        uint64_t mispredPC;
         uint64_t nextPC;
 
         // Think of better names here.
         // Will need to be a variety of sizes...
         // Maybe make it a vector, that way only need one object.
-        vector<PhysRegIndex> freeRegs;
+        std::vector<PhysRegIndex> freeRegs;
 
         bool robSquashing;
+
         // Represents the instruction that has either been retired or
         // squashed.  Similar to having a single bus that broadcasts the
         // retired or squashed sequence number.
         InstSeqNum doneSeqNum;
+
+        // Extra bits of information so that the LDSTQ only updates when it
+        // needs to.
+        bool commitIsStore;
+        bool commitIsLoad;
+
+        // Communication specifically to the IQ to tell the IQ that it can
+        // schedule a non-speculative instruction.
+        InstSeqNum nonSpecSeqNum;
     };
 
     commitComm commitInfo;
diff --git a/cpu/beta_cpu/commit.hh b/cpu/beta_cpu/commit.hh
index 0e5a96e2a..981d9e78f 100644
--- a/cpu/beta_cpu/commit.hh
+++ b/cpu/beta_cpu/commit.hh
@@ -1,6 +1,4 @@
-// Todo: Squash properly.  Have commit be able to send a squash signal
-// to previous stages; will be needed when trap() is implemented.
-// Maybe have a special method for handling interrupts/traps.
+// Todo: Maybe have a special method for handling interrupts/traps.
 //
 // Traps:  Have IEW send a signal to commit saying that there's a trap to
 // be handled.  Have commit send the PC back to the fetch stage, along
@@ -17,12 +15,11 @@
 #ifndef __SIMPLE_COMMIT_HH__
 #define __SIMPLE_COMMIT_HH__
 
-//Includes: ROB, time buffer, structs, memory interface
-#include "arch/alpha/isa_traits.hh"
+//#include "arch/alpha/isa_traits.hh"
 #include "base/timebuf.hh"
-#include "cpu/beta_cpu/comm.hh"
-#include "cpu/beta_cpu/rename_map.hh"
-#include "cpu/beta_cpu/rob.hh"
+//#include "cpu/beta_cpu/comm.hh"
+//#include "cpu/beta_cpu/rename_map.hh"
+//#include "cpu/beta_cpu/rob.hh"
 #include "mem/memory_interface.hh"
 
 template<class Impl>
@@ -32,14 +29,15 @@ class SimpleCommit
     // Typedefs from the Impl.
     typedef typename Impl::ISA ISA;
     typedef typename Impl::FullCPU FullCPU;
-    typedef typename Impl::DynInst DynInst;
+    typedef typename Impl::DynInstPtr DynInstPtr;
     typedef typename Impl::Params Params;
+    typedef typename Impl::CPUPol CPUPol;
 
-    typedef typename Impl::CPUPol::ROB ROB;
+    typedef typename CPUPol::ROB ROB;
 
-    typedef typename Impl::TimeStruct TimeStruct;
-    typedef typename Impl::IEWStruct IEWStruct;
-    typedef typename Impl::RenameStruct RenameStruct;
+    typedef typename CPUPol::TimeStruct TimeStruct;
+    typedef typename CPUPol::IEWStruct IEWStruct;
+    typedef typename CPUPol::RenameStruct RenameStruct;
 
   public:
     // I don't believe commit can block, so it will only have two
@@ -83,7 +81,7 @@ class SimpleCommit
 
     void commitInsts();
 
-    bool commitHead(DynInst *head_inst, unsigned inst_num);
+    bool commitHead(DynInstPtr &head_inst, unsigned inst_num);
 
     void getInsts();
 
@@ -117,7 +115,7 @@ class SimpleCommit
     FullCPU *cpu;
 
     /** Pointer to the rename map.  DO NOT USE if possible. */
-    typename Impl::CPUPol::RenameMap *renameMap;
+//    typename Impl::CPUPol::RenameMap *renameMap;
 
     //Store buffer interface?  Will need to move committed stores to the
     //store buffer
diff --git a/cpu/beta_cpu/commit_impl.hh b/cpu/beta_cpu/commit_impl.hh
index bc8db0ce0..45b8bc7de 100644
--- a/cpu/beta_cpu/commit_impl.hh
+++ b/cpu/beta_cpu/commit_impl.hh
@@ -9,7 +9,7 @@
 #include "cpu/beta_cpu/commit.hh"
 #include "cpu/exetrace.hh"
 
-template<class Impl>
+template <class Impl>
 SimpleCommit<Impl>::SimpleCommit(Params &params)
     : dcacheInterface(params.dcacheInterface),
       iewToCommitDelay(params.iewToCommitDelay),
@@ -21,7 +21,7 @@ SimpleCommit<Impl>::SimpleCommit(Params &params)
     _status = Idle;
 }
 
-template<class Impl>
+template <class Impl>
 void
 SimpleCommit<Impl>::setCPU(FullCPU *cpu_ptr)
 {
@@ -29,7 +29,7 @@ SimpleCommit<Impl>::setCPU(FullCPU *cpu_ptr)
     cpu = cpu_ptr;
 }
 
-template<class Impl>
+template <class Impl>
 void
 SimpleCommit<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr)
 {
@@ -43,7 +43,7 @@ SimpleCommit<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr)
     robInfoFromIEW = timeBuffer->getWire(-iewToCommitDelay);
 }
 
-template<class Impl>
+template <class Impl>
 void
 SimpleCommit<Impl>::setRenameQueue(TimeBuffer<RenameStruct> *rq_ptr)
 {
@@ -54,7 +54,7 @@ SimpleCommit<Impl>::setRenameQueue(TimeBuffer<RenameStruct> *rq_ptr)
     fromRename = renameQueue->getWire(-renameToROBDelay);
 }
 
-template<class Impl>
+template <class Impl>
 void
 SimpleCommit<Impl>::setIEWQueue(TimeBuffer<IEWStruct> *iq_ptr)
 {
@@ -65,7 +65,7 @@ SimpleCommit<Impl>::setIEWQueue(TimeBuffer<IEWStruct> *iq_ptr)
     fromIEW = iewQueue->getWire(-iewToCommitDelay);
 }
 
-template<class Impl>
+template <class Impl>
 void
 SimpleCommit<Impl>::setROB(ROB *rob_ptr)
 {
@@ -73,7 +73,7 @@ SimpleCommit<Impl>::setROB(ROB *rob_ptr)
     rob = rob_ptr;
 }
 
-template<class Impl>
+template <class Impl>
 void
 SimpleCommit<Impl>::tick()
 {
@@ -106,7 +106,7 @@ SimpleCommit<Impl>::tick()
     toIEW->commitInfo.freeROBEntries = rob->numFreeEntries();
 }
 
-template<class Impl>
+template <class Impl>
 void
 SimpleCommit<Impl>::commit()
 {
@@ -154,17 +154,30 @@ SimpleCommit<Impl>::commit()
 
         // Send back the sequence number of the squashed instruction.
         toIEW->commitInfo.doneSeqNum = squashed_inst;
+
         // Send back the squash signal to tell stages that they should squash.
         toIEW->commitInfo.squash = true;
+
         // Send back the rob squashing signal so other stages know that the
         // ROB is in the process of squashing.
         toIEW->commitInfo.robSquashing = true;
+
+        toIEW->commitInfo.branchMispredict =
+            robInfoFromIEW->iewInfo.branchMispredict;
+
+        toIEW->commitInfo.branchTaken =
+            robInfoFromIEW->iewInfo.branchTaken;
+
         toIEW->commitInfo.nextPC = robInfoFromIEW->iewInfo.nextPC;
+
+        toIEW->commitInfo.mispredPC = robInfoFromIEW->iewInfo.mispredPC;
     }
 
     if (_status != ROBSquashing) {
+        // If we're not currently squashing, then get instructions.
         getInsts();
 
+        // Try to commit any instructions.
         commitInsts();
     }
 
@@ -183,7 +196,7 @@ SimpleCommit<Impl>::commit()
 // Loop that goes through as many instructions in the ROB as possible and
 // tries to commit them.  The actual work for committing is done by the
 // commitHead() function.
-template<class Impl>
+template <class Impl>
 void
 SimpleCommit<Impl>::commitInsts()
 {
@@ -195,7 +208,7 @@ SimpleCommit<Impl>::commitInsts()
     // Can't commit and squash things at the same time...
     ////////////////////////////////////
 
-    DynInst *head_inst = rob->readHeadInst();
+    DynInstPtr head_inst = rob->readHeadInst();
 
     unsigned num_committed = 0;
 
@@ -224,12 +237,12 @@ SimpleCommit<Impl>::commitInsts()
             // inst in the ROB without affecting any other stages.
             rob->retireHead();
 
-            ++num_committed;
         } else {
             // Increment the total number of non-speculative instructions
             // executed.
             // Hack for now: it really shouldn't happen until after the
-            // commit is deemed to be successful.
+            // commit is deemed to be successful, but this count is needed
+            // for syscalls.
             cpu->funcExeInst++;
 
             // Try to commit the head instruction.
@@ -256,9 +269,9 @@ SimpleCommit<Impl>::commitInsts()
     }
 }
 
-template<class Impl>
+template <class Impl>
 bool
-SimpleCommit<Impl>::commitHead(DynInst *head_inst, unsigned inst_num)
+SimpleCommit<Impl>::commitHead(DynInstPtr &head_inst, unsigned inst_num)
 {
     // Make sure instruction is valid
     assert(head_inst);
@@ -271,21 +284,26 @@ SimpleCommit<Impl>::commitHead(DynInst *head_inst, unsigned inst_num)
     // Also check if it's nonspeculative.  Or a nop.  Then it will be
     // executed only when it reaches the head of the ROB.  Actually
     // executing a nop is a bit overkill...
-    if (head_inst->isStore() ||
-        head_inst->isLoad() ||
-        head_inst->isNonSpeculative() ||
-        head_inst->isNop()) {
-        DPRINTF(Commit, "Commit: Executing a memory reference or "
-                "nonspeculative instruction at commit, inst PC %#x\n",
-                head_inst->PC);
-        fault = head_inst->execute();
-
-        // Tell CPU to tell IEW to tell IQ (nasty chain of calls) that
-        // this instruction has completed.  Could predicate this on
-        // whether or not the instruction has a destination.
-        // Slightly unrealistic, but will not really be a factor once
-        // a real load/store queue is added.
-        cpu->wakeDependents(head_inst);
+    if (!head_inst->isExecuted()) {
+        // Keep this number correct.  We have not yet actually executed
+        // and committed this instruction.
+        cpu->funcExeInst--;
+        if (head_inst->isStore() || head_inst->isNonSpeculative()) {
+            DPRINTF(Commit, "Commit: Encountered a store or non-speculative "
+                    "instruction at the head of the ROB, PC %#x.\n",
+                    head_inst->readPC());
+
+            toIEW->commitInfo.nonSpecSeqNum = head_inst->seqNum;
+
+            // Change the instruction so it won't try to commit again until
+            // it is executed.
+            head_inst->clearCanCommit();
+
+            return false;
+        } else {
+            panic("Commit: Trying to commit un-executed instruction "
+                  "of unknown type!\n");
+        }
     }
 
     // Check if memory access was successful.
@@ -320,8 +338,10 @@ SimpleCommit<Impl>::commitHead(DynInst *head_inst, unsigned inst_num)
 #ifdef FULL_SYSTEM
         cpu->trap(fault);
 #else // !FULL_SYSTEM
-        panic("fault (%d) detected @ PC %08p", head_inst->getFault(),
-              head_inst->PC);
+        if (!head_inst->isNop()) {
+            panic("fault (%d) detected @ PC %08p", head_inst->getFault(),
+                  head_inst->PC);
+        }
 #endif // FULL_SYSTEM
     }
 
@@ -333,8 +353,8 @@ SimpleCommit<Impl>::commitHead(DynInst *head_inst, unsigned inst_num)
         return false;
     }
 
-    //If it's a branch, then send back branch prediction update info
-    //to the fetch stage.
+    // If it's a branch, then send back branch prediction update info
+    // to the fetch stage.
     // This should be handled in the iew stage if a mispredict happens...
 #if 0
     if (head_inst->isControl()) {
@@ -358,6 +378,15 @@ SimpleCommit<Impl>::commitHead(DynInst *head_inst, unsigned inst_num)
     }
 #endif
 
+    // Explicit communication back to the LDSTQ that a load has been committed
+    // and can be removed from the LDSTQ.  Stores don't need this because
+    // the LDSTQ will already have been told that a store has reached the head
+    // of the ROB.  Consider including communication if it's a store as well
+    // to keep things orthogonal.
+    if (head_inst->isLoad()) {
+        toIEW->commitInfo.commitIsLoad = true;
+    }
+
     // Now that the instruction is going to be committed, finalize its
     // trace data.
     if (head_inst->traceData) {
@@ -371,7 +400,7 @@ SimpleCommit<Impl>::commitHead(DynInst *head_inst, unsigned inst_num)
     return true;
 }
 
-template<class Impl>
+template <class Impl>
 void
 SimpleCommit<Impl>::getInsts()
 {
@@ -382,24 +411,33 @@ SimpleCommit<Impl>::getInsts()
     // Read any issued instructions and place them into the ROB.  Do this
     // prior to squashing to avoid having instructions in the ROB that
     // don't get squashed properly.
+    int insts_to_process = min((int)renameWidth, fromRename->size);
+
     for (int inst_num = 0;
-         fromRename->insts[inst_num] != NULL && inst_num < renameWidth;
+         inst_num < insts_to_process;
          ++inst_num)
     {
-        DPRINTF(Commit, "Commit: Inserting PC %#x into ROB.\n",
-                fromRename->insts[inst_num]->readPC());
-        rob->insertInst(fromRename->insts[inst_num]);
+        if (!fromRename->insts[inst_num]->isSquashed()) {
+            DPRINTF(Commit, "Commit: Inserting PC %#x into ROB.\n",
+                    fromRename->insts[inst_num]->readPC());
+            rob->insertInst(fromRename->insts[inst_num]);
+        } else {
+            DPRINTF(Commit, "Commit: Instruction %i PC %#x was "
+                    "squashed, skipping.\n",
+                    fromRename->insts[inst_num]->seqNum,
+                    fromRename->insts[inst_num]->readPC());
+        }
     }
 }
 
-template<class Impl>
+template <class Impl>
 void
 SimpleCommit<Impl>::markCompletedInsts()
 {
     // Grab completed insts out of the IEW instruction queue, and mark
     // instructions completed within the ROB.
     for (int inst_num = 0;
-         fromIEW->insts[inst_num] != NULL && inst_num < iewWidth;
+         inst_num < iewWidth && fromIEW->insts[inst_num];
          ++inst_num)
     {
         DPRINTF(Commit, "Commit: Marking PC %#x, SN %i ready within ROB.\n",
@@ -411,7 +449,7 @@ SimpleCommit<Impl>::markCompletedInsts()
     }
 }
 
-template<class Impl>
+template <class Impl>
 uint64_t
 SimpleCommit<Impl>::readCommitPC()
 {
diff --git a/cpu/beta_cpu/cpu_policy.hh b/cpu/beta_cpu/cpu_policy.hh
index 676334249..ec8460b77 100644
--- a/cpu/beta_cpu/cpu_policy.hh
+++ b/cpu/beta_cpu/cpu_policy.hh
@@ -1,32 +1,60 @@
 #ifndef __CPU_POLICY_HH__
 #define __CPU_POLICY_HH__
 
+#include "cpu/beta_cpu/bpred_unit.hh"
+#include "cpu/beta_cpu/inst_queue.hh"
+#include "cpu/beta_cpu/regfile.hh"
+#include "cpu/beta_cpu/free_list.hh"
+#include "cpu/beta_cpu/rename_map.hh"
+#include "cpu/beta_cpu/rob.hh"
+#include "cpu/beta_cpu/store_set.hh"
+#include "cpu/beta_cpu/mem_dep_unit.hh"
+#include "cpu/beta_cpu/ldstq.hh"
+
 #include "cpu/beta_cpu/fetch.hh"
 #include "cpu/beta_cpu/decode.hh"
 #include "cpu/beta_cpu/rename.hh"
 #include "cpu/beta_cpu/iew.hh"
 #include "cpu/beta_cpu/commit.hh"
 
-#include "cpu/beta_cpu/inst_queue.hh"
-#include "cpu/beta_cpu/regfile.hh"
-#include "cpu/beta_cpu/free_list.hh"
-#include "cpu/beta_cpu/rename_map.hh"
-#include "cpu/beta_cpu/rob.hh"
+#include "cpu/beta_cpu/comm.hh"
 
 template<class Impl>
 struct SimpleCPUPolicy
 {
+    typedef DefaultBPredUnit<Impl> BPredUnit;
     typedef PhysRegFile<Impl> RegFile;
     typedef SimpleFreeList FreeList;
     typedef SimpleRenameMap RenameMap;
     typedef ROB<Impl> ROB;
     typedef InstructionQueue<Impl> IQ;
+    typedef MemDepUnit<StoreSet, Impl> MemDepUnit;
+    typedef LDSTQ<Impl> LDSTQ;
 
     typedef SimpleFetch<Impl> Fetch;
     typedef SimpleDecode<Impl> Decode;
     typedef SimpleRename<Impl> Rename;
     typedef SimpleIEW<Impl, IQ> IEW;
     typedef SimpleCommit<Impl> Commit;
+
+    /** The struct for communication between fetch and decode. */
+    typedef SimpleFetchSimpleDecode<Impl> FetchStruct;
+
+    /** The struct for communication between decode and rename. */
+    typedef SimpleDecodeSimpleRename<Impl> DecodeStruct;
+
+    /** The struct for communication between rename and IEW. */
+    typedef SimpleRenameSimpleIEW<Impl> RenameStruct;
+
+    /** The struct for communication between IEW and commit. */
+    typedef SimpleIEWSimpleCommit<Impl> IEWStruct;
+
+    /** The struct for communication within the IEW stage. */
+    typedef IssueStruct<Impl> IssueStruct;
+
+    /** The struct for all backwards communication. */
+    typedef TimeBufStruct TimeStruct;
+
 };
 
 #endif //__CPU_POLICY_HH__
diff --git a/cpu/beta_cpu/decode.hh b/cpu/beta_cpu/decode.hh
index c41955dcb..be88a4b36 100644
--- a/cpu/beta_cpu/decode.hh
+++ b/cpu/beta_cpu/decode.hh
@@ -10,11 +10,7 @@
 
 #include <queue>
 
-//Will want to include: time buffer, structs,
 #include "base/timebuf.hh"
-#include "cpu/beta_cpu/comm.hh"
-
-using namespace std;
 
 template<class Impl>
 class SimpleDecode
@@ -22,13 +18,15 @@ class SimpleDecode
   private:
     // Typedefs from the Impl.
     typedef typename Impl::ISA ISA;
-    typedef typename Impl::DynInst DynInst;
     typedef typename Impl::FullCPU FullCPU;
+    typedef typename Impl::DynInstPtr DynInstPtr;
     typedef typename Impl::Params Params;
+    typedef typename Impl::CPUPol CPUPol;
 
-    typedef typename Impl::FetchStruct FetchStruct;
-    typedef typename Impl::DecodeStruct DecodeStruct;
-    typedef typename Impl::TimeStruct TimeStruct;
+    // Typedefs from the CPU policy.
+    typedef typename CPUPol::FetchStruct FetchStruct;
+    typedef typename CPUPol::DecodeStruct DecodeStruct;
+    typedef typename CPUPol::TimeStruct TimeStruct;
 
     // Typedefs from the ISA.
     typedef typename ISA::Addr Addr;
@@ -71,7 +69,7 @@ class SimpleDecode
 
     inline void unblock();
 
-    void squash(DynInst *inst);
+    void squash(DynInstPtr &inst);
 
     // Interfaces to objects outside of decode.
     /** CPU interface. */
@@ -106,7 +104,7 @@ class SimpleDecode
     typename TimeBuffer<FetchStruct>::wire fromFetch;
 
     /** Skid buffer between fetch and decode. */
-    queue<FetchStruct> skidBuffer;
+    std::queue<FetchStruct> skidBuffer;
 
   private:
     //Consider making these unsigned to avoid any confusion.
@@ -124,6 +122,12 @@ class SimpleDecode
 
     /** The width of decode, in instructions. */
     unsigned decodeWidth;
+
+    /** The instruction that decode is currently on.  It needs to have
+     *  persistent state so that when a stall occurs in the middle of a
+     *  group of instructions, it can restart at the proper instruction.
+     */
+    unsigned numInst;
 };
 
 #endif // __SIMPLE_DECODE_HH__
diff --git a/cpu/beta_cpu/decode_impl.hh b/cpu/beta_cpu/decode_impl.hh
index ecf19b8ea..d0f46eaa5 100644
--- a/cpu/beta_cpu/decode_impl.hh
+++ b/cpu/beta_cpu/decode_impl.hh
@@ -9,7 +9,8 @@ SimpleDecode<Impl>::SimpleDecode(Params &params)
       iewToDecodeDelay(params.iewToDecodeDelay),
       commitToDecodeDelay(params.commitToDecodeDelay),
       fetchToDecodeDelay(params.fetchToDecodeDelay),
-      decodeWidth(params.decodeWidth)
+      decodeWidth(params.decodeWidth),
+      numInst(0)
 {
     DPRINTF(Decode, "Decode: decodeWidth=%i.\n", decodeWidth);
     _status = Idle;
@@ -103,7 +104,7 @@ SimpleDecode<Impl>::unblock()
 // was predicted incorrectly.
 template<class Impl>
 void
-SimpleDecode<Impl>::squash(DynInst *inst)
+SimpleDecode<Impl>::squash(DynInstPtr &inst)
 {
     DPRINTF(Decode, "Decode: Squashing due to incorrect branch prediction "
                     "detected at decode.\n");
@@ -163,16 +164,22 @@ SimpleDecode<Impl>::tick()
         // buffer were used.  Remove those instructions and handle
         // the rest of unblocking.
         if (_status == Unblocking) {
+            if (fromFetch->size > 0) {
+                // Add the current inputs to the skid buffer so they can be
+                // reprocessed when this stage unblocks.
+                skidBuffer.push(*fromFetch);
+            }
+
             unblock();
         }
     } else if (_status == Blocked) {
-        if (fromFetch->insts[0] != NULL) {
+        if (fromFetch->size > 0) {
             block();
         }
 
         if (!fromRename->renameInfo.stall &&
-                   !fromIEW->iewInfo.stall &&
-                   !fromCommit->commitInfo.stall) {
+            !fromIEW->iewInfo.stall &&
+            !fromCommit->commitInfo.stall) {
             DPRINTF(Decode, "Decode: Stall signals cleared, going to "
                     "unblock.\n");
             _status = Unblocking;
@@ -204,9 +211,7 @@ void
 SimpleDecode<Impl>::decode()
 {
     // Check time buffer if being told to squash.
-    if (/* fromRename->renameInfo.squash || */
-        /* fromIEW->iewInfo.squash || */
-        fromCommit->commitInfo.squash) {
+    if (fromCommit->commitInfo.squash) {
         squash();
         return;
     }
@@ -223,20 +228,22 @@ SimpleDecode<Impl>::decode()
     // Check fetch queue to see if instructions are available.
     // If no available instructions, do nothing, unless this stage is
     // currently unblocking.
-    if (fromFetch->insts[0] == NULL && _status != Unblocking) {
+    if (!fromFetch->insts[0] && _status != Unblocking) {
         DPRINTF(Decode, "Decode: Nothing to do, breaking out early.\n");
         // Should I change the status to idle?
         return;
     }
 
-    DynInst *inst;
+    DynInstPtr inst;
+
     // Instead have a class member variable that records which instruction
     // was the last one that was ended on.  At the tick() stage, it can
     // check if that's equal to 0.  If not, then don't pop stuff off.
-    unsigned num_inst = 0;
-    bool insts_available = _status == Unblocking ?
-        skidBuffer.front().insts[num_inst] != NULL :
-        fromFetch->insts[num_inst] != NULL;
+    unsigned to_rename_index = 0;
+
+    int insts_available = _status == Unblocking ?
+        skidBuffer.front().size :
+        fromFetch->size;
 
     // Debug block...
 #if 0
@@ -247,7 +254,7 @@ SimpleDecode<Impl>::decode()
             DPRINTF(Decode, "Decode: No instructions available, skid buffer "
                     "empty.\n");
         } else if (_status != Unblocking &&
-                   fromFetch->insts[0] == NULL) {
+                   !fromFetch->insts[0]) {
             DPRINTF(Decode, "Decode: No instructions available, fetch queue "
                     "empty.\n");
         } else {
@@ -262,26 +269,39 @@ SimpleDecode<Impl>::decode()
     // should be computed here.  However in this simple model all
     // computation will take place at execute.  Hence doneTargCalc()
     // will always be false.
-     while (num_inst < decodeWidth &&
-            insts_available)
+     while (insts_available > 0)
      {
         DPRINTF(Decode, "Decode: Sending instruction to rename.\n");
         // Might create some sort of accessor to get an instruction
         // on a per thread basis.  Or might be faster to just get
         // a pointer to an array or list of instructions and use that
         // within this code.
-        inst = _status == Unblocking ? skidBuffer.front().insts[num_inst] :
-               fromFetch->insts[num_inst];
+        inst = _status == Unblocking ? skidBuffer.front().insts[numInst] :
+               fromFetch->insts[numInst];
+
         DPRINTF(Decode, "Decode: Processing instruction %i with PC %#x\n",
-                inst, inst->readPC());
+                inst->seqNum, inst->readPC());
+
+        if (inst->isSquashed()) {
+            DPRINTF(Decode, "Decode: Instruction %i with PC %#x is "
+                    "squashed, skipping.\n",
+                    inst->seqNum, inst->readPC());
+
+            ++numInst;
+            --insts_available;
+
+            continue;
+        }
 
         // This current instruction is valid, so add it into the decode
         // queue.  The next instruction may not be valid, so check to
         // see if branches were predicted correctly.
-        toRename->insts[num_inst] = inst;
+        toRename->insts[to_rename_index] = inst;
+
+        ++(toRename->size);
 
         // Ensure that if it was predicted as a branch, it really is a
-        // branch.  This case should never happen in this model.
+        // branch.
         if (inst->predTaken() && !inst->isControl()) {
             panic("Instruction predicted as a branch!");
 
@@ -306,20 +326,19 @@ SimpleDecode<Impl>::decode()
         // them as ready to issue at any time.  Not sure if this check
         // should exist here or at a later stage; however it doesn't matter
         // too much for function correctness.
+        // Isn't this handled by the inst queue?
         if (inst->numSrcRegs() == 0) {
             inst->setCanIssue();
         }
 
         // Increment which instruction we're looking at.
-        ++num_inst;
-
-        // Check whether or not there are instructions available.
-        // Either need to check within the skid buffer, or the fetch
-        // queue, depending if this stage is unblocking or not.
-        insts_available = _status == Unblocking ?
-                           skidBuffer.front().insts[num_inst] == NULL :
-                           fromFetch->insts[num_inst] == NULL;
+        ++numInst;
+        ++to_rename_index;
+
+        --insts_available;
     }
+
+     numInst = 0;
 }
 
 #endif // __SIMPLE_DECODE_CC__
diff --git a/cpu/beta_cpu/fetch.hh b/cpu/beta_cpu/fetch.hh
index 5717c65ac..e59a9df7f 100644
--- a/cpu/beta_cpu/fetch.hh
+++ b/cpu/beta_cpu/fetch.hh
@@ -13,16 +13,12 @@
 #include "base/timebuf.hh"
 #include "sim/eventq.hh"
 #include "cpu/pc_event.hh"
-#include "cpu/beta_cpu/comm.hh"
 #include "mem/mem_interface.hh"
 
-using namespace std;
-
 /**
  * SimpleFetch class to fetch a single instruction each cycle.  SimpleFetch
  * will stall if there's an Icache miss, but otherwise assumes a one cycle
- * Icache hit.  This will be replaced with a more fleshed out class in the
- * future.
+ * Icache hit.
  */
 
 template <class Impl>
@@ -31,12 +27,15 @@ class SimpleFetch
   public:
     /** Typedefs from Impl. */
     typedef typename Impl::ISA ISA;
+    typedef typename Impl::CPUPol CPUPol;
     typedef typename Impl::DynInst DynInst;
+    typedef typename Impl::DynInstPtr DynInstPtr;
     typedef typename Impl::FullCPU FullCPU;
     typedef typename Impl::Params Params;
 
-    typedef typename Impl::FetchStruct FetchStruct;
-    typedef typename Impl::TimeStruct TimeStruct;
+    typedef typename CPUPol::BPredUnit BPredUnit;
+    typedef typename CPUPol::FetchStruct FetchStruct;
+    typedef typename CPUPol::TimeStruct TimeStruct;
 
     /** Typedefs from ISA. */
     typedef typename ISA::MachInst MachInst;
@@ -76,6 +75,17 @@ class SimpleFetch
     // Figure out PC vs next PC and how it should be updated
     void squash(Addr newPC);
 
+  private:
+    /**
+     * Looks up in the branch predictor to see if the next PC should be
+     * either next PC+=MachInst or a branch target.
+     * @param next_PC Next PC variable passed in by reference.  It is
+     * expected to be set to the current PC; it will be updated with what
+     * the next PC will be.
+     * @return Whether or not a branch was predicted as taken.
+     */
+    bool lookupAndUpdateNextPC(Addr &next_PC);
+
   public:
     class CacheCompletionEvent : public Event
     {
@@ -110,8 +120,6 @@ class SimpleFetch
     /** Wire to get commit's information from backwards time buffer. */
     typename TimeBuffer<TimeStruct>::wire fromCommit;
 
-    // Will probably have this sit in the FullCPU and just pass a pointr in.
-    // Simplifies the constructors of all stages.
     /** Internal fetch instruction queue. */
     TimeBuffer<FetchStruct> *fetchQueue;
 
@@ -122,6 +130,9 @@ class SimpleFetch
     /** Icache interface. */
     MemInterface *icacheInterface;
 
+    /** BPredUnit. */
+    BPredUnit branchPred;
+
     /** Memory request used to access cache. */
     MemReqPtr memReq;
 
diff --git a/cpu/beta_cpu/fetch_impl.hh b/cpu/beta_cpu/fetch_impl.hh
index 918d2dad2..93f7bf6d2 100644
--- a/cpu/beta_cpu/fetch_impl.hh
+++ b/cpu/beta_cpu/fetch_impl.hh
@@ -1,7 +1,5 @@
-// Todo: Rewrite this.  Add in branch prediction.  Fix up if squashing comes
-// from decode; only the correct instructions should be killed.  This will
-// probably require changing the CPU's instList functions to take a seqNum
-// instead of a dyninst.  With probe path, should be able to specify
+// Todo: Add in branch prediction.  With probe path, should
+// be able to specify
 // size of data to fetch.  Will be able to get full cache line.
 
 // Remove this later.
@@ -41,6 +39,7 @@ template<class Impl>
 SimpleFetch<Impl>::SimpleFetch(Params &params)
     : cacheCompletionEvent(this),
       icacheInterface(params.icacheInterface),
+      branchPred(params),
       decodeToFetchDelay(params.decodeToFetchDelay),
       renameToFetchDelay(params.renameToFetchDelay),
       iewToFetchDelay(params.iewToFetchDelay),
@@ -66,7 +65,7 @@ SimpleFetch<Impl>::SimpleFetch(Params &params)
     blkSize = icacheInterface ? icacheInterface->getBlockSize() : 64;
 
     // Create mask to get rid of offset bits.
-    cacheBlockMask = ~((int)log2(blkSize) - 1);
+    cacheBlockMask = (blkSize - 1);
 
     // Get the size of an instruction.
     instSize = sizeof(MachInst);
@@ -123,24 +122,59 @@ SimpleFetch<Impl>::processCacheCompletion()
         _status = IcacheMissComplete;
 }
 
-// Note that in the SimpleFetch<>, will most likely have to provide the
-// template parameters to BP and BTB.
+/**
+ * Consults branchPred to choose the PC that follows next_PC.  A taken
+ * prediction is only honored when the BTB reports a valid entry for
+ * next_PC; otherwise the fall-through PC (next_PC + instSize) is used.
+ * @param next_PC In: PC of the instruction being fetched.  Out: the
+ * predicted PC of the instruction after it.
+ * @return Whether or not a branch was predicted as taken.
+ */
+template<class Impl>
+bool
+SimpleFetch<Impl>::lookupAndUpdateNextPC(Addr &next_PC)
+{
+    // Do branch prediction check here.
+    bool predict_taken = branchPred.BPLookup(next_PC);
+    Addr predict_target = 0;
+
+    DPRINTF(Fetch, "Fetch: Branch predictor predicts taken? %i\n",
+            predict_taken);
+
+    if (branchPred.BTBValid(next_PC)) {
+        predict_target = branchPred.BTBLookup(next_PC);
+        DPRINTF(Fetch, "Fetch: BTB target is %#x.\n", predict_target);
+    } else {
+        // No known target, so fetch cannot follow a taken prediction.
+        predict_taken = false;
+        DPRINTF(Fetch, "Fetch: BTB does not have a valid entry.\n");
+    }
+
+    if (!predict_taken) {
+        next_PC = next_PC + instSize;
+        return false;
+    }
+
+    next_PC = predict_target;
+    return true;
+}
+
 template<class Impl>
 void
 SimpleFetch<Impl>::squash(Addr new_PC)
 {
     DPRINTF(Fetch, "Fetch: Squashing, setting PC to: %#x.\n", new_PC);
+
     cpu->setNextPC(new_PC + instSize);
     cpu->setPC(new_PC);
 
     _status = Squashing;
 
-    // Clear out the instructions that are no longer valid.
-    // Actually maybe slightly unrealistic to kill instructions that are
-    // in flight like that between stages.  Perhaps just have next
-    // stage ignore those instructions or something.  In the cycle where it's
-    // returning from squashing, the other stages can just ignore the inputs
-    // for that cycle.
+    // Clear the icache miss if it's outstanding.
+    if (_status == IcacheMissStall && icacheInterface) {
+        // @todo: Use an actual thread number here.
+        icacheInterface->squash(0);
+    }
 
     // Tell the CPU to remove any instructions that aren't currently
     // in the ROB (instructions in flight that were killed).
@@ -151,25 +185,27 @@ template<class Impl>
 void
 SimpleFetch<Impl>::tick()
 {
-#if 0
+#if 1
+    // Check squash signals from commit.
     if (fromCommit->commitInfo.squash) {
         DPRINTF(Fetch, "Fetch: Squashing instructions due to squash "
                 "from commit.\n");
 
         // In any case, squash.
         squash(fromCommit->commitInfo.nextPC);
-        return;
-    }
 
-    if (fromDecode->decodeInfo.squash) {
-        DPRINTF(Fetch, "Fetch: Squashing instructions due to squash "
-                "from decode.\n");
+        // Also check if there's a mispredict that happened.
+        if (fromCommit->commitInfo.branchMispredict) {
+            branchPred.BPUpdate(fromCommit->commitInfo.mispredPC,
+                                 fromCommit->commitInfo.branchTaken);
+            branchPred.BTBUpdate(fromCommit->commitInfo.mispredPC,
+                                  fromCommit->commitInfo.nextPC);
+        }
 
-        // Squash unless we're already squashing?
-        squash(fromDecode->decodeInfo.nextPC);
         return;
     }
 
+    // Check ROB squash signals from commit.
     if (fromCommit->commitInfo.robSquashing) {
         DPRINTF(Fetch, "Fetch: ROB is still squashing.\n");
 
@@ -178,11 +214,36 @@ SimpleFetch<Impl>::tick()
         return;
     }
 
+    // Check squash signals from decode.
+    if (fromDecode->decodeInfo.squash) {
+        DPRINTF(Fetch, "Fetch: Squashing instructions due to squash "
+                "from decode.\n");
+
+        // Update the branch predictor, reading decode's own wire throughout.
+        if (fromDecode->decodeInfo.branchMispredict) {
+            branchPred.BPUpdate(fromDecode->decodeInfo.mispredPC,
+                                 fromDecode->decodeInfo.branchTaken);
+            branchPred.BTBUpdate(fromDecode->decodeInfo.mispredPC,
+                                  fromDecode->decodeInfo.nextPC);
+        }
+
+        if (_status != Squashing) {
+            // Squash unless we're already squashing?
+            squash(fromDecode->decodeInfo.nextPC);
+            return;
+        }
+    }
+
+
+
+    // Check if any of the stall signals are high.
     if (fromDecode->decodeInfo.stall ||
         fromRename->renameInfo.stall ||
         fromIEW->iewInfo.stall ||
         fromCommit->commitInfo.stall)
     {
+        // Block stage, regardless of current status.
+
         DPRINTF(Fetch, "Fetch: Stalling stage.\n");
         DPRINTF(Fetch, "Fetch: Statuses: Decode: %i Rename: %i IEW: %i "
                 "Commit: %i\n",
@@ -190,10 +251,36 @@ SimpleFetch<Impl>::tick()
                 fromRename->renameInfo.stall,
                 fromIEW->iewInfo.stall,
                 fromCommit->commitInfo.stall);
-        // What to do if we're already in an icache stall?
+
+        _status = Blocked;
+        return;
+    } else if (_status == Blocked) {
+        // Unblock stage if status is currently blocked and none of the
+        // stall signals are being held high.
+        _status = Running;
+
+        return;
+    }
+
+    // If fetch has reached this point, then there are no squash signals
+    // still being held high.  Check if fetch is in the squashing state;
+    // if so, fetch can switch to running.
+    // Similarly, there are no blocked signals still being held high.
+    // Check if fetch is in the blocked state; if so, fetch can switch to
+    // running.
+    if (_status == Squashing) {
+        DPRINTF(Fetch, "Fetch: Done squashing, switching to running.\n");
+
+        // Switch status to running
+        _status = Running;
+    } else if (_status != IcacheMissStall) {
+        DPRINTF(Fetch, "Fetch: Running stage.\n");
+
+        fetch();
     }
 #endif
 
+#if 0
     if (_status != Blocked &&
         _status != Squashing &&
         _status != IcacheMissStall) {
@@ -253,62 +340,17 @@ SimpleFetch<Impl>::tick()
             DPRINTF(Fetch, "Fetch: ROB still squashing.\n");
         }
     }
-
+#endif
 }
 
 template<class Impl>
 void
 SimpleFetch<Impl>::fetch()
 {
-    //////////////////////////////////////////
-    // Check backwards communication
-    //////////////////////////////////////////
-
-    // If branch prediction is incorrect, squash any instructions,
-    // update PC, and do not fetch anything this cycle.
-
-    // Might want to put all the PC changing stuff in one area.
-    // Normally should also check here to see if there is branch
-    // misprediction info to update with.
-    if (fromCommit->commitInfo.squash) {
-        DPRINTF(Fetch, "Fetch: Squashing instructions due to squash "
-                "from commit.\n");
-        squash(fromCommit->commitInfo.nextPC);
-        return;
-    } else if (fromDecode->decodeInfo.squash) {
-        DPRINTF(Fetch, "Fetch: Squashing instructions due to squash "
-                "from decode.\n");
-        squash(fromDecode->decodeInfo.nextPC);
-        return;
-    } else if (fromCommit->commitInfo.robSquashing) {
-        DPRINTF(Fetch, "Fetch: ROB still squashing.\n");
-        _status = Squashing;
-        return;
-    }
-
-    // If being told to stall, do nothing.
-    if (fromDecode->decodeInfo.stall ||
-        fromRename->renameInfo.stall ||
-        fromIEW->iewInfo.stall ||
-        fromCommit->commitInfo.stall)
-    {
-        DPRINTF(Fetch, "Fetch: Stalling stage.\n");
-        DPRINTF(Fetch, "Fetch: Statuses: Decode: %i Rename: %i IEW: %i "
-                "Commit: %i\n",
-                fromDecode->decodeInfo.stall,
-                fromRename->renameInfo.stall,
-                fromIEW->iewInfo.stall,
-                fromCommit->commitInfo.stall);
-        _status = Blocked;
-        return;
-    }
-
     //////////////////////////////////////////
     // Start actual fetch
     //////////////////////////////////////////
 
-    // If nothing else outstanding, attempt to read instructions.
-
 #ifdef FULL_SYSTEM
     // Flag to say whether or not address is physical addr.
     unsigned flags = cpu->inPalMode() ? PHYSICAL : 0;
@@ -317,13 +359,14 @@ SimpleFetch<Impl>::fetch()
 #endif // FULL_SYSTEM
 
     // The current PC.
-    Addr PC = cpu->readPC();
+    Addr fetch_PC = cpu->readPC();
 
     // Fault code for memory access.
     Fault fault = No_Fault;
 
     // If returning from the delay of a cache miss, then update the status
-    // to running, otherwise do the cache access.
+    // to running, otherwise do the cache access.  Possibly move this up
+    // to tick() function.
     if (_status == IcacheMissComplete) {
         DPRINTF(Fetch, "Fetch: Icache miss is complete.\n");
 
@@ -334,7 +377,7 @@ SimpleFetch<Impl>::fetch()
     } else {
         DPRINTF(Fetch, "Fetch: Attempting to translate and read "
                        "instruction, starting at PC %08p.\n",
-                PC);
+                fetch_PC);
 
         // Otherwise check if the instruction exists within the cache.
         // If it does, then proceed on to read the instruction and the rest
@@ -347,7 +390,7 @@ SimpleFetch<Impl>::fetch()
         // Setup the memReq to do a read of the first isntruction's address.
         // Set the appropriate read size and flags as well.
         memReq->cmd = Read;
-        memReq->reset(PC, instSize, flags);
+        memReq->reset(fetch_PC, instSize, flags);
 
         // Translate the instruction request.
         // Should this function be
@@ -401,7 +444,7 @@ SimpleFetch<Impl>::fetch()
     // Probably have a status on a per thread basis so each thread can
     // block independently and be woken up independently.
 
-    Addr next_PC = 0;
+    Addr next_PC = fetch_PC;
     InstSeqNum inst_seq;
 
     // If the read of the first instruction was successful, then grab the
@@ -410,6 +453,10 @@ SimpleFetch<Impl>::fetch()
     if (fault == No_Fault) {
         DPRINTF(Fetch, "Fetch: Adding instructions to queue to decode.\n");
 
+        //////////////////////////
+        // Fetch first instruction
+        //////////////////////////
+
         // Need to keep track of whether or not a predicted branch
         // ended this fetch block.
         bool predicted_branch = false;
@@ -420,12 +467,17 @@ SimpleFetch<Impl>::fetch()
         // Get a sequence number.
         inst_seq = cpu->getAndIncrementInstSeq();
 
+        // Update the next PC; it either is PC+sizeof(MachInst), or
+        // branch_target.  Check whether or not a branch was taken.
+        predicted_branch = lookupAndUpdateNextPC(next_PC);
+
         // Because the first instruction was already fetched, create the
         // DynInst and put it into the queue to decode.
-        DynInst *instruction = new DynInst(inst, PC, PC+instSize, inst_seq,
-                                           cpu);
+        DynInstPtr instruction = new DynInst(inst, fetch_PC, next_PC,
+                                             inst_seq, cpu);
+
         DPRINTF(Fetch, "Fetch: Instruction %i created, with PC %#x\n",
-                instruction, instruction->readPC());
+                inst_seq, instruction->readPC());
         DPRINTF(Fetch, "Fetch: Instruction opcode is: %03p\n",
                 OPCODE(inst));
 
@@ -440,13 +492,17 @@ SimpleFetch<Impl>::fetch()
         // that heads to decode.
         toDecode->insts[0] = instruction;
 
-        // Now update the PC to fetch the next instruction in the cache
-        // line.
-        PC = PC + instSize;
+        toDecode->size++;
+
+        fetch_PC = next_PC;
+
+        //////////////////////////
+        // Fetch other instructions
+        //////////////////////////
 
         // Obtain the index into the cache line by getting only the low
-        // order bits.
-        int line_index = PC & cacheBlockMask;
+        // order bits.  Will need to do shifting as well.
+        int line_index = fetch_PC & cacheBlockMask;
 
         // Take instructions and put them into the queue heading to decode.
         // Then read the next instruction in the cache line.  Continue
@@ -461,12 +517,14 @@ SimpleFetch<Impl>::fetch()
         // instructions, which can then be used to get all the instructions
         // needed.  Figure out if I can roll it back into one loop.
         for (int fetched = 1;
-             line_index < blkSize && fetched < fetchWidth;
+             line_index < blkSize &&
+                 fetched < fetchWidth &&
+                 !predicted_branch;
              line_index+=instSize, ++fetched)
         {
             // Reset the mem request to setup the read of the next
             // instruction.
-            memReq->reset(PC, instSize, flags);
+            memReq->reset(fetch_PC, instSize, flags);
 
             // Translate the instruction request.
             fault = cpu->translateInstReq(memReq);
@@ -485,16 +543,24 @@ SimpleFetch<Impl>::fetch()
             // Get a sequence number.
             inst_seq = cpu->getAndIncrementInstSeq();
 
+            predicted_branch = lookupAndUpdateNextPC(next_PC);
+
             // Create the actual DynInst.  Parameters are:
             // DynInst(instruction, PC, predicted PC, CPU pointer).
             // Because this simple model has no branch prediction, the
             // predicted PC will simply be PC+sizeof(MachInst).
             // Update to actually use a branch predictor to predict the
             // target in the future.
-            DynInst *instruction = new DynInst(inst, PC, PC+instSize,
-                                               inst_seq, cpu);
+            DynInstPtr instruction =
+                new DynInst(inst, fetch_PC, next_PC, inst_seq, cpu);
+
+            instruction->traceData =
+                Trace::getInstRecord(curTick, cpu->xcBase(), cpu,
+                                     instruction->staticInst,
+                                     instruction->readPC(), 0);
+
             DPRINTF(Fetch, "Fetch: Instruction %i created, with PC %#x\n",
-                    instruction, instruction->readPC());
+                    inst_seq, instruction->readPC());
             DPRINTF(Fetch, "Fetch: Instruction opcode is: %03p\n",
                     OPCODE(inst));
 
@@ -504,20 +570,15 @@ SimpleFetch<Impl>::fetch()
             // that heads to decode.
             toDecode->insts[fetched] = instruction;
 
+            toDecode->size++;
+
             // Might want to keep track of various stats.
 //             numInstsFetched++;
 
-            // Now update the PC to fetch the next instruction in the cache
-            // line.
-            PC = PC + instSize;
+            // Update the PC with the next PC.
+            fetch_PC = next_PC;
         }
 
-        // If no branches predicted taken, then increment PC with
-        // fall-through path.  This simple model always predicts not
-        // taken.
-        if (!predicted_branch) {
-            next_PC = PC;
-        }
     }
 
     // Now that fetching is completed, update the PC to signify what the next
@@ -544,10 +605,10 @@ SimpleFetch<Impl>::fetch()
 
         _status = Blocked;
 #ifdef FULL_SYSTEM
-        // Trap will probably need a pointer to the CPU to do accessing.
-        // Or an exec context. --Write ProxyExecContext eventually.
-        // Avoid using this for now as the xc really shouldn't be in here.
-        cpu->trap(fault);
+//        cpu->trap(fault);
+        // Send a signal to the ROB indicating that there's a trap from the
+        // fetch stage that needs to be handled.  Need to indicate that
+        // there's a fault, and the fault type.
 #else // !FULL_SYSTEM
         fatal("fault (%d) detected @ PC %08p", fault, cpu->readPC());
 #endif // FULL_SYSTEM
diff --git a/cpu/beta_cpu/free_list.cc b/cpu/beta_cpu/free_list.cc
index 006bf4bf7..542b87471 100644
--- a/cpu/beta_cpu/free_list.cc
+++ b/cpu/beta_cpu/free_list.cc
@@ -1,3 +1,5 @@
+#include "base/trace.hh"
+
 #include "cpu/beta_cpu/free_list.hh"
 
 SimpleFreeList::SimpleFreeList(unsigned _numLogicalIntRegs,
@@ -10,6 +12,16 @@ SimpleFreeList::SimpleFreeList(unsigned _numLogicalIntRegs,
       numPhysicalFloatRegs(_numPhysicalFloatRegs),
       numPhysicalRegs(numPhysicalIntRegs + numPhysicalFloatRegs)
 {
+    DPRINTF(FreeList, "FreeList: Creating new free list object.\n");
+
+    // DEBUG stuff.
+    freeIntRegsScoreboard.resize(numPhysicalIntRegs);
+
+    freeFloatRegsScoreboard.resize(numPhysicalRegs);
+
+    for (PhysRegIndex i = 0; i < numLogicalIntRegs; ++i) {
+        freeIntRegsScoreboard[i] = 0;
+    }
 
     // Put all of the extra physical registers onto the free list.  This
     // means excluding all of the base logical registers.
@@ -17,6 +29,14 @@ SimpleFreeList::SimpleFreeList(unsigned _numLogicalIntRegs,
          i < numPhysicalIntRegs; ++i)
     {
         freeIntRegs.push(i);
+
+        freeIntRegsScoreboard[i] = 1;
+    }
+
+    for (PhysRegIndex i = 0; i < numPhysicalIntRegs + numLogicalFloatRegs;
+         ++i)
+    {
+        freeFloatRegsScoreboard[i] = 0;
     }
 
     // Put all of the extra physical registers onto the free list.  This
@@ -26,8 +46,9 @@ SimpleFreeList::SimpleFreeList(unsigned _numLogicalIntRegs,
     for (PhysRegIndex i = numPhysicalIntRegs + numLogicalFloatRegs;
          i < numPhysicalRegs; ++i)
     {
-        cprintf("Free List: Adding register %i to float list.\n", i);
         freeFloatRegs.push(i);
+
+        freeFloatRegsScoreboard[i] = 1;
     }
 }
 
diff --git a/cpu/beta_cpu/free_list.hh b/cpu/beta_cpu/free_list.hh
index 8521ad94c..0d2b2c421 100644
--- a/cpu/beta_cpu/free_list.hh
+++ b/cpu/beta_cpu/free_list.hh
@@ -8,8 +8,6 @@
 #include "cpu/beta_cpu/comm.hh"
 #include "base/trace.hh"
 
-using namespace std;
-
 // Question: Do I even need the number of logical registers?
 // How to avoid freeing registers instantly?  Same with ROB entries.
 
@@ -33,10 +31,10 @@ class SimpleFreeList
 
   private:
     /** The list of free integer registers. */
-    queue<PhysRegIndex> freeIntRegs;
+    std::queue<PhysRegIndex> freeIntRegs;
 
     /** The list of free floating point registers. */
-    queue<PhysRegIndex> freeFloatRegs;
+    std::queue<PhysRegIndex> freeFloatRegs;
 
     /** Number of logical integer registers. */
     int numLogicalIntRegs;
@@ -53,6 +51,11 @@ class SimpleFreeList
     /** Total number of physical registers. */
     int numPhysicalRegs;
 
+    /** DEBUG stuff below. */
+    std::vector<int> freeIntRegsScoreboard;
+
+    std::vector<bool> freeFloatRegsScoreboard;
+
   public:
     SimpleFreeList(unsigned _numLogicalIntRegs,
                    unsigned _numPhysicalIntRegs,
@@ -94,6 +97,10 @@ SimpleFreeList::getIntReg()
 
     freeIntRegs.pop();
 
+    // DEBUG
+    assert(freeIntRegsScoreboard[free_reg]);
+    freeIntRegsScoreboard[free_reg] = 0;
+
     return(free_reg);
 }
 
@@ -109,6 +116,10 @@ SimpleFreeList::getFloatReg()
 
     freeFloatRegs.pop();
 
+    // DEBUG
+    assert(freeFloatRegsScoreboard[free_reg]);
+    freeFloatRegsScoreboard[free_reg] = 0;
+
     return(free_reg);
 }
 
@@ -120,8 +131,16 @@ SimpleFreeList::addReg(PhysRegIndex freed_reg)
     //already in there.  A bit vector or something similar would be useful.
     if (freed_reg < numPhysicalIntRegs) {
         freeIntRegs.push(freed_reg);
+
+        // DEBUG
+        assert(freeIntRegsScoreboard[freed_reg] == false);
+        freeIntRegsScoreboard[freed_reg] = 1;
     } else if (freed_reg < numPhysicalRegs) {
         freeFloatRegs.push(freed_reg);
+
+        // DEBUG
+        assert(freeFloatRegsScoreboard[freed_reg] == false);
+        freeFloatRegsScoreboard[freed_reg] = 1;
     }
 }
 
@@ -130,6 +149,10 @@ SimpleFreeList::addIntReg(PhysRegIndex freed_reg)
 {
     DPRINTF(Rename, "Freelist: Freeing int register %i.\n", freed_reg);
 
+    // DEBUG
+    assert(!freeIntRegsScoreboard[freed_reg]);
+    freeIntRegsScoreboard[freed_reg] = 1;
+
     //Might want to add in a check for whether or not this register is
     //already in there.  A bit vector or something similar would be useful.
     freeIntRegs.push(freed_reg);
@@ -140,6 +163,10 @@ SimpleFreeList::addFloatReg(PhysRegIndex freed_reg)
 {
     DPRINTF(Rename, "Freelist: Freeing float register %i.\n", freed_reg);
 
+    // DEBUG
+    assert(!freeFloatRegsScoreboard[freed_reg]);
+    freeFloatRegsScoreboard[freed_reg] = 1;
+
     //Might want to add in a check for whether or not this register is
     //already in there.  A bit vector or something similar would be useful.
     freeFloatRegs.push(freed_reg);
diff --git a/cpu/beta_cpu/full_cpu.cc b/cpu/beta_cpu/full_cpu.cc
index 6fbf5d69a..abeb4cb87 100644
--- a/cpu/beta_cpu/full_cpu.cc
+++ b/cpu/beta_cpu/full_cpu.cc
@@ -16,29 +16,18 @@
 using namespace std;
 
 #ifdef FULL_SYSTEM
-BaseFullCPU::BaseFullCPU(const std::string &_name,
-                         int number_of_threads,
-                         Counter max_insts_any_thread,
-                         Counter max_insts_all_threads,
-                         Counter max_loads_any_thread,
-                         Counter max_loads_all_threads,
-                         System *_system, Tick freq)
-    : BaseCPU(_name, number_of_threads,
-              max_insts_any_thread, max_insts_all_threads,
-              max_loads_any_thread, max_loads_all_threads,
-              _system, freq)
+BaseFullCPU::BaseFullCPU(Params &params)
+    : BaseCPU(params.name, params.numberOfThreads,
+              params.maxInstsAnyThread, params.maxInstsAllThreads,
+              params.maxLoadsAnyThread, params.maxLoadsAllThreads,
+              params._system, params.freq)
 {
 }
 #else
-BaseFullCPU::BaseFullCPU(const std::string &_name,
-                         int number_of_threads,
-                         Counter max_insts_any_thread,
-                         Counter max_insts_all_threads,
-                         Counter max_loads_any_thread,
-                         Counter max_loads_all_threads)
-    : BaseCPU(_name, number_of_threads,
-              max_insts_any_thread, max_insts_all_threads,
-              max_loads_any_thread, max_loads_all_threads)
+BaseFullCPU::BaseFullCPU(Params &params)
+    : BaseCPU(params.name, params.numberOfThreads,
+              params.maxInstsAnyThread, params.maxInstsAllThreads,
+              params.maxLoadsAnyThread, params.maxLoadsAllThreads)
 {
 }
 #endif // FULL_SYSTEM
@@ -67,14 +56,9 @@ FullBetaCPU<Impl>::TickEvent::description()
 template <class Impl>
 FullBetaCPU<Impl>::FullBetaCPU(Params &params)
 #ifdef FULL_SYSTEM
-    : BaseFullCPU(params.name, /* number_of_threads */ 1,
-                  params.maxInstsAnyThread, params.maxInstsAllThreads,
-                  params.maxLoadsAnyThread, params.maxLoadsAllThreads,
-                  params.system, params.freq),
+    : BaseFullCPU(params),
 #else
-    : BaseFullCPU(params.name, /* number_of_threads */ 1,
-                  params.maxInstsAnyThread, params.maxInstsAllThreads,
-                  params.maxLoadsAnyThread, params.maxLoadsAllThreads),
+    : BaseFullCPU(params),
 #endif // FULL_SYSTEM
       tickEvent(this),
       fetch(params),
@@ -91,17 +75,18 @@ FullBetaCPU<Impl>::FullBetaCPU(Params &params)
       renameMap(Impl::ISA::NumIntRegs, params.numPhysIntRegs,
                 Impl::ISA::NumFloatRegs, params.numPhysFloatRegs,
                 Impl::ISA::NumMiscRegs,
-                Impl::ISA::ZeroReg, Impl::ISA::ZeroReg),
+                Impl::ISA::ZeroReg,
+                Impl::ISA::ZeroReg + Impl::ISA::NumIntRegs),
 
       rob(params.numROBEntries, params.squashWidth),
 
       // What to pass to these time buffers?
       // For now just have these time buffers be pretty big.
-      timeBuffer(20, 20),
-      fetchQueue(20, 20),
-      decodeQueue(20, 20),
-      renameQueue(20, 20),
-      iewQueue(20, 20),
+      timeBuffer(5, 5),
+      fetchQueue(5, 5),
+      decodeQueue(5, 5),
+      renameQueue(5, 5),
+      iewQueue(5, 5),
 
       xc(NULL),
 
@@ -133,9 +118,9 @@ FullBetaCPU<Impl>::FullBetaCPU(Params &params)
     // initialize CPU, including PC
     TheISA::initCPU(&xc->regs);
 #else
-    xc = new ExecContext(this, /* thread_num */ 0, process, /* asid */ 0);
     DPRINTF(FullCPU, "FullCPU: Process's starting PC is %#x, process is %#x",
             process->prog_entry, process);
+    xc = new ExecContext(this, /* thread_num */ 0, process, /* asid */ 0);
 
     assert(process->getMemory() != NULL);
     assert(mem != NULL);
@@ -393,7 +378,7 @@ FullBetaCPU<Impl>::setPC(Addr new_PC)
 
 template <class Impl>
 void
-FullBetaCPU<Impl>::addInst(DynInst *inst)
+FullBetaCPU<Impl>::addInst(DynInstPtr &inst)
 {
     instList.push_back(inst);
 }
@@ -411,9 +396,9 @@ FullBetaCPU<Impl>::instDone()
 
 template <class Impl>
 void
-FullBetaCPU<Impl>::removeBackInst(DynInst *inst)
+FullBetaCPU<Impl>::removeBackInst(DynInstPtr &inst)
 {
-    DynInst *inst_to_delete;
+    DynInstPtr inst_to_delete;
 
     // Walk through the instruction list, removing any instructions
     // that were inserted after the given instruction, inst.
@@ -424,22 +409,22 @@ FullBetaCPU<Impl>::removeBackInst(DynInst *inst)
         // Obtain the pointer to the instruction.
         inst_to_delete = instList.back();
 
-        DPRINTF(FullCPU, "FullCPU: Deleting instruction %#x, PC %#x\n",
-                inst_to_delete, inst_to_delete->readPC());
+        DPRINTF(FullCPU, "FullCPU: Removing instruction %i, PC %#x\n",
+                inst_to_delete->seqNum, inst_to_delete->readPC());
 
         // Remove the instruction from the list.
         instList.pop_back();
 
-        // Delete the instruction itself.
-        delete inst_to_delete;
+        // Mark it as squashed.
+        inst_to_delete->setSquashed();
     }
 }
 
 template <class Impl>
 void
-FullBetaCPU<Impl>::removeFrontInst(DynInst *inst)
+FullBetaCPU<Impl>::removeFrontInst(DynInstPtr &inst)
 {
-    DynInst *inst_to_delete;
+    DynInstPtr inst_to_delete;
 
     // The front instruction should be the same one being asked to be deleted.
     assert(instList.front() == inst);
@@ -451,7 +436,7 @@ FullBetaCPU<Impl>::removeFrontInst(DynInst *inst)
     DPRINTF(FullCPU, "FullCPU: Deleting committed instruction %#x, PC %#x\n",
             inst_to_delete, inst_to_delete->readPC());
 
-    delete inst_to_delete;
+//    delete inst_to_delete;
 }
 
 template <class Impl>
@@ -461,7 +446,7 @@ FullBetaCPU<Impl>::removeInstsNotInROB()
     DPRINTF(FullCPU, "FullCPU: Deleting instructions from instruction "
             "list.\n");
 
-    DynInst *rob_tail = rob.readTailInst();
+    DynInstPtr rob_tail = rob.readTailInst();
 
     removeBackInst(rob_tail);
 }
@@ -478,13 +463,13 @@ void
 FullBetaCPU<Impl>::dumpInsts()
 {
     int num = 0;
-    typename list<DynInst *>::iterator inst_list_it = instList.begin();
+    typename list<DynInstPtr>::iterator inst_list_it = instList.begin();
 
     while (inst_list_it != instList.end())
     {
-        cprintf("Instruction:%i\nInst:%#x\nPC:%#x\nSN:%lli\n\n",
-                num, (*inst_list_it), (*inst_list_it)->readPC(),
-                (*inst_list_it)->seqNum);
+        cprintf("Instruction:%i\nPC:%#x\nSN:%lli\nIssued:%i\nSquashed:%i\n\n",
+                num, (*inst_list_it)->readPC(), (*inst_list_it)->seqNum,
+                (*inst_list_it)->isIssued(), (*inst_list_it)->isSquashed());
         inst_list_it++;
         ++num;
     }
@@ -492,7 +477,7 @@ FullBetaCPU<Impl>::dumpInsts()
 
 template <class Impl>
 void
-FullBetaCPU<Impl>::wakeDependents(DynInst *inst)
+FullBetaCPU<Impl>::wakeDependents(DynInstPtr &inst)
 {
     iew.wakeDependents(inst);
 }
diff --git a/cpu/beta_cpu/full_cpu.hh b/cpu/beta_cpu/full_cpu.hh
index 00ff1f878..cf753ad67 100644
--- a/cpu/beta_cpu/full_cpu.hh
+++ b/cpu/beta_cpu/full_cpu.hh
@@ -16,6 +16,7 @@
 #include "base/statistics.hh"
 #include "base/timebuf.hh"
 #include "cpu/base_cpu.hh"
+#include "cpu/exec_context.hh"
 #include "cpu/beta_cpu/cpu_policy.hh"
 #include "sim/process.hh"
 
@@ -28,17 +29,32 @@ class BaseFullCPU : public BaseCPU
 {
     //Stuff that's pretty ISA independent will go here.
   public:
+    class Params
+    {
+      public:
+#ifdef FULL_SYSTEM
+        std::string name;
+        int numberOfThreads;
+        Counter maxInstsAnyThread;
+        Counter maxInstsAllThreads;
+        Counter maxLoadsAnyThread;
+        Counter maxLoadsAllThreads;
+        System *_system;
+        Tick freq;
+#else
+        std::string name;
+        int numberOfThreads;
+        Counter maxInstsAnyThread;
+        Counter maxInstsAllThreads;
+        Counter maxLoadsAnyThread;
+        Counter maxLoadsAllThreads;
+#endif // FULL_SYSTEM
+    };
+
 #ifdef FULL_SYSTEM
-    BaseFullCPU(const std::string &_name, int _number_of_threads,
-                Counter max_insts_any_thread, Counter max_insts_all_threads,
-                Counter max_loads_any_thread, Counter max_loads_all_threads,
-                System *_system, Tick freq);
+    BaseFullCPU(Params &params);
 #else
-    BaseFullCPU(const std::string &_name, int _number_of_threads,
-                Counter max_insts_any_thread = 0,
-                Counter max_insts_all_threads = 0,
-                Counter max_loads_any_thread = 0,
-                Counter max_loads_all_threads = 0);
+    BaseFullCPU(Params &params);
 #endif // FULL_SYSTEM
 };
 
@@ -49,7 +65,7 @@ class FullBetaCPU : public BaseFullCPU
     //Put typedefs from the Impl here.
     typedef typename Impl::CPUPol CPUPolicy;
     typedef typename Impl::Params Params;
-    typedef typename Impl::DynInst DynInst;
+    typedef typename Impl::DynInstPtr DynInstPtr;
 
   public:
     enum Status {
@@ -162,7 +178,7 @@ class FullBetaCPU : public BaseFullCPU
     /** Function to add instruction onto the head of the list of the
      *  instructions.  Used when new instructions are fetched.
      */
-    void addInst(DynInst *inst);
+    void addInst(DynInstPtr &inst);
 
     /** Function to tell the CPU that an instruction has completed. */
     void instDone();
@@ -175,7 +191,7 @@ class FullBetaCPU : public BaseFullCPU
      *  @todo: Remove only up until that inst?  Squashed inst is most likely
      *  valid.
      */
-    void removeBackInst(DynInst *inst);
+    void removeBackInst(DynInstPtr &inst);
 
     /** Remove an instruction from the front of the list.  It is expected
      *  that there are no instructions in front of it (that is, none are older
@@ -184,7 +200,7 @@ class FullBetaCPU : public BaseFullCPU
      *  last instruction once it's verified that commit has the same ordering
      *  as the instruction list.
      */
-    void removeFrontInst(DynInst *inst);
+    void removeFrontInst(DynInstPtr &inst);
 
     /** Remove all instructions that are not currently in the ROB. */
     void removeInstsNotInROB();
@@ -198,11 +214,11 @@ class FullBetaCPU : public BaseFullCPU
      *  commit can tell the instruction queue that they have completed.
      *  Eventually this hack should be removed.
      */
-    void wakeDependents(DynInst *inst);
+    void wakeDependents(DynInstPtr &inst);
 
   public:
     /** List of all the instructions in flight. */
-    list<DynInst *> instList;
+    list<DynInstPtr> instList;
 
     //not sure these should be private.
   protected:
@@ -255,15 +271,15 @@ class FullBetaCPU : public BaseFullCPU
     /** Typedefs from the Impl to get the structs that each of the
      *  time buffers should use.
      */
-    typedef typename Impl::TimeStruct TimeStruct;
+    typedef typename CPUPolicy::TimeStruct TimeStruct;
 
-    typedef typename Impl::FetchStruct FetchStruct;
+    typedef typename CPUPolicy::FetchStruct FetchStruct;
 
-    typedef typename Impl::DecodeStruct DecodeStruct;
+    typedef typename CPUPolicy::DecodeStruct DecodeStruct;
 
-    typedef typename Impl::RenameStruct RenameStruct;
+    typedef typename CPUPolicy::RenameStruct RenameStruct;
 
-    typedef typename Impl::IEWStruct IEWStruct;
+    typedef typename CPUPolicy::IEWStruct IEWStruct;
 
     /** The main time buffer to do backwards communication. */
     TimeBuffer<TimeStruct> timeBuffer;
diff --git a/cpu/beta_cpu/iew.hh b/cpu/beta_cpu/iew.hh
index 52b9ccdb0..de408ef0c 100644
--- a/cpu/beta_cpu/iew.hh
+++ b/cpu/beta_cpu/iew.hh
@@ -1,13 +1,10 @@
-//Todo: Update with statuses.  Create constructor.  Fix up time buffer stuff.
-//Will also need a signal heading back at least one stage to rename to say
-//how many empty skid buffer entries there are.  Perhaps further back even.
+//Todo: Update with statuses.
 //Need to handle delaying writes to the writeback bus if it's full at the
-//given time.  Squash properly.  Load store queue.
+//given time.  Load store queue.
 
 #ifndef __SIMPLE_IEW_HH__
 #define __SIMPLE_IEW_HH__
 
-// To include: time buffer, structs, queue,
 #include <queue>
 
 #include "base/timebuf.hh"
@@ -22,16 +19,18 @@ class SimpleIEW
   private:
     //Typedefs from Impl
     typedef typename Impl::ISA ISA;
-    typedef typename Impl::DynInst DynInst;
+    typedef typename Impl::CPUPol CPUPol;
+    typedef typename Impl::DynInstPtr DynInstPtr;
     typedef typename Impl::FullCPU FullCPU;
     typedef typename Impl::Params Params;
 
-    typedef typename Impl::CPUPol::RenameMap RenameMap;
+    typedef typename CPUPol::RenameMap RenameMap;
+    typedef typename CPUPol::LDSTQ LDSTQ;
 
-    typedef typename Impl::TimeStruct TimeStruct;
-    typedef typename Impl::IEWStruct IEWStruct;
-    typedef typename Impl::RenameStruct RenameStruct;
-    typedef typename Impl::IssueStruct IssueStruct;
+    typedef typename CPUPol::TimeStruct TimeStruct;
+    typedef typename CPUPol::IEWStruct IEWStruct;
+    typedef typename CPUPol::RenameStruct RenameStruct;
+    typedef typename CPUPol::IssueStruct IssueStruct;
 
   public:
     enum Status {
@@ -51,7 +50,7 @@ class SimpleIEW
   public:
     void squash();
 
-    void squash(DynInst *inst);
+    void squash(DynInstPtr &inst);
 
     void block();
 
@@ -70,7 +69,7 @@ class SimpleIEW
 
     void setRenameMap(RenameMap *rm_ptr);
 
-    void wakeDependents(DynInst *inst);
+    void wakeDependents(DynInstPtr &inst);
 
     void tick();
 
@@ -111,11 +110,13 @@ class SimpleIEW
     //Will need internal queue to hold onto instructions coming from
     //the rename stage in case of a stall.
     /** Skid buffer between rename and IEW. */
-    queue<RenameStruct> skidBuffer;
+    std::queue<RenameStruct> skidBuffer;
 
     /** Instruction queue. */
     IQ instQueue;
 
+    LDSTQ ldstQueue;
+
     /** Pointer to rename map.  Might not want this stage to directly
      *  access this though...
      */
diff --git a/cpu/beta_cpu/iew_impl.hh b/cpu/beta_cpu/iew_impl.hh
index b198220f5..521ce77f6 100644
--- a/cpu/beta_cpu/iew_impl.hh
+++ b/cpu/beta_cpu/iew_impl.hh
@@ -3,8 +3,8 @@
 // communication happens simultaneously.  Might not be that bad really...
 // it might skew stats a bit though.  Issue would otherwise try to issue
 // instructions that would never be executed if there were a delay; without
-// it issue will simply squash.  Make this stage block properly.  Make this
-// stage delay after a squash properly.  Update the statuses for each stage.
+// it issue will simply squash.  Make this stage block properly.
+// Update the statuses for each stage.
 // Actually read instructions out of the skid buffer.
 
 #include <queue>
@@ -15,8 +15,9 @@
 template<class Impl, class IQ>
 SimpleIEW<Impl, IQ>::SimpleIEW(Params &params)
     : // Just make this time buffer really big for now
-      issueToExecQueue(20, 20),
+      issueToExecQueue(5, 5),
       instQueue(params),
+      ldstQueue(params),
       commitToIEWDelay(params.commitToIEWDelay),
       renameToIEWDelay(params.renameToIEWDelay),
       issueToExecuteDelay(params.issueToExecuteDelay),
@@ -45,6 +46,7 @@ SimpleIEW<Impl, IQ>::setCPU(FullCPU *cpu_ptr)
     cpu = cpu_ptr;
 
     instQueue.setCPU(cpu_ptr);
+    ldstQueue.setCPU(cpu_ptr);
 }
 
 template<class Impl, class IQ>
@@ -96,7 +98,7 @@ SimpleIEW<Impl, IQ>::setRenameMap(RenameMap *rm_ptr)
 
 template<class Impl, class IQ>
 void
-SimpleIEW<Impl, IQ>::wakeDependents(DynInst *inst)
+SimpleIEW<Impl, IQ>::wakeDependents(DynInstPtr &inst)
 {
     instQueue.wakeDependents(inst);
 }
@@ -150,17 +152,15 @@ SimpleIEW<Impl, IQ>::squash()
     // Tell the IQ to start squashing.
     instQueue.squash();
 
-    // Tell rename to squash through the time buffer.
-    // This communication may be redundant depending upon where squash()
-    // is called.
-//    toRename->iewInfo.squash = true;
+    // Tell the LDSTQ to start squashing.
+    ldstQueue.squash(fromCommit->commitInfo.doneSeqNum);
 }
 
 template<class Impl, class IQ>
 void
-SimpleIEW<Impl, IQ>::squash(DynInst *inst)
+SimpleIEW<Impl, IQ>::squash(DynInstPtr &inst)
 {
-    DPRINTF(IEW, "IEW: Squashing from a specific instruction, PC:%#x.\n",
+    DPRINTF(IEW, "IEW: Squashing from a specific instruction, PC: %#x.\n",
             inst->PC);
     // Perhaps leave the squashing up to the ROB stage to tell it when to
     // squash?
@@ -170,8 +170,11 @@ SimpleIEW<Impl, IQ>::squash(DynInst *inst)
     toRename->iewInfo.squash = true;
     // Also send PC update information back to prior stages.
     toRename->iewInfo.squashedSeqNum = inst->seqNum;
+    toRename->iewInfo.mispredPC = inst->readPC();
     toRename->iewInfo.nextPC = inst->readCalcTarg();
-    toRename->iewInfo.predIncorrect = true;
+    toRename->iewInfo.branchMispredict = true;
+    // Prediction was incorrect, so send back inverse.
+    toRename->iewInfo.branchTaken = !(inst->predTaken());
 }
 
 template<class Impl, class IQ>
@@ -229,7 +232,7 @@ SimpleIEW<Impl, IQ>::tick()
 
         // If there's still instructions coming from rename, continue to
         // put them on the skid buffer.
-        if (fromRename->insts[0] != NULL) {
+        if (fromRename->insts[0]) {
             block();
         }
 
@@ -244,6 +247,19 @@ SimpleIEW<Impl, IQ>::tick()
     // Write back number of free IQ entries here.
     toRename->iewInfo.freeIQEntries = instQueue.numFreeEntries();
 
+    // Check the committed load/store signals to see if there's a load
+    // or store to commit.  Also check if it's being told to execute a
+    // nonspeculative instruction.
+    if (fromCommit->commitInfo.commitIsStore) {
+        ldstQueue.commitStores(fromCommit->commitInfo.doneSeqNum);
+    } else if (fromCommit->commitInfo.commitIsLoad) {
+        ldstQueue.commitLoads(fromCommit->commitInfo.doneSeqNum);
+    }
+
+    if (fromCommit->commitInfo.nonSpecSeqNum != 0) {
+        instQueue.scheduleNonSpec(fromCommit->commitInfo.nonSpecSeqNum);
+    }
+
     DPRINTF(IEW, "IEW: IQ has %i free entries.\n",
             instQueue.numFreeEntries());
 }
@@ -265,7 +281,7 @@ SimpleIEW<Impl, IQ>::iew()
     }
 
     ////////////////////////////////////////
-    //ISSUE stage
+    // DISPATCH/ISSUE stage
     ////////////////////////////////////////
 
     //Put into its own function?
@@ -273,16 +289,16 @@ SimpleIEW<Impl, IQ>::iew()
 
     // Check if there are any instructions coming from rename, and we're.
     // not squashing.
-    if (fromRename->insts[0] != NULL && _status != Squashing) {
+    if (fromRename->insts[0] && _status != Squashing) {
 
         // Loop through the instructions, putting them in the instruction
         // queue.
         for (int inst_num = 0; inst_num < issueReadWidth; ++inst_num)
         {
-            DynInst *inst = fromRename->insts[inst_num];
+            DynInstPtr inst = fromRename->insts[inst_num];
 
             // Make sure there's a valid instruction there.
-            if (inst == NULL)
+            if (!inst)
                 break;
 
             DPRINTF(IEW, "IEW: Issue: Adding PC %#x to IQ.\n",
@@ -294,25 +310,38 @@ SimpleIEW<Impl, IQ>::iew()
             // Be sure to mark these instructions as ready so that the
             // commit stage can go ahead and execute them, and mark
             // them as issued so the IQ doesn't reprocess them.
-            if (inst->isMemRef()) {
+            if (inst->isSquashed()) {
+                continue;
+            } else if (inst->isLoad()) {
                 DPRINTF(IEW, "IEW: Issue: Memory instruction "
-                             "encountered, skipping.\n");
+                        "encountered, adding to LDSTQ.\n");
 
-                inst->setIssued();
-                inst->setExecuted();
+                // Reserve a spot in the load store queue for this
+                // memory access.
+                ldstQueue.insertLoad(inst);
+
+            } else if (inst->isStore()) {
+                ldstQueue.insertStore(inst);
+
+                // A bit of a hack.  Set that it can commit so that
+                // the commit stage will try committing it, and then
+                // once commit realizes it's a store it will send back
+                // a signal to this stage to issue and execute that
+                // store.
                 inst->setCanCommit();
 
-                instQueue.advanceTail(inst);
+                instQueue.insertNonSpec(inst);
                 continue;
             } else if (inst->isNonSpeculative()) {
                 DPRINTF(IEW, "IEW: Issue: Nonspeculative instruction "
                         "encountered, skipping.\n");
 
-                inst->setIssued();
-                inst->setExecuted();
+                // Same hack as with stores.
                 inst->setCanCommit();
 
-                instQueue.advanceTail(inst);
+                // Specifically insert it as nonspeculative.
+                instQueue.insertNonSpec(inst);
+
                 continue;
             } else if (inst->isNop()) {
                 DPRINTF(IEW, "IEW: Issue: Nop instruction encountered "
@@ -355,6 +384,7 @@ SimpleIEW<Impl, IQ>::iew()
     // @todo: Move to the FU pool used in the current full cpu.
 
     int fu_usage = 0;
+    bool fetch_redirect = false;
 
     // Execute/writeback any instructions that are available.
     for (int inst_num = 0;
@@ -365,26 +395,48 @@ SimpleIEW<Impl, IQ>::iew()
         DPRINTF(IEW, "IEW: Execute: Executing instructions from IQ.\n");
 
         // Get instruction from issue's queue.
-        DynInst *inst = fromIssue->insts[inst_num];
+        DynInstPtr inst = fromIssue->insts[inst_num];
 
         DPRINTF(IEW, "IEW: Execute: Processing PC %#x.\n", inst->readPC());
 
-        inst->setExecuted();
-
         // Check if the instruction is squashed; if so then skip it
         // and don't count it towards the FU usage.
         if (inst->isSquashed()) {
             DPRINTF(IEW, "IEW: Execute: Instruction was squashed.\n");
+
+            // Consider this instruction executed so that commit can go
+            // ahead and retire the instruction.
+            inst->setExecuted();
+
+            toCommit->insts[inst_num] = inst;
+
             continue;
         }
 
+        inst->setExecuted();
+
         // If an instruction is executed, then count it towards FU usage.
         ++fu_usage;
 
         // Execute instruction.
         // Note that if the instruction faults, it will be handled
         // at the commit stage.
-        inst->execute();
+        if (inst->isMemRef()) {
+            DPRINTF(IEW, "IEW: Execute: Calculating address for memory "
+                    "reference.\n");
+
+            // Tell the LDSTQ to execute this instruction (if it is a load).
+            if (inst->isLoad()) {
+                ldstQueue.executeLoad(inst);
+            } else if (inst->isStore()) {
+                ldstQueue.executeStore();
+            } else {
+                panic("IEW: Unexpected memory type!\n");
+            }
+
+        } else {
+            inst->execute();
+        }
 
         // First check the time slot that this instruction will write
         // to.  If there are free write ports at the time, then go ahead
@@ -401,16 +453,34 @@ SimpleIEW<Impl, IQ>::iew()
         // Check if branch was correct.  This check happens after the
         // instruction is added to the queue because even if the branch
         // is mispredicted, the branch instruction itself is still valid.
-        if (inst->mispredicted()) {
-            DPRINTF(IEW, "IEW: Execute: Branch mispredict detected.\n");
-            DPRINTF(IEW, "IEW: Execute: Redirecting fetch to PC: %#x.\n",
-                    inst->nextPC);
+        // Only handle this if there hasn't already been something that
+        // redirects fetch in this group of instructions.
+        if (!fetch_redirect) {
+            if (inst->mispredicted()) {
+                fetch_redirect = true;
+
+                DPRINTF(IEW, "IEW: Execute: Branch mispredict detected.\n");
+                DPRINTF(IEW, "IEW: Execute: Redirecting fetch to PC: %#x.\n",
+                        inst->nextPC);
+
+                // If incorrect, then signal the ROB that it must be squashed.
+                squash(inst);
+            } else if (ldstQueue.violation()) {
+                fetch_redirect = true;
+
+                DynInstPtr violator = ldstQueue.getMemDepViolator();
 
-            // If incorrect, then signal the ROB that it must be squashed.
-            squash(inst);
+                DPRINTF(IEW, "IEW: LDSTQ detected a violation.  Violator PC: "
+                        "%#x, inst PC: %#x.  Addr is: %#x.\n",
+                        violator->readPC(), inst->readPC(), inst->physEffAddr);
 
-            // Not sure it really needs to break.
-//            break;
+                instQueue.violation(inst, violator);
+
+                squash(inst);
+                // The LDSTQ reported a memory ordering violation, so the
+                // IQ has been told about the violation and the ROB has
+                // been signaled that it must be squashed.
+            }
         }
     }
 
@@ -422,18 +492,20 @@ SimpleIEW<Impl, IQ>::iew()
     // Either have IEW have direct access to rename map, or have this as
     // part of backwards communication.
     for (int inst_num = 0; inst_num < executeWidth &&
-             toCommit->insts[inst_num] != NULL; inst_num++)
+             toCommit->insts[inst_num]; inst_num++)
     {
-        DynInst *inst = toCommit->insts[inst_num];
+        DynInstPtr inst = toCommit->insts[inst_num];
 
         DPRINTF(IEW, "IEW: Sending instructions to commit, PC %#x.\n",
                 inst->readPC());
 
-        instQueue.wakeDependents(inst);
+        if(!inst->isSquashed()) {
+            instQueue.wakeDependents(inst);
 
-        for (int i = 0; i < inst->numDestRegs(); i++)
-        {
-            renameMap->markAsReady(inst->renamedDestRegIdx(i));
+            for (int i = 0; i < inst->numDestRegs(); i++)
+            {
+                renameMap->markAsReady(inst->renamedDestRegIdx(i));
+            }
         }
     }
 
diff --git a/cpu/beta_cpu/inst_queue.hh b/cpu/beta_cpu/inst_queue.hh
index 5741bfcf5..a170979cb 100644
--- a/cpu/beta_cpu/inst_queue.hh
+++ b/cpu/beta_cpu/inst_queue.hh
@@ -2,12 +2,13 @@
 #define __INST_QUEUE_HH__
 
 #include <list>
+#include <map>
 #include <queue>
 #include <stdint.h>
+#include <vector>
 
 #include "base/timebuf.hh"
-
-using namespace std;
+#include "cpu/inst_seq.hh"
 
 //Perhaps have a better separation between the data structure underlying
 //and the actual algorithm.
@@ -24,48 +25,53 @@ using namespace std;
  * and 96-191 are fp).  This remains true even for both logical and
  * physical register indices.
  */
-template<class Impl>
+template <class Impl>
 class InstructionQueue
 {
   public:
     //Typedefs from the Impl.
     typedef typename Impl::FullCPU FullCPU;
-    typedef typename Impl::DynInst DynInst;
+    typedef typename Impl::DynInstPtr DynInstPtr;
     typedef typename Impl::Params Params;
 
-    typedef typename Impl::IssueStruct IssueStruct;
-    typedef typename Impl::TimeStruct TimeStruct;
+    typedef typename Impl::CPUPol::MemDepUnit MemDepUnit;
+    typedef typename Impl::CPUPol::IssueStruct IssueStruct;
+    typedef typename Impl::CPUPol::TimeStruct TimeStruct;
 
     // Typedef of iterator through the list of instructions.  Might be
     // better to untie this from the FullCPU or pass its information to
     // the stages.
-    typedef typename list<DynInst *>::iterator ListIt;
+    typedef typename std::list<DynInstPtr>::iterator ListIt;
 
     /**
-     * Class for priority queue entries.  Mainly made so that the < operator
-     * is defined.
+     * Struct for comparing entries to be added to the priority queue.  This
+     * gives reverse ordering to the instructions in terms of sequence
+     * numbers: the instructions with smaller sequence numbers (which are
+     * hence older) will be at the top of the priority queue.
      */
-    struct ReadyEntry {
-        DynInst *inst;
-
-        ReadyEntry(DynInst *_inst)
-            : inst(_inst)
-        { }
-
-        /** Compare(lhs,rhs) checks if rhs is "bigger" than lhs.  If so, rhs
-         *  goes higher on the priority queue.  The oldest instruction should
-         *  be on the top of the instruction queue, so in this case "bigger"
-         *  has the reverse meaning; the instruction with the lowest
-         *  sequence number is on the top.
-         */
-        bool operator <(const ReadyEntry &rhs) const
+    struct pqCompare
+    {
+        bool operator() (const DynInstPtr &lhs, const DynInstPtr &rhs) const
         {
-            if (this->inst->seqNum > rhs.inst->seqNum)
-                return true;
-            return false;
+            return lhs->seqNum > rhs->seqNum;
         }
     };
 
+    /**
+     * Struct for comparing entries to be added to the set.  This gives
+     * standard ordering in terms of sequence numbers.
+     */
+    struct setCompare
+    {
+        bool operator() (const DynInstPtr &lhs, const DynInstPtr &rhs) const
+        {
+            return lhs->seqNum < rhs->seqNum;
+        }
+    };
+
+    typedef std::priority_queue<DynInstPtr, vector<DynInstPtr>, pqCompare>
+    ReadyInstQueue;
+
     InstructionQueue(Params &params);
 
     void setCPU(FullCPU *cpu);
@@ -78,20 +84,32 @@ class InstructionQueue
 
     bool isFull();
 
-    void insert(DynInst *new_inst);
+    void insert(DynInstPtr &new_inst);
 
-    void advanceTail(DynInst *inst);
+    void insertNonSpec(DynInstPtr &new_inst);
+
+    void advanceTail(DynInstPtr &inst);
 
     void scheduleReadyInsts();
 
-    void wakeDependents(DynInst *completed_inst);
+    void scheduleNonSpec(const InstSeqNum &inst);
 
-    void doSquash();
+    void wakeDependents(DynInstPtr &completed_inst);
+
+    void violation(DynInstPtr &store, DynInstPtr &faulting_load);
 
     void squash();
 
+    void doSquash();
+
     void stopSquash();
 
+    /** Debugging function to dump all the list sizes, as well as print
+     *  out the list of nonspeculative instructions.  Should not be used
+     *  in any other capacity, but it has no harmful side effects.
+     */
+    void dumpLists();
+
   private:
     /** Debugging function to count how many entries are in the IQ.  It does
      *  a linear walk through the instructions, so do not call this function
@@ -103,6 +121,11 @@ class InstructionQueue
     /** Pointer to the CPU. */
     FullCPU *cpu;
 
+    /** The memory dependence unit, which tracks/predicts memory dependences
+     *  between instructions.
+     */
+    MemDepUnit memDepUnit;
+
     /** The queue to the execute stage.  Issued instructions will be written
      *  into it.
      */
@@ -118,26 +141,46 @@ class InstructionQueue
         Int,
         Float,
         Branch,
+        Memory,
+        Misc,
         Squashed,
         None
     };
 
     /** List of ready int instructions.  Used to keep track of the order in
-     *  which */
-    priority_queue<ReadyEntry> readyIntInsts;
+     *  which instructions should issue.
+     */
+    ReadyInstQueue readyIntInsts;
 
     /** List of ready floating point instructions. */
-    priority_queue<ReadyEntry> readyFloatInsts;
+    ReadyInstQueue readyFloatInsts;
 
     /** List of ready branch instructions. */
-    priority_queue<ReadyEntry> readyBranchInsts;
+    ReadyInstQueue readyBranchInsts;
+
+    /** List of ready memory instructions. */
+    ReadyInstQueue readyMemInsts;
+
+    /** List of ready miscellaneous instructions. */
+    ReadyInstQueue readyMiscInsts;
 
     /** List of squashed instructions (which are still valid and in IQ).
      *  Implemented using a priority queue; the entries must contain both
      *  the IQ index and sequence number of each instruction so that
      *  ordering based on sequence numbers can be used.
      */
-    priority_queue<ReadyEntry> squashedInsts;
+    ReadyInstQueue squashedInsts;
+
+    /** List of non-speculative instructions that will be scheduled
+     *  once the IQ gets a signal from commit.  While it's redundant to
+     *  have the key be a part of the value (the sequence number is stored
+     *  inside of DynInst), when these instructions are woken up only
+     *  the sequence number will be available.  Thus it is necessary to be
+     *  able to search by the sequence number alone.
+     */
+    std::map<InstSeqNum, DynInstPtr> nonSpecInsts;
+
+    typedef typename std::map<InstSeqNum, DynInstPtr>::iterator non_spec_it_t;
 
     /** Number of free IQ entries left. */
     unsigned freeEntries;
@@ -158,6 +201,9 @@ class InstructionQueue
     /** The number of branches that can be issued in one cycle. */
     unsigned branchWidth;
 
+    /** The number of memory instructions that can be issued in one cycle. */
+    unsigned memoryWidth;
+
     /** The total number of instructions that can be issued in one cycle. */
     unsigned totalWidth;
 
@@ -183,7 +229,7 @@ class InstructionQueue
     InstSeqNum squashedSeqNum;
 
     /** Iterator that points to the oldest instruction in the IQ. */
-    ListIt head;
+//    ListIt head;
 
     /** Iterator that points to the youngest instruction in the IQ. */
     ListIt tail;
@@ -200,7 +246,7 @@ class InstructionQueue
     class DependencyEntry
     {
       public:
-        DynInst *inst;
+        DynInstPtr inst;
         //Might want to include data about what arch. register the
         //dependence is waiting on.
         DependencyEntry *next;
@@ -212,9 +258,9 @@ class InstructionQueue
         //away.  So for now it will sit here, within the IQ, until
         //a better implementation is decided upon.
         // This function probably shouldn't be within the entry...
-        void insert(DynInst *new_inst);
+        void insert(DynInstPtr &new_inst);
 
-        void remove(DynInst *inst_to_remove);
+        void remove(DynInstPtr &inst_to_remove);
     };
 
     /** Array of linked lists.  Each linked list is a list of all the
@@ -233,11 +279,12 @@ class InstructionQueue
      */
     vector<bool> regScoreboard;
 
-    bool addToDependents(DynInst *new_inst);
-    void insertDependency(DynInst *new_inst);
-    void createDependency(DynInst *new_inst);
+    bool addToDependents(DynInstPtr &new_inst);
+    void insertDependency(DynInstPtr &new_inst);
+    void createDependency(DynInstPtr &new_inst);
+    void dumpDependGraph();
 
-    void addIfReady(DynInst *inst);
+    void addIfReady(DynInstPtr &inst);
 };
 
 #endif //__INST_QUEUE_HH__
diff --git a/cpu/beta_cpu/inst_queue_impl.hh b/cpu/beta_cpu/inst_queue_impl.hh
index 6f1f06858..03e3fed33 100644
--- a/cpu/beta_cpu/inst_queue_impl.hh
+++ b/cpu/beta_cpu/inst_queue_impl.hh
@@ -1,11 +1,8 @@
 #ifndef __INST_QUEUE_IMPL_HH__
 #define __INST_QUEUE_IMPL_HH__
 
-// Todo: Fix up consistency errors about back of the ready list being
-// the oldest instructions in the queue.  When woken up from the dependency
-// graph they will be the oldest, but when they are immediately executable
-// newer instructions will mistakenly get inserted onto the back.  Also
-// current ordering allows for 0 cycle added-to-scheduled.  Could maybe fake
+// Todo:
+// Current ordering allows for 0 cycle added-to-scheduled.  Could maybe fake
 // it; either do in reverse order, or have added instructions put into a
 // different ready queue that, in scheduleRreadyInsts(), gets put onto the
 // normal ready queue.  This would however give only a one cycle delay,
@@ -21,18 +18,21 @@
 // Blatant hack to avoid compile warnings.
 const InstSeqNum MaxInstSeqNum = 0 - 1;
 
-template<class Impl>
+template <class Impl>
 InstructionQueue<Impl>::InstructionQueue(Params &params)
-    : numEntries(params.numIQEntries),
+    : memDepUnit(params),
+      numEntries(params.numIQEntries),
       intWidth(params.executeIntWidth),
       floatWidth(params.executeFloatWidth),
+      totalWidth(params.issueWidth),
       numPhysIntRegs(params.numPhysIntRegs),
       numPhysFloatRegs(params.numPhysFloatRegs),
       commitToIEWDelay(params.commitToIEWDelay)
 {
     // HACK: HARDCODED NUMBER.  REMOVE LATER AND ADD TO PARAMETER.
-    totalWidth = 1;
     branchWidth = 1;
+    memoryWidth = 1;
+
     DPRINTF(IQ, "IQ: Int width is %i.\n", params.executeIntWidth);
 
     // Initialize the number of free IQ entries.
@@ -66,7 +66,7 @@ InstructionQueue<Impl>::InstructionQueue(Params &params)
 
 }
 
-template<class Impl>
+template <class Impl>
 void
 InstructionQueue<Impl>::setCPU(FullCPU *cpu_ptr)
 {
@@ -75,7 +75,7 @@ InstructionQueue<Impl>::setCPU(FullCPU *cpu_ptr)
     tail = cpu->instList.begin();
 }
 
-template<class Impl>
+template <class Impl>
 void
 InstructionQueue<Impl>::setIssueToExecuteQueue(
                         TimeBuffer<IssueStruct> *i2e_ptr)
@@ -84,7 +84,7 @@ InstructionQueue<Impl>::setIssueToExecuteQueue(
     issueToExecuteQueue = i2e_ptr;
 }
 
-template<class Impl>
+template <class Impl>
 void
 InstructionQueue<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr)
 {
@@ -96,7 +96,7 @@ InstructionQueue<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr)
 
 // Might want to do something more complex if it knows how many instructions
 // will be issued this cycle.
-template<class Impl>
+template <class Impl>
 bool
 InstructionQueue<Impl>::isFull()
 {
@@ -107,16 +107,16 @@ InstructionQueue<Impl>::isFull()
     }
 }
 
-template<class Impl>
+template <class Impl>
 unsigned
 InstructionQueue<Impl>::numFreeEntries()
 {
     return freeEntries;
 }
 
-template<class Impl>
+template <class Impl>
 void
-InstructionQueue<Impl>::insert(DynInst *new_inst)
+InstructionQueue<Impl>::insert(DynInstPtr &new_inst)
 {
     // Make sure the instruction is valid
     assert(new_inst);
@@ -157,18 +157,78 @@ InstructionQueue<Impl>::insert(DynInst *new_inst)
     // register(s).
     createDependency(new_inst);
 
+    // If it's a memory instruction, add it to the memory dependency
+    // unit.
+    if (new_inst->isMemRef()) {
+        memDepUnit.insert(new_inst);
+    }
+
     // If the instruction is ready then add it to the ready list.
     addIfReady(new_inst);
 
     assert(freeEntries == (numEntries - countInsts()));
 }
 
+template <class Impl>
+void
+InstructionQueue<Impl>::insertNonSpec(DynInstPtr &inst)
+{
+    // Make sure the instruction is valid before it is dereferenced.
+    assert(inst);
+
+    // @todo: Clean up this code; can do it by setting inst as unable
+    // to issue, then calling normal insert on the inst.
+
+    // Track it by sequence number so scheduleNonSpec() can find it.
+    nonSpecInsts[inst->seqNum] = inst;
+
+    DPRINTF(IQ, "IQ: Adding instruction PC %#x to the IQ.\n",
+            inst->readPC());
+
+    // Check if there are any free entries.  Might want to have this
+    // return a fault in the future instead of panicking.
+    assert(freeEntries != 0);
+
+    // If the IQ currently has nothing in it, then there's a possibility
+    // that the tail iterator is invalid (might have been pointing at an
+    // instruction that was retired).  Reset the tail iterator.
+    if (freeEntries == numEntries) {
+        tail = cpu->instList.begin();
+    }
+
+    // Move the tail iterator.  Instructions may not have been issued
+    // to the IQ, so we may have to increment the iterator more than once.
+    while ((*tail) != inst) {
+        ++tail;
+
+        // Make sure the tail iterator points at something legal.
+        assert(tail != cpu->instList.end());
+    }
+
+    // Decrease the number of free entries.
+    --freeEntries;
+
+    // Source-register dependencies are not recorded for these instructions
+    // (the call below is disabled).
+//    addToDependents(inst);
+
+    // Have this instruction set itself as the producer of its destination
+    // register(s).
+    createDependency(inst);
+
+    // If it's a memory instruction, add it to the memory dependency
+    // unit.  Unlike insert(), addIfReady() is not called here.
+    if (inst->isMemRef()) {
+        memDepUnit.insert(inst);
+    }
+}
+
 // Slightly hack function to advance the tail iterator in the case that
 // the IEW stage issues an instruction that is not added to the IQ.  This
 // is needed in case a long chain of such instructions occurs.
-template<class Impl>
+template <class Impl>
 void
-InstructionQueue<Impl>::advanceTail(DynInst *inst)
+InstructionQueue<Impl>::advanceTail(DynInstPtr &inst)
 {
     // Make sure the instruction is valid
     assert(inst);
@@ -205,10 +265,11 @@ InstructionQueue<Impl>::advanceTail(DynInst *inst)
 }
 
 // Need to make sure the number of float and integer instructions
-// issued does not exceed the total issue bandwidth.  Probably should
-// have some sort of limit of total number of branches that can be issued
-// as well.
-template<class Impl>
+// issued does not exceed the total issue bandwidth.
+// @todo: Figure out a better way to remove the squashed items from the
+// lists.  Checking the top item of each list to see if it's squashed
+// wastes time and forces jumps.
+template <class Impl>
 void
 InstructionQueue<Impl>::scheduleReadyInsts()
 {
@@ -218,6 +279,7 @@ InstructionQueue<Impl>::scheduleReadyInsts()
     int int_issued = 0;
     int float_issued = 0;
     int branch_issued = 0;
+    int memory_issued = 0;
     int squashed_issued = 0;
     int total_issued = 0;
 
@@ -226,6 +288,8 @@ InstructionQueue<Impl>::scheduleReadyInsts()
     bool insts_available = !readyBranchInsts.empty() ||
         !readyIntInsts.empty() ||
         !readyFloatInsts.empty() ||
+        !readyMemInsts.empty() ||
+        !readyMiscInsts.empty() ||
         !squashedInsts.empty();
 
     // Note: Requires a globally defined constant.
@@ -233,10 +297,12 @@ InstructionQueue<Impl>::scheduleReadyInsts()
     InstList list_with_oldest = None;
 
     // Temporary values.
-    DynInst *int_head_inst;
-    DynInst *float_head_inst;
-    DynInst *branch_head_inst;
-    DynInst *squashed_head_inst;
+    DynInstPtr int_head_inst;
+    DynInstPtr float_head_inst;
+    DynInstPtr branch_head_inst;
+    DynInstPtr mem_head_inst;
+    DynInstPtr misc_head_inst;
+    DynInstPtr squashed_head_inst;
 
     // Somewhat nasty code to look at all of the lists where issuable
     // instructions are located, and choose the oldest instruction among
@@ -257,7 +323,7 @@ InstructionQueue<Impl>::scheduleReadyInsts()
 
             insts_available = true;
 
-            int_head_inst = readyIntInsts.top().inst;
+            int_head_inst = readyIntInsts.top();
 
             if (int_head_inst->isSquashed()) {
                 readyIntInsts.pop();
@@ -274,7 +340,7 @@ InstructionQueue<Impl>::scheduleReadyInsts()
 
             insts_available = true;
 
-            float_head_inst = readyFloatInsts.top().inst;
+            float_head_inst = readyFloatInsts.top();
 
             if (float_head_inst->isSquashed()) {
                 readyFloatInsts.pop();
@@ -291,7 +357,7 @@ InstructionQueue<Impl>::scheduleReadyInsts()
 
             insts_available = true;
 
-            branch_head_inst = readyBranchInsts.top().inst;
+            branch_head_inst = readyBranchInsts.top();
 
             if (branch_head_inst->isSquashed()) {
                 readyBranchInsts.pop();
@@ -304,11 +370,44 @@ InstructionQueue<Impl>::scheduleReadyInsts()
 
         }
 
+        if (!readyMemInsts.empty() &&
+            memory_issued < memoryWidth) {
+
+            insts_available = true;
+
+            mem_head_inst = readyMemInsts.top();
+
+            if (mem_head_inst->isSquashed()) {
+                readyMemInsts.pop();
+                continue;
+            } else if (mem_head_inst->seqNum < oldest_inst) {
+                oldest_inst = mem_head_inst->seqNum;
+
+                list_with_oldest = Memory;
+            }
+        }
+
+        if (!readyMiscInsts.empty()) {
+
+            insts_available = true;
+
+            misc_head_inst = readyMiscInsts.top();
+
+            if (misc_head_inst->isSquashed()) {
+                readyMiscInsts.pop();
+                continue;
+            } else if (misc_head_inst->seqNum < oldest_inst) {
+                oldest_inst = misc_head_inst->seqNum;
+
+                list_with_oldest = Misc;
+            }
+        }
+
         if (!squashedInsts.empty()) {
 
             insts_available = true;
 
-            squashed_head_inst = squashedInsts.top().inst;
+            squashed_head_inst = squashedInsts.top();
 
             if (squashed_head_inst->seqNum < oldest_inst) {
                 list_with_oldest = Squashed;
@@ -316,13 +415,14 @@ InstructionQueue<Impl>::scheduleReadyInsts()
 
         }
 
-        DynInst *issuing_inst = NULL;
+        DynInstPtr issuing_inst = NULL;
 
         switch (list_with_oldest) {
           case None:
             DPRINTF(IQ, "IQ: Not able to schedule any instructions. Issuing "
                     "inst is %#x.\n", issuing_inst);
             break;
+
           case Int:
             issuing_inst = int_head_inst;
             readyIntInsts.pop();
@@ -330,6 +430,7 @@ InstructionQueue<Impl>::scheduleReadyInsts()
             DPRINTF(IQ, "IQ: Issuing integer instruction PC %#x.\n",
                     issuing_inst->readPC());
             break;
+
           case Float:
             issuing_inst = float_head_inst;
             readyFloatInsts.pop();
@@ -337,6 +438,7 @@ InstructionQueue<Impl>::scheduleReadyInsts()
             DPRINTF(IQ, "IQ: Issuing float instruction PC %#x.\n",
                     issuing_inst->readPC());
             break;
+
           case Branch:
             issuing_inst = branch_head_inst;
             readyBranchInsts.pop();
@@ -344,6 +446,25 @@ InstructionQueue<Impl>::scheduleReadyInsts()
             DPRINTF(IQ, "IQ: Issuing branch instruction PC %#x.\n",
                     issuing_inst->readPC());
             break;
+
+          case Memory:
+            issuing_inst = mem_head_inst;
+
+            memDepUnit.issue(mem_head_inst);
+
+            readyMemInsts.pop();
+            ++memory_issued;
+            DPRINTF(IQ, "IQ: Issuing memory instruction PC %#x.\n",
+                    issuing_inst->readPC());
+            break;
+
+          case Misc:
+            issuing_inst = misc_head_inst;
+            readyMiscInsts.pop();
+            DPRINTF(IQ, "IQ: Issuing a miscellaneous instruction PC %#x.\n",
+                    issuing_inst->readPC());
+            break;
+
           case Squashed:
             issuing_inst = squashed_head_inst;
             squashedInsts.pop();
@@ -366,7 +487,52 @@ InstructionQueue<Impl>::scheduleReadyInsts()
     }
 }
 
-template<class Impl>
+template <class Impl>
+void
+InstructionQueue<Impl>::scheduleNonSpec(const InstSeqNum &inst)
+{
+    // The sequence number must name an entry in the non-speculative map.
+    non_spec_it_t entry = nonSpecInsts.find(inst);
+    assert(entry != nonSpecInsts.end());
+
+    // Allow it to issue, then offer it to the ready lists.
+    DynInstPtr &parked_inst = entry->second;
+    parked_inst->setCanIssue();
+    addIfReady(parked_inst);
+
+    // Once scheduled it no longer needs to be tracked here.
+    nonSpecInsts.erase(entry);
+}
+
+template <class Impl>
+void
+InstructionQueue<Impl>::violation(DynInstPtr &store, DynInstPtr &faulting_load)
+{
+    // Forward the offending store/load pair to the memory dependence unit.
+    memDepUnit.violation(store, faulting_load);
+}
+
+template <class Impl>
+void
+InstructionQueue<Impl>::squash()
+{
+    DPRINTF(IQ, "IQ: Starting to squash instructions in the IQ.\n");
+
+    // Begin the walk at the tail of the IQ; doSquash() steps squashIt
+    // backwards from there.
+    squashIt = tail;
+
+    // Commit reports, through the time buffer, the sequence number of
+    // the last instruction that is not to be squashed.
+    squashedSeqNum = fromCommit->commitInfo.doneSeqNum;
+
+    // Squash the IQ contents first, then mirror the squash in the
+    // memory dependence unit.
+    doSquash();
+    memDepUnit.squash(squashedSeqNum);
+}
+
+template <class Impl>
 void
 InstructionQueue<Impl>::doSquash()
 {
@@ -380,64 +546,59 @@ InstructionQueue<Impl>::doSquash()
     // Squash any instructions younger than the squashed sequence number
     // given.
     while ((*squashIt)->seqNum > squashedSeqNum) {
-        DynInst *squashed_inst = (*squashIt);
+        DynInstPtr squashed_inst = (*squashIt);
 
         // Only handle the instruction if it actually is in the IQ and
         // hasn't already been squashed in the IQ.
         if (!squashed_inst->isIssued() &&
             !squashed_inst->isSquashedInIQ()) {
             // Remove the instruction from the dependency list.
-            int8_t total_src_regs = squashed_inst->numSrcRegs();
-
-            for (int src_reg_idx = 0;
-                 src_reg_idx < total_src_regs;
-                 src_reg_idx++)
-            {
-                // Only remove it from the dependency graph if it was
-                // placed there in the first place.
-                // HACK: This assumes that instructions woken up from the
-                // dependency chain aren't informed that a specific src
-                // register has become ready.  This may not always be true
-                // in the future.
-                if (!squashed_inst->isReadySrcRegIdx(src_reg_idx)) {
-                    int8_t src_reg =
+            // Hack for now: These below don't add themselves to the
+            // dependency list, so don't try to remove them.
+            if (!squashed_inst->isNonSpeculative() &&
+                !squashed_inst->isStore()) {
+                int8_t total_src_regs = squashed_inst->numSrcRegs();
+
+                for (int src_reg_idx = 0;
+                     src_reg_idx < total_src_regs;
+                     src_reg_idx++)
+                {
+                    PhysRegIndex src_reg =
                         squashed_inst->renamedSrcRegIdx(src_reg_idx);
-                    dependGraph[src_reg].remove(squashed_inst);
+
+                    // Only remove it from the dependency graph if it was
+                    // placed there in the first place.
+                    // HACK: This assumes that instructions woken up from the
+                    // dependency chain aren't informed that a specific src
+                    // register has become ready.  This may not always be true
+                    // in the future.
+                    if (!squashed_inst->isReadySrcRegIdx(src_reg_idx) &&
+                        src_reg < numPhysRegs) {
+                        dependGraph[src_reg].remove(squashed_inst);
+                    }
                 }
             }
 
+            // Might want to also clear out the head of the dependency graph.
+
             // Mark it as squashed within the IQ.
             squashed_inst->setSquashedInIQ();
 
-            ReadyEntry temp(squashed_inst);
-
-            squashedInsts.push(temp);
+            squashedInsts.push(squashed_inst);
 
             DPRINTF(IQ, "IQ: Instruction PC %#x squashed.\n",
                     squashed_inst->readPC());
         }
-        squashIt--;
-    }
-}
-
-template<class Impl>
-void
-InstructionQueue<Impl>::squash()
-{
-    DPRINTF(IQ, "IQ: Starting to squash instructions in the IQ.\n");
 
-    // Read instruction sequence number of last instruction out of the
-    // time buffer.
-    squashedSeqNum = fromCommit->commitInfo.doneSeqNum;
-
-    // Setup the squash iterator to point to the tail.
-    squashIt = tail;
+        if (squashed_inst->isNonSpeculative() || squashed_inst->isStore()) {
+            nonSpecInsts.erase(squashed_inst->seqNum);
+        }
 
-    // Call doSquash.
-    doSquash();
+        --squashIt;
+    }
 }
 
-template<class Impl>
+template <class Impl>
 void
 InstructionQueue<Impl>::stopSquash()
 {
@@ -448,36 +609,9 @@ InstructionQueue<Impl>::stopSquash()
     squashIt = cpu->instList.end();
 }
 
-template<class Impl>
-int
-InstructionQueue<Impl>::countInsts()
-{
-    ListIt count_it = cpu->instList.begin();
-    int total_insts = 0;
-
-    while (count_it != tail) {
-        if (!(*count_it)->isIssued()) {
-            ++total_insts;
-        }
-
-        count_it++;
-
-        assert(count_it != cpu->instList.end());
-    }
-
-    // Need to count the tail iterator as well.
-    if (count_it != cpu->instList.end() &&
-        (*count_it) != NULL &&
-        !(*count_it)->isIssued()) {
-        ++total_insts;
-    }
-
-    return total_insts;
-}
-
-template<class Impl>
+template <class Impl>
 void
-InstructionQueue<Impl>::wakeDependents(DynInst *completed_inst)
+InstructionQueue<Impl>::wakeDependents(DynInstPtr &completed_inst)
 {
     DPRINTF(IQ, "IQ: Waking dependents of completed instruction.\n");
     //Look at the physical destination register of the DynInst
@@ -487,6 +621,13 @@ InstructionQueue<Impl>::wakeDependents(DynInst *completed_inst)
 
     DependencyEntry *curr;
 
+    // Tell the memory dependence unit to wake any dependents on this
+    // instruction if it is a memory instruction.
+
+    if (completed_inst->isMemRef()) {
+        memDepUnit.wakeDependents(completed_inst);
+    }
+
     for (int dest_reg_idx = 0;
          dest_reg_idx < total_dest_regs;
          dest_reg_idx++)
@@ -507,7 +648,7 @@ InstructionQueue<Impl>::wakeDependents(DynInst *completed_inst)
         //Maybe abstract this part into a function.
         //Go through the dependency chain, marking the registers as ready
         //within the waiting instructions.
-        while (dependGraph[dest_reg].next != NULL) {
+        while (dependGraph[dest_reg].next) {
 
             curr = dependGraph[dest_reg].next;
 
@@ -537,9 +678,9 @@ InstructionQueue<Impl>::wakeDependents(DynInst *completed_inst)
     }
 }
 
-template<class Impl>
+template <class Impl>
 bool
-InstructionQueue<Impl>::addToDependents(DynInst *new_inst)
+InstructionQueue<Impl>::addToDependents(DynInstPtr &new_inst)
 {
     // Loop through the instruction's source registers, adding
     // them to the dependency list if they are not ready.
@@ -558,7 +699,9 @@ InstructionQueue<Impl>::addToDependents(DynInst *new_inst)
             // hasn't become ready while the instruction was in flight
             // between stages.  Only if it really isn't ready should
             // it be added to the dependency graph.
-            if (regScoreboard[src_reg] == false) {
+            if (src_reg >= numPhysRegs) {
+                continue;
+            } else if (regScoreboard[src_reg] == false) {
                 DPRINTF(IQ, "IQ: Instruction PC %#x has src reg %i that "
                         "is being added to the dependency chain.\n",
                         new_inst->readPC(), src_reg);
@@ -581,9 +724,9 @@ InstructionQueue<Impl>::addToDependents(DynInst *new_inst)
     return return_val;
 }
 
-template<class Impl>
+template <class Impl>
 void
-InstructionQueue<Impl>::createDependency(DynInst *new_inst)
+InstructionQueue<Impl>::createDependency(DynInstPtr &new_inst)
 {
     //Actually nothing really needs to be marked when an
     //instruction becomes the producer of a register's value,
@@ -595,20 +738,32 @@ InstructionQueue<Impl>::createDependency(DynInst *new_inst)
          dest_reg_idx < total_dest_regs;
          dest_reg_idx++)
     {
-        int8_t dest_reg = new_inst->renamedDestRegIdx(dest_reg_idx);
-        dependGraph[dest_reg].inst = new_inst;
-        if (dependGraph[dest_reg].next != NULL) {
-            panic("Dependency chain is not empty.\n");
+        PhysRegIndex dest_reg = new_inst->renamedDestRegIdx(dest_reg_idx);
+
+        // Instructions that use the misc regs will have a reg number
+        // higher than the normal physical registers.  In this case these
+        // registers are not renamed, and there is no need to track
+        // dependencies as these instructions must be executed at commit.
+        if (dest_reg >= numPhysRegs) {
+            continue;
         }
 
+        dependGraph[dest_reg].inst = new_inst;
+#if 0
+        if (dependGraph[dest_reg].next) {
+            panic("Dependency chain of dest reg %i is not empty.\n",
+                  dest_reg);
+        }
+#endif
+        assert(!dependGraph[dest_reg].next);
         // Mark the scoreboard to say it's not yet ready.
         regScoreboard[dest_reg] = false;
     }
 }
 
-template<class Impl>
+template <class Impl>
 void
-InstructionQueue<Impl>::DependencyEntry::insert(DynInst *new_inst)
+InstructionQueue<Impl>::DependencyEntry::insert(DynInstPtr &new_inst)
 {
     //Add this new, dependent instruction at the head of the dependency
     //chain.
@@ -623,9 +778,9 @@ InstructionQueue<Impl>::DependencyEntry::insert(DynInst *new_inst)
     this->next = new_entry;
 }
 
-template<class Impl>
+template <class Impl>
 void
-InstructionQueue<Impl>::DependencyEntry::remove(DynInst *inst_to_remove)
+InstructionQueue<Impl>::DependencyEntry::remove(DynInstPtr &inst_to_remove)
 {
     DependencyEntry *prev = this;
     DependencyEntry *curr = this->next;
@@ -643,6 +798,8 @@ InstructionQueue<Impl>::DependencyEntry::remove(DynInst *inst_to_remove)
     {
         prev = curr;
         curr = curr->next;
+
+        assert(curr != NULL);
     }
 
     // Now remove this instruction from the list.
@@ -651,34 +808,140 @@ InstructionQueue<Impl>::DependencyEntry::remove(DynInst *inst_to_remove)
     delete curr;
 }
 
-template<class Impl>
+template <class Impl>
+void
+InstructionQueue<Impl>::dumpDependGraph()
+{
+    // Debug aid: print one line per physical register, giving the PC of
+    // the producer held in the head entry (if any) followed by the PC of
+    // every entry chained behind it.
+    for (int reg = 0; reg < numPhysRegs; ++reg)
+    {
+        DependencyEntry *head = &dependGraph[reg];
+
+        if (head->inst) {
+            cprintf("dependGraph[%i]: producer: %#x consumer: ", reg,
+                    head->inst->readPC());
+        } else {
+            cprintf("dependGraph[%i]: No producer. consumer: ", reg);
+        }
+
+        // Entries after the head are printed as the consumers.
+        for (DependencyEntry *dep = head->next; dep; dep = dep->next) {
+            cprintf("%#x ", dep->inst->readPC());
+        }
+
+        cprintf("\n");
+    }
+}
+
+template <class Impl>
 void
-InstructionQueue<Impl>::addIfReady(DynInst *inst)
+InstructionQueue<Impl>::addIfReady(DynInstPtr &inst)
 {
     //If the instruction now has all of its source registers
     // available, then add it to the list of ready instructions.
     if (inst->readyToIssue()) {
-        ReadyEntry to_add(inst);
+
         //Add the instruction to the proper ready list.
-        if (inst->isInteger()) {
+        if (inst->isControl()) {
+
+            DPRINTF(IQ, "IQ: Branch instruction is ready to issue, "
+                    "putting it onto the ready list, PC %#x.\n",
+                    inst->readPC());
+            readyBranchInsts.push(inst);
+
+        } else if (inst->isMemRef()) {
+
+            DPRINTF(IQ, "IQ: Checking if memory instruction can issue.\n");
+
+            if (memDepUnit.readyToIssue(inst)) {
+                DPRINTF(IQ, "IQ: Memory instruction is ready to issue, "
+                        "putting it onto the ready list, PC %#x.\n",
+                        inst->readPC());
+                readyMemInsts.push(inst);
+            }
+
+        } else if (inst->isInteger()) {
+
             DPRINTF(IQ, "IQ: Integer instruction is ready to issue, "
                     "putting it onto the ready list, PC %#x.\n",
                     inst->readPC());
-            readyIntInsts.push(to_add);
+            readyIntInsts.push(inst);
+
         } else if (inst->isFloating()) {
+
             DPRINTF(IQ, "IQ: Floating instruction is ready to issue, "
                     "putting it onto the ready list, PC %#x.\n",
                     inst->readPC());
-            readyFloatInsts.push(to_add);
-        } else if (inst->isControl()) {
-            DPRINTF(IQ, "IQ: Branch instruction is ready to issue, "
-                    "putting it onto the ready list, PC %#x.\n",
-                    inst->readPC());
-            readyBranchInsts.push(to_add);
+            readyFloatInsts.push(inst);
+
         } else {
-            panic("IQ: Instruction not an expected type.\n");
+            DPRINTF(IQ, "IQ: Miscellaneous instruction is ready to issue, "
+                    "putting it onto the ready list, PC %#x..\n",
+                    inst->readPC());
+
+            readyMiscInsts.push(inst);
         }
     }
 }
 
+template <class Impl>
+int
+InstructionQueue<Impl>::countInsts()
+{
+    int unissued = 0;
+    ListIt walker = cpu->instList.begin();
+
+    // Tally every not-yet-issued instruction in [begin, tail).
+    while (walker != tail) {
+        unissued += (*walker)->isIssued() ? 0 : 1;
+
+        // The walk has to arrive at tail before it runs off the end of
+        // the CPU's instruction list.
+        ++walker;
+        assert(walker != cpu->instList.end());
+    }
+
+    // The range above is half open; check the tail entry separately.
+    const bool tail_valid = walker != cpu->instList.end() && (*walker);
+
+    if (tail_valid && !(*walker)->isIssued()) {
+        ++unissued;
+    }
+
+    return unissued;
+}
+
+template <class Impl>
+void
+InstructionQueue<Impl>::dumpLists()
+{
+    // Debug aid: report the size of every list the IQ maintains.
+    // First the ready lists, in the order int, float, branch, memory,
+    // misc.
+    cprintf("Ready integer list size: %i\n", readyIntInsts.size());
+    cprintf("Ready float list size: %i\n", readyFloatInsts.size());
+    cprintf("Ready branch list size: %i\n", readyBranchInsts.size());
+    cprintf("Ready memory list size: %i\n", readyMemInsts.size());
+    cprintf("Ready misc list size: %i\n", readyMiscInsts.size());
+
+    // Instructions that doSquash() marked as squashed within the IQ.
+    cprintf("Squashed list size: %i\n", squashedInsts.size());
+
+    // Non-speculative instructions: the count, then each entry's PC in
+    // the container's iteration order.
+    cprintf("Non speculative list size: %i\n", nonSpecInsts.size());
+    cprintf("Non speculative list: ");
+
+    for (non_spec_it_t entry = nonSpecInsts.begin();
+         entry != nonSpecInsts.end();
+         ++entry)
+    {
+        cprintf("%#x ", entry->second->readPC());
+    }
+
+    cprintf("\n");
+}
+
 #endif // __INST_QUEUE_IMPL_HH__
diff --git a/cpu/beta_cpu/mem_dep_unit.cc b/cpu/beta_cpu/mem_dep_unit.cc
new file mode 100644
index 000000000..3175997f6
--- /dev/null
+++ b/cpu/beta_cpu/mem_dep_unit.cc
@@ -0,0 +1,9 @@
+
+#include "cpu/beta_cpu/alpha_dyn_inst.hh"
+#include "cpu/beta_cpu/alpha_impl.hh"
+#include "cpu/beta_cpu/store_set.hh"
+#include "cpu/beta_cpu/mem_dep_unit_impl.hh"
+
+// Force instantiation of the memory dependence unit using store sets and
+// AlphaSimpleImpl.  An explicit instantiation requires the class-key.
+template class MemDepUnit<StoreSet, AlphaSimpleImpl>;
diff --git a/cpu/beta_cpu/mem_dep_unit.hh b/cpu/beta_cpu/mem_dep_unit.hh
new file mode 100644
index 000000000..4821c63b7
--- /dev/null
+++ b/cpu/beta_cpu/mem_dep_unit.hh
@@ -0,0 +1,70 @@
+
+#ifndef __MEM_DEP_UNIT_HH__
+#define __MEM_DEP_UNIT_HH__
+#include <map>
+#include <set>
+#include <vector>
+
+#include "cpu/inst_seq.hh"
+
+/**
+ * Memory dependency unit class.  This holds the memory dependence predictor.
+ * As memory operations are issued to the IQ, they are also issued to this
+ * unit, which then looks up the prediction as to what they are dependent
+ * upon.  This unit must be checked prior to a memory operation being able
+ * to issue.  Although this is templated, it's somewhat hard to make a generic
+ * memory dependence unit.  This one is mostly for store sets; it will be
+ * quite limited in what other memory dependence predictions it can also
+ * utilize.  Thus this class should most likely be rewritten for other
+ * dependence prediction schemes.
+ */
+template <class MemDepPred, class Impl>
+class MemDepUnit {
+  public:
+    typedef typename Impl::Params Params;
+    typedef typename Impl::DynInstPtr DynInstPtr;
+
+  public:
+    typedef typename std::set<InstSeqNum>::iterator sn_it_t;
+    typedef typename std::map<InstSeqNum, std::vector<InstSeqNum> >::iterator
+    dep_it_t;
+
+  public:
+    /** Builds the predictor; its sizes are hard-coded, params is unused. */
+    MemDepUnit(Params &params);
+    /** Tracks a new load or store along with its predicted dependence. */
+    void insert(DynInstPtr &inst);
+    /** Returns true if inst is on the list of dependence-free insts. */
+    bool readyToIssue(DynInstPtr &inst);
+    /** Takes an issuing instruction off of the ready list. */
+    void issue(DynInstPtr &inst);
+    /** Marks the instructions waiting on a completed store as ready. */
+    void wakeDependents(DynInstPtr &inst);
+    /** Discards every tracked entry younger than squashed_num. */
+    void squash(const InstSeqNum &squashed_num);
+    /** Reports a violating store/load pair to the predictor. */
+    void violation(DynInstPtr &store_inst, DynInstPtr &violating_load);
+
+  private:
+    /** List of instructions that have passed through rename, yet are still
+     *  waiting on a memory dependence to resolve before they can issue.
+     */
+    std::set<InstSeqNum> renamedInsts;
+
+    /** List of instructions that have all their predicted memory dependences
+     *  resolved.  They are ready in terms of being free of memory
+     *  dependences; however they may still have to wait on source registers.
+     */
+    std::set<InstSeqNum> readyInsts;
+
+    /** For each tracked store, the instructions predicted to wait on it. */
+    std::map<InstSeqNum, std::vector<InstSeqNum> > dependencies;
+
+    /** The memory dependence predictor.  It is accessed upon new
+     *  instructions being added to the IQ, and responds by telling this
+     *  unit what instruction the newly added instruction depends upon. */
+    MemDepPred depPred;
+
+};
+
+#endif
diff --git a/cpu/beta_cpu/mem_dep_unit_impl.hh b/cpu/beta_cpu/mem_dep_unit_impl.hh
new file mode 100644
index 000000000..4299acb7a
--- /dev/null
+++ b/cpu/beta_cpu/mem_dep_unit_impl.hh
@@ -0,0 +1,166 @@
+
+#include <map>
+
+#include "cpu/beta_cpu/mem_dep_unit.hh"
+
+// Hack: the predictor's two size arguments are hardcoded; params is unused.
+template <class MemDepPred, class Impl>
+MemDepUnit<MemDepPred, Impl>::MemDepUnit(Params &params)
+    : depPred(4028, 128)
+{
+    DPRINTF(MemDepUnit, "MemDepUnit: Creating MemDepUnit object.\n");
+}
+
+template <class MemDepPred, class Impl>
+void
+MemDepUnit<MemDepPred, Impl>::insert(DynInstPtr &inst)
+{
+    InstSeqNum inst_seq_num = inst->seqNum;
+
+    // Predicted producing store for this PC; 0 is treated below as "none".
+    InstSeqNum producing_store = depPred.checkInst(inst->readPC());
+
+    if (producing_store == 0 ||
+        dependencies.find(producing_store) == dependencies.end()) {
+        readyInsts.insert(inst_seq_num);
+    } else {
+        // The predicted store is still being tracked: hold this
+        // instruction on the renamed list and queue it behind that store.
+        renamedInsts.insert(inst_seq_num);
+
+        dependencies[producing_store].push_back(inst_seq_num);
+    }
+
+    if (inst->isStore()) {
+        depPred.insertStore(inst->readPC(), inst_seq_num);
+
+        // Make sure this store isn't already in this list.
+        assert(dependencies.find(inst_seq_num) == dependencies.end());
+
+        // Put a dependency entry in at the store's sequence number.
+        // std::map::operator[] default-constructs the mapped vector for a
+        // new key, so this creates the store's (empty) dependents list.
+        dependencies[inst_seq_num];
+    } else if (!inst->isLoad()) {
+        panic("MemDepUnit: Unknown type! (most likely a barrier).");
+    }
+}
+
+template <class MemDepPred, class Impl>
+bool
+MemDepUnit<MemDepPred, Impl>::readyToIssue(DynInstPtr &inst)
+{
+    // An instruction counts as free of predicted memory dependences
+    // exactly when its sequence number is present in the ready set;
+    // insert() and wakeDependents() are what put it there.
+    const bool dependence_free =
+        readyInsts.find(inst->seqNum) != readyInsts.end();
+
+    return dependence_free;
+}
+
+template <class MemDepPred, class Impl>
+void
+MemDepUnit<MemDepPred, Impl>::issue(DynInstPtr &inst)
+{
+    const InstSeqNum issuing_seq_num = inst->seqNum;
+    // Only an instruction currently on the ready list may issue; drop it.
+    assert(readyInsts.find(issuing_seq_num) != readyInsts.end());
+    readyInsts.erase(issuing_seq_num);
+}
+
+template <class MemDepPred, class Impl>
+void
+MemDepUnit<MemDepPred, Impl>::wakeDependents(DynInstPtr &inst)
+{
+    // Entries are keyed by sequence number, so look up inst's own number.
+    dep_it_t dep_it = dependencies.find(inst->seqNum);
+
+    // If there's no entry, then return.  Really there should only be
+    // no entry if the instruction is a load.
+    if (dep_it == dependencies.end()) {
+        return;
+    }
+
+    assert(inst->isStore());
+
+    for (unsigned i = 0; i < dep_it->second.size(); ++i) {
+        InstSeqNum woken_inst = dep_it->second[i];
+
+        // Should we have reached instructions that are actually squashed,
+        // there will be no more useful instructions in this dependency
+        // list.  Break out early.
+        if (renamedInsts.find(woken_inst) == renamedInsts.end()) {
+            DPRINTF(MemDepUnit, "MemDepUnit: Dependents on inst PC %#x "
+                    "are squashed, starting at SN %i.  Breaking early.\n",
+                    inst->readPC(), woken_inst);
+            break;
+        }
+
+        // Remove it from the renamed instructions.
+        renamedInsts.erase(woken_inst);
+
+        // Add it to the ready list.
+        readyInsts.insert(woken_inst);
+    }
+    // This store's list of dependents has been processed; drop the entry.
+    dependencies.erase(dep_it);
+}
+
+// Removes every tracked sequence number greater than squashed_num from the
+// renamed, ready and dependency containers, then squashes the predictor.
+template <class MemDepPred, class Impl>
+void
+MemDepUnit<MemDepPred, Impl>::squash(const InstSeqNum &squashed_num)
+{
+    // All three containers are ordered by sequence number, so the youngest
+    // entry is always the last one.  Each loop re-derives that last element
+    // from end() on every pass; stepping an iterator backwards past the
+    // erased element would decrement begin() once the container is about
+    // to become empty, which is undefined.
+
+    // Drop waiting instructions younger than the squash point.
+    while (!renamedInsts.empty()) {
+        sn_it_t youngest = --renamedInsts.end();
+
+        if ((*youngest) <= squashed_num) {
+            break;
+        }
+
+        renamedInsts.erase(youngest);
+    }
+
+    // Same for the ready list.
+    while (!readyInsts.empty()) {
+        sn_it_t youngest = --readyInsts.end();
+
+        if ((*youngest) <= squashed_num) {
+            break;
+        }
+
+        readyInsts.erase(youngest);
+    }
+
+    // Same for the dependencies list, which is keyed by store.
+    while (!dependencies.empty()) {
+        dep_it_t youngest = --dependencies.end();
+
+        if ((*youngest).first <= squashed_num) {
+            break;
+        }
+
+        dependencies.erase(youngest);
+    }
+
+    // Tell the dependency predictor to squash as well.
+    depPred.squash(squashed_num);
+}
+
+template <class MemDepPred, class Impl>
+void
+MemDepUnit<MemDepPred, Impl>::violation(DynInstPtr &store_inst,
+                                        DynInstPtr &violating_load)
+{
+    // Hand both PCs to the predictor; the load's PC is passed first.
+    depPred.violation(violating_load->readPC(), store_inst->readPC());
+}
diff --git a/cpu/beta_cpu/regfile.hh b/cpu/beta_cpu/regfile.hh
index 21e0ce218..aba897fdc 100644
--- a/cpu/beta_cpu/regfile.hh
+++ b/cpu/beta_cpu/regfile.hh
@@ -13,11 +13,11 @@ using namespace std;
 // Things that are in the ifdef FULL_SYSTEM are pretty dependent on the ISA,
 // and should go in the AlphaFullCPU.
 
-template<class Impl>
+template <class Impl>
 class PhysRegFile
 {
     //Note that most of the definitions of the IntReg, FloatReg, etc. exist
-    //within the Impl class and not within this PhysRegFile class.
+    //within the Impl/ISA class and not within this PhysRegFile class.
 
     //Will need some way to allow stuff like swap_palshadow to access the
     //correct registers.  Might require code changes to swap_palshadow and
@@ -42,6 +42,8 @@ class PhysRegFile
 
     uint64_t readIntReg(PhysRegIndex reg_idx)
     {
+        assert(reg_idx < numPhysicalIntRegs);
+
         DPRINTF(IEW, "RegFile: Access to int register %i, has data "
                 "%i\n", int(reg_idx), intRegFile[reg_idx]);
         return intRegFile[reg_idx];
@@ -52,8 +54,10 @@ class PhysRegFile
         // Remove the base Float reg dependency.
         reg_idx = reg_idx - numPhysicalIntRegs;
 
-        DPRINTF(IEW, "RegFile: Access to float register %i, has data "
-                "%f\n", int(reg_idx), (float)floatRegFile[reg_idx].d);
+        assert(reg_idx < numPhysicalFloatRegs);
+
+        DPRINTF(IEW, "RegFile: Access to float register %i as single, has "
+                "data %8.8f\n", int(reg_idx), (float)floatRegFile[reg_idx].d);
 
         return (float)floatRegFile[reg_idx].d;
     }
@@ -63,8 +67,10 @@ class PhysRegFile
         // Remove the base Float reg dependency.
         reg_idx = reg_idx - numPhysicalIntRegs;
 
-        DPRINTF(IEW, "RegFile: Access to float register %i, has data "
-                "%f\n", int(reg_idx), floatRegFile[reg_idx].d);
+        assert(reg_idx < numPhysicalFloatRegs);
+
+        DPRINTF(IEW, "RegFile: Access to float register %i as double, has "
+                " data %8.8f\n", int(reg_idx), floatRegFile[reg_idx].d);
 
         return floatRegFile[reg_idx].d;
     }
@@ -74,14 +80,18 @@ class PhysRegFile
         // Remove the base Float reg dependency.
         reg_idx = reg_idx - numPhysicalIntRegs;
 
-        DPRINTF(IEW, "RegFile: Access to float register %i, has data "
-                "%f\n", int(reg_idx), floatRegFile[reg_idx].q);
+        assert(reg_idx < numPhysicalFloatRegs);
+
+        DPRINTF(IEW, "RegFile: Access to float register %i as int, has data "
+                "%lli\n", int(reg_idx), floatRegFile[reg_idx].q);
 
         return floatRegFile[reg_idx].q;
     }
 
     void setIntReg(PhysRegIndex reg_idx, uint64_t val)
     {
+        assert(reg_idx < numPhysicalIntRegs);
+
         DPRINTF(IEW, "RegFile: Setting int register %i to %lli\n",
                 int(reg_idx), val);
 
@@ -93,7 +103,9 @@ class PhysRegFile
         // Remove the base Float reg dependency.
         reg_idx = reg_idx - numPhysicalIntRegs;
 
-        DPRINTF(IEW, "RegFile: Setting float register %i to %f\n",
+        assert(reg_idx < numPhysicalFloatRegs);
+
+        DPRINTF(IEW, "RegFile: Setting float register %i to %8.8f\n",
                 int(reg_idx), val);
 
         floatRegFile[reg_idx].d = (double)val;
@@ -104,7 +116,9 @@ class PhysRegFile
         // Remove the base Float reg dependency.
         reg_idx = reg_idx - numPhysicalIntRegs;
 
-        DPRINTF(IEW, "RegFile: Setting float register %i to %f\n",
+        assert(reg_idx < numPhysicalFloatRegs);
+
+        DPRINTF(IEW, "RegFile: Setting float register %i to %8.8f\n",
                 int(reg_idx), val);
 
         floatRegFile[reg_idx].d = val;
@@ -115,6 +129,8 @@ class PhysRegFile
         // Remove the base Float reg dependency.
         reg_idx = reg_idx - numPhysicalIntRegs;
 
+        assert(reg_idx < numPhysicalFloatRegs);
+
         DPRINTF(IEW, "RegFile: Setting float register %i to %lli\n",
                 int(reg_idx), val);
 
@@ -185,7 +201,7 @@ class PhysRegFile
     unsigned numPhysicalFloatRegs;
 };
 
-template<class Impl>
+template <class Impl>
 PhysRegFile<Impl>::PhysRegFile(unsigned _numPhysicalIntRegs,
                                unsigned _numPhysicalFloatRegs)
     : numPhysicalIntRegs(_numPhysicalIntRegs),
@@ -203,7 +219,7 @@ PhysRegFile<Impl>::PhysRegFile(unsigned _numPhysicalIntRegs,
 //Problem:  This code doesn't make sense at the RegFile level because it
 //needs things such as the itb and dtb.  Either put it at the CPU level or
 //the DynInst level.
-template<class Impl>
+template <class Impl>
 uint64_t
 PhysRegFile<Impl>::readIpr(int idx, Fault &fault)
 {
@@ -319,7 +335,7 @@ PhysRegFile<Impl>::readIpr(int idx, Fault &fault)
 int break_ipl = -1;
 #endif
 
-template<class Impl>
+template <class Impl>
 Fault
 PhysRegFile<Impl>::setIpr(int idx, uint64_t val)
 {
diff --git a/cpu/beta_cpu/rename.hh b/cpu/beta_cpu/rename.hh
index cd66ce686..9f031012a 100644
--- a/cpu/beta_cpu/rename.hh
+++ b/cpu/beta_cpu/rename.hh
@@ -1,25 +1,14 @@
 // Todo:
-// Figure out rename map for reg vs fp (probably just have one rename map).
-// In simple case, there is no renaming, so have this stage do basically
-// nothing.
-// Fix up trap and barrier handling.  Fix up squashing too, as it's too
-// dependent upon the iew stage continually telling it to squash.
-// Have commit send back information whenever a branch has committed.  This
-// way the history buffer can be cleared beyond the point where the branch
-// was.
+// Fix up trap and barrier handling.
+// May want to have different statuses to differentiate the different stall
+// conditions.
 
 #ifndef __SIMPLE_RENAME_HH__
 #define __SIMPLE_RENAME_HH__
 
-//Will want to include: time buffer, structs, free list, rename map
 #include <list>
 
 #include "base/timebuf.hh"
-#include "cpu/beta_cpu/comm.hh"
-#include "cpu/beta_cpu/rename_map.hh"
-#include "cpu/beta_cpu/free_list.hh"
-
-using namespace std;
 
 // Will need rename maps for both the int reg file and fp reg file.
 // Or change rename map class to handle both. (RegFile handles both.)
@@ -30,14 +19,14 @@ class SimpleRename
     // Typedefs from the Impl.
     typedef typename Impl::ISA ISA;
     typedef typename Impl::CPUPol CPUPol;
-    typedef typename Impl::DynInst DynInst;
+    typedef typename Impl::DynInstPtr DynInstPtr;
     typedef typename Impl::FullCPU FullCPU;
     typedef typename Impl::Params Params;
 
-    typedef typename Impl::FetchStruct FetchStruct;
-    typedef typename Impl::DecodeStruct DecodeStruct;
-    typedef typename Impl::RenameStruct RenameStruct;
-    typedef typename Impl::TimeStruct TimeStruct;
+    typedef typename CPUPol::FetchStruct FetchStruct;
+    typedef typename CPUPol::DecodeStruct DecodeStruct;
+    typedef typename CPUPol::RenameStruct RenameStruct;
+    typedef typename CPUPol::TimeStruct TimeStruct;
 
     // Typedefs from the CPUPol
     typedef typename CPUPol::FreeList FreeList;
@@ -94,6 +83,14 @@ class SimpleRename
 
     void removeFromHistory(InstSeqNum inst_seq_num);
 
+    inline void renameSrcRegs(DynInstPtr &inst);
+
+    inline void renameDestRegs(DynInstPtr &inst);
+
+    inline int calcFreeROBEntries();
+
+    inline int calcFreeIQEntries();
+
     /** Holds the previous information for each rename.
      *  Note that often times the inst may have been deleted, so only access
      *  the pointer for the address and do not dereference it.
@@ -123,7 +120,7 @@ class SimpleRename
         bool placeHolder;
     };
 
-    list<RenameHistory> historyBuffer;
+    std::list<RenameHistory> historyBuffer;
 
     /** CPU interface. */
     FullCPU *cpu;
@@ -155,7 +152,7 @@ class SimpleRename
     typename TimeBuffer<DecodeStruct>::wire fromDecode;
 
     /** Skid buffer between rename and decode. */
-    queue<DecodeStruct> skidBuffer;
+    std::queue<DecodeStruct> skidBuffer;
 
     /** Rename map interface. */
     SimpleRenameMap *renameMap;
@@ -179,6 +176,12 @@ class SimpleRename
      *  instructions might have freed registers in the previous cycle.
      */
     unsigned commitWidth;
+
+    /** The instruction that rename is currently on.  It needs to have
+     *  persistent state so that when a stall occurs in the middle of a
+     *  group of instructions, it can restart at the proper instruction.
+     */
+    unsigned numInst;
 };
 
 #endif // __SIMPLE_RENAME_HH__
diff --git a/cpu/beta_cpu/rename_impl.hh b/cpu/beta_cpu/rename_impl.hh
index 2b60c2f50..47464d961 100644
--- a/cpu/beta_cpu/rename_impl.hh
+++ b/cpu/beta_cpu/rename_impl.hh
@@ -2,18 +2,19 @@
 
 #include "cpu/beta_cpu/rename.hh"
 
-template<class Impl>
+template <class Impl>
 SimpleRename<Impl>::SimpleRename(Params &params)
     : iewToRenameDelay(params.iewToRenameDelay),
       decodeToRenameDelay(params.decodeToRenameDelay),
       commitToRenameDelay(params.commitToRenameDelay),
       renameWidth(params.renameWidth),
-      commitWidth(params.commitWidth)
+      commitWidth(params.commitWidth),
+      numInst(0)
 {
     _status = Idle;
 }
 
-template<class Impl>
+template <class Impl>
 void
 SimpleRename<Impl>::setCPU(FullCPU *cpu_ptr)
 {
@@ -21,7 +22,7 @@ SimpleRename<Impl>::setCPU(FullCPU *cpu_ptr)
     cpu = cpu_ptr;
 }
 
-template<class Impl>
+template <class Impl>
 void
 SimpleRename<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr)
 {
@@ -38,7 +39,7 @@ SimpleRename<Impl>::setTimeBuffer(TimeBuffer<TimeStruct> *tb_ptr)
     toDecode = timeBuffer->getWire(0);
 }
 
-template<class Impl>
+template <class Impl>
 void
 SimpleRename<Impl>::setRenameQueue(TimeBuffer<RenameStruct> *rq_ptr)
 {
@@ -49,7 +50,7 @@ SimpleRename<Impl>::setRenameQueue(TimeBuffer<RenameStruct> *rq_ptr)
     toIEW = renameQueue->getWire(0);
 }
 
-template<class Impl>
+template <class Impl>
 void
 SimpleRename<Impl>::setDecodeQueue(TimeBuffer<DecodeStruct> *dq_ptr)
 {
@@ -61,7 +62,7 @@ SimpleRename<Impl>::setDecodeQueue(TimeBuffer<DecodeStruct> *dq_ptr)
 
 }
 
-template<class Impl>
+template <class Impl>
 void
 SimpleRename<Impl>::setRenameMap(RenameMap *rm_ptr)
 {
@@ -69,7 +70,7 @@ SimpleRename<Impl>::setRenameMap(RenameMap *rm_ptr)
     renameMap = rm_ptr;
 }
 
-template<class Impl>
+template <class Impl>
 void
 SimpleRename<Impl>::setFreeList(FreeList *fl_ptr)
 {
@@ -77,7 +78,7 @@ SimpleRename<Impl>::setFreeList(FreeList *fl_ptr)
     freeList = fl_ptr;
 }
 
-template<class Impl>
+template <class Impl>
 void
 SimpleRename<Impl>::dumpHistory()
 {
@@ -93,7 +94,7 @@ SimpleRename<Impl>::dumpHistory()
     }
 }
 
-template<class Impl>
+template <class Impl>
 void
 SimpleRename<Impl>::block()
 {
@@ -110,12 +111,12 @@ SimpleRename<Impl>::block()
     // the previous stages are expected to check all possible stall signals.
 }
 
-template<class Impl>
+template <class Impl>
 inline void
 SimpleRename<Impl>::unblock()
 {
-    DPRINTF(Rename, "Rename: Reading instructions out of skid "
-            "buffer.\n");
+    DPRINTF(Rename, "Rename: Read instructions out of skid buffer this "
+            "cycle.\n");
     // Remove the now processed instructions from the skid buffer.
     skidBuffer.pop();
 
@@ -130,12 +131,12 @@ SimpleRename<Impl>::unblock()
     }
 }
 
-template<class Impl>
+template <class Impl>
 void
 SimpleRename<Impl>::doSquash()
 {
     typename list<RenameHistory>::iterator hb_it = historyBuffer.begin();
-    typename list<RenameHistory>::iterator delete_it;
+//    typename list<RenameHistory>::iterator delete_it;
 
     InstSeqNum squashed_seq_num = fromCommit->commitInfo.doneSeqNum;
 
@@ -166,15 +167,17 @@ SimpleRename<Impl>::doSquash()
             freeList->addReg(hb_it->newPhysReg);
         }
 
-        delete_it = hb_it;
+//        delete_it = hb_it;
+
+//        hb_it++;
 
-        hb_it++;
+        historyBuffer.erase(hb_it++);
 
-        historyBuffer.erase(delete_it);
+        assert(hb_it != historyBuffer.end());
     }
 }
 
-template<class Impl>
+template <class Impl>
 void
 SimpleRename<Impl>::squash()
 {
@@ -182,6 +185,8 @@ SimpleRename<Impl>::squash()
     // Set the status to Squashing.
     _status = Squashing;
 
+    numInst = 0;
+
     // Clear the skid buffer in case it has any data in it.
     while (!skidBuffer.empty())
     {
@@ -199,10 +204,10 @@ void
 SimpleRename<Impl>::removeFromHistory(InstSeqNum inst_seq_num)
 {
     DPRINTF(Rename, "Rename: Removing a committed instruction from the "
-            "history buffer, sequence number %lli.\n", inst_seq_num);
+            "history buffer, until sequence number %lli.\n", inst_seq_num);
     typename list<RenameHistory>::iterator hb_it = historyBuffer.end();
 
-    hb_it--;
+    --hb_it;
 
     if (hb_it->instSeqNum > inst_seq_num) {
         DPRINTF(Rename, "Rename: Old sequence number encountered.  Ensure "
@@ -210,7 +215,7 @@ SimpleRename<Impl>::removeFromHistory(InstSeqNum inst_seq_num)
         return;
     }
 
-    for ( ; hb_it->instSeqNum != inst_seq_num; hb_it--)
+    while ((*hb_it).instSeqNum != inst_seq_num)
     {
         // Make sure we haven't gone off the end of the list.
         assert(hb_it != historyBuffer.end());
@@ -222,10 +227,19 @@ SimpleRename<Impl>::removeFromHistory(InstSeqNum inst_seq_num)
         // be the last instruction in the list, as it is the instruction
         // that was just committed that is being removed.
         assert(hb_it->instSeqNum < inst_seq_num);
-        DPRINTF(Rename, "Rename: Committed instruction is not the last "
-                "entry in the history buffer.\n");
+        DPRINTF(Rename, "Rename: Freeing up older rename of reg %i, sequence"
+                " number %i.\n",
+                (*hb_it).prevPhysReg, (*hb_it).instSeqNum);
+
+        if (!(*hb_it).placeHolder) {
+            freeList->addReg((*hb_it).prevPhysReg);
+        }
+
+        historyBuffer.erase(hb_it--);
     }
 
+    // Finally free up the previous register of the committed instruction
+    // itself.
     if (!(*hb_it).placeHolder) {
         freeList->addReg(hb_it->prevPhysReg);
     }
@@ -234,6 +248,113 @@ SimpleRename<Impl>::removeFromHistory(InstSeqNum inst_seq_num)
 
 }
 
+template <class Impl>
+inline void
+SimpleRename<Impl>::renameSrcRegs(DynInstPtr &inst)
+{
+    // Point each source operand of inst at the physical register its
+    // architectural register currently maps to, and flag the operands
+    // whose physical register the rename map already reports as ready.
+    const unsigned num_src_regs = inst->numSrcRegs();
+
+    // Unsigned index so the bound comparison is not mixed-sign.
+    for (unsigned src_idx = 0; src_idx < num_src_regs; ++src_idx)
+    {
+        RegIndex src_reg = inst->srcRegIdx(src_idx);
+
+        // NOTE(review): the mapping is held in a RegIndex; confirm that
+        // type is wide enough for every physical register index.
+        RegIndex renamed_reg = renameMap->lookup(src_reg);
+
+        DPRINTF(Rename, "Rename: Looking up arch reg %i, got "
+                "physical reg %i.\n", (int)src_reg, (int)renamed_reg);
+
+        inst->renameSrcReg(src_idx, renamed_reg);
+
+        // Operands not marked here are left as they were; nothing in
+        // this function records them as pending.
+        if (renameMap->isReady(renamed_reg)) {
+            DPRINTF(Rename, "Rename: Register is ready.\n");
+
+            inst->markSrcRegReady(src_idx);
+        }
+    }
+}
+
+template <class Impl>
+inline void
+SimpleRename<Impl>::renameDestRegs(DynInstPtr &inst)
+{
+    // Assign each destination a new physical register and log it in history.
+    const unsigned num_dest_regs = inst->numDestRegs();
+
+    // Unsigned index so the bound comparison is not mixed-sign.
+    for (unsigned dest_idx = 0; dest_idx < num_dest_regs; ++dest_idx)
+    {
+        RegIndex dest_reg = inst->destRegIdx(dest_idx);
+
+        // Get the physical register that the destination will be
+        // renamed to (.first) and the one it replaces (.second).
+        typename SimpleRenameMap::RenameInfo rename_result =
+            renameMap->rename(dest_reg);
+
+        DPRINTF(Rename, "Rename: Renaming arch reg %i to physical "
+                "reg %i.\n", (int)dest_reg,
+                (int)rename_result.first);
+
+        // Record the rename information so that a history can be kept.
+        RenameHistory hb_entry(inst->seqNum, dest_reg,
+                               rename_result.first,
+                               rename_result.second);
+
+        historyBuffer.push_front(hb_entry);
+
+        DPRINTF(Rename, "Rename: Adding instruction to history buffer, "
+                "sequence number %lli.\n",
+                historyBuffer.front().instSeqNum);
+
+        // Tell the instruction to rename the appropriate destination
+        // register (dest_idx) to the new physical register
+        // (rename_result.first), and record the previous physical
+        // register that the same logical register was renamed to
+        // (rename_result.second).
+        inst->renameDestReg(dest_idx,
+                            rename_result.first,
+                            rename_result.second);
+    }
+
+    // If it's an instruction with no destination registers, then put
+    // a placeholder within the history buffer.  It might be better
+    // to not put it in the history buffer at all (other than branches,
+    // which always need at least a place holder), and differentiate
+    // between instructions with and without destination registers
+    // when getting from commit the instructions that committed.
+    if (num_dest_regs == 0) {
+        RenameHistory hb_entry(inst->seqNum);
+
+        historyBuffer.push_front(hb_entry);
+
+        DPRINTF(Rename, "Rename: Adding placeholder instruction to "
+                "history buffer, sequence number %lli.\n",
+                inst->seqNum);
+    }
+}
+
+template <class Impl>
+inline int
+SimpleRename<Impl>::calcFreeROBEntries()
+{   // Commit's free ROB count less width * delay (presumably in-flight insts).
+    return fromCommit->commitInfo.freeROBEntries -
+        renameWidth * iewToRenameDelay;  // NOTE(review): IEW delay -- intended?
+}
+
+template <class Impl>
+inline int
+SimpleRename<Impl>::calcFreeIQEntries()
+{   // IEW's free IQ count less width * delay (presumably in-flight insts).
+    return fromIEW->iewInfo.freeIQEntries - renameWidth * iewToRenameDelay;
+}
+
 template<class Impl>
 void
 SimpleRename<Impl>::tick()
@@ -258,12 +379,18 @@ SimpleRename<Impl>::tick()
         // buffer were used.  Remove those instructions and handle
         // the rest of unblocking.
         if (_status == Unblocking) {
+            if (fromDecode->size > 0) {
+                // Add the current inputs onto the skid buffer, so they can be
+                // reprocessed when this stage unblocks.
+                skidBuffer.push(*fromDecode);
+            }
+
             unblock();
         }
     } else if (_status == Blocked) {
         // If stage is blocked and still receiving valid instructions,
         // make sure to store them in the skid buffer.
-        if (fromDecode->insts[0] != NULL) {
+        if (fromDecode->size > 0) {
 
             block();
 
@@ -273,8 +400,9 @@ SimpleRename<Impl>::tick()
 
         if (!fromIEW->iewInfo.stall &&
             !fromCommit->commitInfo.stall &&
-            fromCommit->commitInfo.freeROBEntries != 0 &&
-            fromIEW->iewInfo.freeIQEntries != 0) {
+            calcFreeROBEntries() > 0 &&
+            calcFreeIQEntries() > 0 &&
+            renameMap->numFreeEntries() > 0) {
 
             // Need to be sure to check all blocking conditions above.
             // If they have cleared, then start unblocking.
@@ -344,6 +472,7 @@ SimpleRename<Impl>::rename()
     // the rename map and the free list.
     if (fromCommit->commitInfo.squash ||
         fromCommit->commitInfo.robSquashing) {
+        DPRINTF(Rename, "Rename: Receiving signal from Commit to squash.\n");
         squash();
         return;
     }
@@ -368,37 +497,38 @@ SimpleRename<Impl>::rename()
     // Check the decode queue to see if instructions are available.
     // If there are no available instructions to rename, then do nothing.
     // Or, if the stage is currently unblocking, then go ahead and run it.
-    if (fromDecode->insts[0] == NULL && _status != Unblocking) {
+    if (fromDecode->size == 0 && _status != Unblocking) {
         DPRINTF(Rename, "Rename: Nothing to do, breaking out early.\n");
         // Should I change status to idle?
         return;
     }
 
-    DynInst *inst;
-    unsigned num_inst = 0;
+    ////////////////////////////////////
+    // Actual rename part.
+    ////////////////////////////////////
 
-    bool insts_available = _status == Unblocking ?
-        skidBuffer.front().insts[num_inst] != NULL :
-        fromDecode->insts[num_inst] != NULL;
+    DynInstPtr inst;
 
-    typename SimpleRenameMap::RenameInfo rename_result;
+    // If we're unblocking, then we may be in the middle of an instruction
+    // group.  Subtract off numInst to get the proper number of instructions
+    // left.
+    int insts_available = _status == Unblocking ?
+        skidBuffer.front().size - numInst :
+        fromDecode->size;
 
-    unsigned num_src_regs;
-    unsigned num_dest_regs;
+    bool block_this_cycle = false;
 
     // Will have to do a different calculation for the number of free
     // entries.  Number of free entries recorded on this cycle -
     // renameWidth * renameToDecodeDelay
-    // Can I avoid a multiply?
-    unsigned free_rob_entries =
-        fromCommit->commitInfo.freeROBEntries - iewToRenameDelay;
-    DPRINTF(Rename, "Rename: ROB has %d free entries.\n",
-            free_rob_entries);
-    unsigned free_iq_entries =
-        fromIEW->iewInfo.freeIQEntries - iewToRenameDelay;
+    int free_rob_entries = calcFreeROBEntries();
+    int free_iq_entries = calcFreeIQEntries();
+    int min_iq_rob = min(free_rob_entries, free_iq_entries);
+
+    unsigned to_iew_index = 0;
 
     // Check if there's any space left.
-    if (free_rob_entries == 0 || free_iq_entries == 0) {
+    if (min_iq_rob <= 0) {
         DPRINTF(Rename, "Rename: Blocking due to no free ROB or IQ "
                 "entries.\n"
                 "Rename: ROB has %d free entries.\n"
@@ -410,22 +540,40 @@ SimpleRename<Impl>::rename()
         toDecode->renameInfo.stall = true;
 
         return;
-    }
+    } else if (min_iq_rob < insts_available) {
+        DPRINTF(Rename, "Rename: Will have to block this cycle.  Only "
+                "%i insts can be renamed due to IQ/ROB limits.\n",
+                min_iq_rob);
+
+        insts_available = min_iq_rob;
 
-    unsigned min_iq_rob = min(free_rob_entries, free_iq_entries);
-    unsigned num_insts_to_rename = min(min_iq_rob, renameWidth);
+        block_this_cycle = true;
+    }
 
-    while (insts_available &&
-           num_inst < num_insts_to_rename) {
+    while (insts_available > 0) {
         DPRINTF(Rename, "Rename: Sending instructions to iew.\n");
 
         // Get the next instruction either from the skid buffer or the
         // decode queue.
-        inst = _status == Unblocking ? skidBuffer.front().insts[num_inst] :
-               fromDecode->insts[num_inst];
+        inst = _status == Unblocking ? skidBuffer.front().insts[numInst] :
+               fromDecode->insts[numInst];
+
+        if (inst->isSquashed()) {
+            DPRINTF(Rename, "Rename: instruction %i with PC %#x is "
+                    "squashed, skipping.\n",
+                    inst->seqNum, inst->readPC());
+
+            // Go to the next instruction.
+            ++numInst;
+
+            // Decrement how many instructions are available.
+            --insts_available;
+
+            continue;
+        }
 
         DPRINTF(Rename, "Rename: Processing instruction %i with PC %#x.\n",
-                inst, inst->readPC());
+                inst->seqNum, inst->readPC());
 
         // If it's a trap instruction, then it needs to wait here within
         // rename until the ROB is empty.  Needs a way to detect that the
@@ -438,156 +586,59 @@ SimpleRename<Impl>::rename()
             panic("Rename: Serializing instruction encountered.\n");
             DPRINTF(Rename, "Rename: Serializing instruction "
                             "encountered.\n");
-            block();
 
             // Change status over to BarrierStall so that other stages know
             // what this is blocked on.
             _status = BarrierStall;
 
-            // Tell the previous stage to stall.
-            toDecode->renameInfo.stall = true;
-
-            break;
-        }
-
-        // Make sure there's enough room in the ROB and the IQ.
-        // This doesn't really need to be done dynamically; consider
-        // moving outside of this function.
-        if (free_rob_entries == 0 || free_iq_entries == 0) {
-            DPRINTF(Rename, "Rename: Blocking due to lack of ROB or IQ "
-                            "entries.\n");
-            // Call some sort of function to handle all the setup of being
-            // blocked.
-            block();
-
-            // Not really sure how to schedule an event properly, but an
-            // event must be scheduled such that upon freeing a ROB entry,
-            // this stage will restart up.  Perhaps add in a ptr to an Event
-            // within the ROB that will be able to execute that Event
-            // if a free register is added to the freelist.
-
-            // Tell the previous stage to stall.
-            toDecode->renameInfo.stall = true;
+            block_this_cycle = true;
 
             break;
         }
 
-        // Temporary variables to hold number of source and destination regs.
-        num_src_regs = inst->numSrcRegs();
-        num_dest_regs = inst->numDestRegs();
-
         // Check here to make sure there are enough destination registers
         // to rename to.  Otherwise block.
-        if (renameMap->numFreeEntries() < num_dest_regs)
+        if (renameMap->numFreeEntries() < inst->numDestRegs())
         {
             DPRINTF(Rename, "Rename: Blocking due to lack of free "
                             "physical registers to rename to.\n");
-            // Call function to handle blocking.
-            block();
-
             // Need some sort of event based on a register being freed.
 
-            // Tell the previous stage to stall.
-            toDecode->renameInfo.stall = true;
+            block_this_cycle = true;
 
-            // Break out of rename loop.
             break;
         }
 
-        // Get the architectual register numbers from the source and
-        // destination operands, and redirect them to the right register.
-        // Will need to mark dependencies though.
-        for (int src_idx = 0; src_idx < num_src_regs; src_idx++)
-        {
-            RegIndex src_reg = inst->srcRegIdx(src_idx);
-
-            // Look up the source registers to get the phys. register they've
-            // been renamed to, and set the sources to those registers.
-            RegIndex renamed_reg = renameMap->lookup(src_reg);
-
-            DPRINTF(Rename, "Rename: Looking up arch reg %i, got "
-                    "physical reg %i.\n", (int)src_reg, (int)renamed_reg);
-
-            inst->renameSrcReg(src_idx, renamed_reg);
-
-            // Either incorporate it into the info passed back,
-            // or make another function call to see if that register is
-            // ready or not.
-            if (renameMap->isReady(renamed_reg)) {
-                DPRINTF(Rename, "Rename: Register is ready.\n");
-
-                inst->markSrcRegReady(src_idx);
-            }
-        }
-
-        // Rename the destination registers.
-        for (int dest_idx = 0; dest_idx < num_dest_regs; dest_idx++)
-        {
-            RegIndex dest_reg = inst->destRegIdx(dest_idx);
-
-            // Get the physical register that the destination will be
-            // renamed to.
-            rename_result = renameMap->rename(dest_reg);
-
-            DPRINTF(Rename, "Rename: Renaming arch reg %i to physical "
-                    "register %i.\n", (int)dest_reg,
-                    (int)rename_result.first);
-
-            // Record the rename information so that a history can be kept.
-            RenameHistory hb_entry(inst->seqNum, dest_reg,
-                                   rename_result.first,
-                                   rename_result.second);
-
-            historyBuffer.push_front(hb_entry);
-
-            DPRINTF(Rename, "Rename: Adding instruction to history buffer, "
-                    "sequence number %lli.\n", inst->seqNum);
-
-            // Tell the instruction to rename the appropriate destination
-            // register (dest_idx) to the new physical register
-            // (rename_result.first), and record the previous physical
-            // register that the same logical register was renamed to
-            // (rename_result.second).
-            inst->renameDestReg(dest_idx,
-                                rename_result.first,
-                                rename_result.second);
-        }
-
-        // If it's an instruction with no destination registers, then put
-        // a placeholder within the history buffer.  It might be better
-        // to not put it in the history buffer at all (other than branches,
-        // which always need at least a place holder), and differentiate
-        // between instructions with and without destination registers
-        // when getting from commit the instructions that committed.
-        if (num_dest_regs == 0) {
-            RenameHistory hb_entry(inst->seqNum);
-
-            historyBuffer.push_front(hb_entry);
+        renameSrcRegs(inst);
 
-            DPRINTF(Rename, "Rename: Adding placeholder instruction to "
-                    "history buffer, sequence number %lli.\n",
-                    inst->seqNum);
-        }
+        renameDestRegs(inst);
 
         // Put instruction in rename queue.
-        toIEW->insts[num_inst] = inst;
+        toIEW->insts[to_iew_index] = inst;
+        ++(toIEW->size);
 
         // Decrease the number of free ROB and IQ entries.
         --free_rob_entries;
         --free_iq_entries;
 
         // Increment which instruction we're on.
-        ++num_inst;
-
-        // Check whether or not there are instructions available.
-        // Either need to check within the skid buffer, or the decode
-        // queue, depending if this stage is unblocking or not.
-        // Hmm, dangerous check.  Can touch memory not allocated.  Might
-        // be better to just do check at beginning of loop.  Or better
-        // yet actually pass the number of instructions issued.
-        insts_available = _status == Unblocking ?
-                           skidBuffer.front().insts[num_inst] != NULL :
-                           fromDecode->insts[num_inst] != NULL;
+        ++to_iew_index;
+        ++numInst;
+
+        // Decrement how many instructions are available.
+        --insts_available;
     }
 
+    // Check if there are any instructions left that haven't yet been renamed.
+    // If so then block.
+    if (block_this_cycle) {
+        block();
+
+        toDecode->renameInfo.stall = true;
+    } else {
+        // If we had a successful rename and didn't have to exit early, then
+        // reset numInst so it will refer to the correct instruction on next
+        // run.
+        numInst = 0;
+    }
 }
diff --git a/cpu/beta_cpu/rename_map.cc b/cpu/beta_cpu/rename_map.cc
index c234182f0..cb9720d28 100644
--- a/cpu/beta_cpu/rename_map.cc
+++ b/cpu/beta_cpu/rename_map.cc
@@ -3,12 +3,10 @@
 
 // Todo: Consider making functions inline.  Avoid having things that are
 // using the zero register or misc registers from adding on the registers
-// to the free list.
-
-SimpleRenameMap::RenameEntry::RenameEntry()
-    : physical_reg(0), valid(false)
-{
-}
+// to the free list.  Possibly remove the direct communication between
+// this and the freelist.  Consider making inline bool functions that
+// determine if the register is a logical int, logical fp, physical int,
+// physical fp, etc.
 
 SimpleRenameMap::SimpleRenameMap(unsigned _numLogicalIntRegs,
                                  unsigned _numPhysicalIntRegs,
@@ -35,11 +33,12 @@ SimpleRenameMap::SimpleRenameMap(unsigned _numLogicalIntRegs,
 
     //Create the rename maps, and their scoreboards.
     intRenameMap = new RenameEntry[numLogicalIntRegs];
-    floatRenameMap = new RenameEntry[numLogicalFloatRegs];
+    floatRenameMap = new RenameEntry[numLogicalRegs];
 
+    // Should combine this into one scoreboard.
     intScoreboard.resize(numPhysicalIntRegs);
-    floatScoreboard.resize(numPhysicalFloatRegs);
-    miscScoreboard.resize(numMiscRegs);
+    floatScoreboard.resize(numPhysicalRegs);
+    miscScoreboard.resize(numPhysicalRegs + numMiscRegs);
 
     // Initialize the entries in the integer rename map to point to the
     // physical registers of the same index, and consider each register
@@ -59,31 +58,50 @@ SimpleRenameMap::SimpleRenameMap(unsigned _numLogicalIntRegs,
         intScoreboard[index] = 0;
     }
 
+    int float_reg_idx = numPhysicalIntRegs;
+
     // Initialize the entries in the floating point rename map to point to
     // the physical registers of the same index, and consider each register
     // ready until the first rename occurs.
-    for (RegIndex index = 0; index < numLogicalFloatRegs; ++index)
+    // Although the index refers purely to architected registers, because
+    // the floating reg indices come after the integer reg indices, they
+    // may exceed the size of a normal RegIndex (short).
+    for (PhysRegIndex index = numLogicalIntRegs;
+         index < numLogicalRegs; ++index)
+    {
+        floatRenameMap[index].physical_reg = float_reg_idx++;
+    }
+
+    for (RegIndex index = numPhysicalIntRegs;
+         index < numPhysicalIntRegs + numLogicalFloatRegs; ++index)
     {
-        floatRenameMap[index].physical_reg = index + numPhysicalIntRegs;
         floatScoreboard[index] = 1;
     }
 
     // Initialize the rest of the physical registers (the ones that don't
     // directly map to a logical register) as unready.
-    for (PhysRegIndex index = numLogicalFloatRegs;
-         index < numPhysicalFloatRegs;
+    for (PhysRegIndex index = numPhysicalIntRegs + numLogicalFloatRegs;
+         index < numPhysicalRegs;
          ++index)
     {
         floatScoreboard[index] = 0;
     }
 
     // Initialize the entries in the misc register scoreboard to be ready.
-    for (RegIndex index = 0; index < numMiscRegs; ++index)
+    for (RegIndex index = numPhysicalRegs;
+         index < numPhysicalRegs + numMiscRegs; ++index)
     {
         miscScoreboard[index] = 1;
     }
 }
 
+SimpleRenameMap::~SimpleRenameMap()
+{
+    // Both rename maps come from new[] in the constructor; release them.
+    delete [] floatRenameMap;
+    delete [] intRenameMap;
+}
+
 void
 SimpleRenameMap::setFreeList(SimpleFreeList *fl_ptr)
 {
@@ -116,6 +134,8 @@ SimpleRenameMap::rename(RegIndex arch_reg)
             // Update the integer rename map.
             intRenameMap[arch_reg].physical_reg = renamed_reg;
 
+            assert(renamed_reg >= 0 && renamed_reg < numPhysicalIntRegs);
+
             // Mark register as not ready.
             intScoreboard[renamed_reg] = false;
         } else {
@@ -124,7 +144,7 @@ SimpleRenameMap::rename(RegIndex arch_reg)
         }
     } else if (arch_reg < numLogicalRegs) {
         // Subtract off the base offset for floating point registers.
-        arch_reg = arch_reg - numLogicalIntRegs;
+//        arch_reg = arch_reg - numLogicalIntRegs;
 
         // Record the current physical register that is renamed to the
         // requested architected register.
@@ -139,6 +159,9 @@ SimpleRenameMap::rename(RegIndex arch_reg)
             // Update the floating point rename map.
             floatRenameMap[arch_reg].physical_reg = renamed_reg;
 
+            assert(renamed_reg < numPhysicalRegs &&
+                   renamed_reg >= numPhysicalIntRegs);
+
             // Mark register as not ready.
             floatScoreboard[renamed_reg] = false;
         } else {
@@ -160,6 +183,8 @@ SimpleRenameMap::rename(RegIndex arch_reg)
         // so the free list can avoid adding it.
         prev_reg = renamed_reg;
 
+        assert(renamed_reg < numPhysicalRegs + numMiscRegs);
+
         miscScoreboard[renamed_reg] = false;
     }
 
@@ -175,7 +200,7 @@ SimpleRenameMap::lookup(RegIndex arch_reg)
         return intRenameMap[arch_reg].physical_reg;
     } else if (arch_reg < numLogicalRegs) {
         // Subtract off the base FP offset.
-        arch_reg = arch_reg - numLogicalIntRegs;
+//        arch_reg = arch_reg - numLogicalIntRegs;
 
         return floatRenameMap[arch_reg].physical_reg;
     } else {
@@ -196,12 +221,12 @@ SimpleRenameMap::isReady(PhysRegIndex phys_reg)
     } else if (phys_reg < numPhysicalRegs) {
 
         // Subtract off the base FP offset.
-        phys_reg = phys_reg - numPhysicalIntRegs;
+//        phys_reg = phys_reg - numPhysicalIntRegs;
 
         return floatScoreboard[phys_reg];
     } else {
         // Subtract off the misc registers offset.
-        phys_reg = phys_reg - numPhysicalRegs;
+//        phys_reg = phys_reg - numPhysicalRegs;
 
         return miscScoreboard[phys_reg];
     }
@@ -218,13 +243,10 @@ SimpleRenameMap::setEntry(RegIndex arch_reg, PhysRegIndex renamed_reg)
 
         intRenameMap[arch_reg].physical_reg = renamed_reg;
     } else {
-//        assert(arch_reg < (numLogicalIntRegs + numLogicalFloatRegs));
-
-        // Subtract off the base FP offset.
-        arch_reg = arch_reg - numLogicalIntRegs;
+        assert(arch_reg < (numLogicalIntRegs + numLogicalFloatRegs));
 
         DPRINTF(Rename, "Rename Map: Float register %i being set to %i.\n",
-                (int)arch_reg, renamed_reg);
+                (int)arch_reg - numLogicalIntRegs, renamed_reg);
 
         floatRenameMap[arch_reg].physical_reg = renamed_reg;
     }
@@ -234,6 +256,8 @@ void
 SimpleRenameMap::squash(vector<RegIndex> freed_regs,
                         vector<UnmapInfo> unmaps)
 {
+    panic("Not sure this function should be called.");
+
     // Not sure the rename map should be able to access the free list
     // like this.
     while (!freed_regs.empty()) {
@@ -260,16 +284,18 @@ SimpleRenameMap::markAsReady(PhysRegIndex ready_reg)
             (int)ready_reg);
 
     if (ready_reg < numPhysicalIntRegs) {
+        assert(ready_reg >= 0);
+
         intScoreboard[ready_reg] = 1;
     } else if (ready_reg < numPhysicalRegs) {
 
         // Subtract off the base FP offset.
-        ready_reg = ready_reg - numPhysicalIntRegs;
+//        ready_reg = ready_reg - numPhysicalIntRegs;
 
         floatScoreboard[ready_reg] = 1;
     } else {
         //Subtract off the misc registers offset.
-        ready_reg = ready_reg - numPhysicalRegs;
+//        ready_reg = ready_reg - numPhysicalRegs;
 
         miscScoreboard[ready_reg] = 1;
     }
diff --git a/cpu/beta_cpu/rename_map.hh b/cpu/beta_cpu/rename_map.hh
index 05b52bfb2..e68fa05a8 100644
--- a/cpu/beta_cpu/rename_map.hh
+++ b/cpu/beta_cpu/rename_map.hh
@@ -1,6 +1,5 @@
 // Todo:  Create destructor.
-// Make it so that there's a proper separation between int and fp.  Also
-// have it so that there's a more meaningful name given to the variable
+// Have it so that there's a more meaningful name given to the variable
 // that marks the beginning of the FP registers.
 
 #ifndef __RENAME_MAP_HH__
@@ -10,7 +9,6 @@
 #include <vector>
 #include <utility>
 
-//Will want to include faults
 #include "cpu/beta_cpu/free_list.hh"
 
 using namespace std;
@@ -18,8 +16,6 @@ using namespace std;
 class SimpleRenameMap
 {
   public:
-//    typedef typename Impl::RegIndex RegIndex;
-
     /**
      * Pair of a logical register and a physical register.  Tells the
      * previous mapping of a logical register to a physical register.
@@ -45,6 +41,9 @@ class SimpleRenameMap
                     RegIndex _intZeroReg,
                     RegIndex _floatZeroReg);
 
+    /** Destructor. */
+    ~SimpleRenameMap();
+
     void setFreeList(SimpleFreeList *fl_ptr);
 
     //Tell rename map to get a free physical register for a given
@@ -110,7 +109,9 @@ class SimpleRenameMap
         PhysRegIndex physical_reg;
         bool valid;
 
-        RenameEntry();
+        RenameEntry()
+            : physical_reg(0), valid(false)
+        { }
     };
 
     /** Integer rename map. */
@@ -122,6 +123,8 @@ class SimpleRenameMap
     /** Free list interface. */
     SimpleFreeList *freeList;
 
+    // Might want to make all these scoreboards into one large scoreboard.
+
     /** Scoreboard of physical integer registers, saying whether or not they
      *  are ready.
      */
diff --git a/cpu/beta_cpu/rob.hh b/cpu/beta_cpu/rob.hh
index 7963d1b01..c921c0619 100644
--- a/cpu/beta_cpu/rob.hh
+++ b/cpu/beta_cpu/rob.hh
@@ -16,24 +16,20 @@ using namespace std;
 
 /**
  * ROB class.  Uses the instruction list that exists within the CPU to
- * represent the ROB.  This class doesn't contain that structure, but instead
- * a pointer to the CPU to get access to the structure.  The ROB has a large
- * hand in squashing instructions within the CPU, and is responsible for
- * sending out the squash signal as well as what instruction is to be
- * squashed.  The ROB also controls most of the calls to the CPU to delete
- * instructions; the only other call is made in the first stage of the pipe-
- * line, which tells the CPU to delete all instructions not in the ROB.
+ * represent the ROB.  This class doesn't contain that list, but instead
+ * a pointer to the CPU to get access to the list.  The ROB, in this first
+ * implementation, is largely what drives squashing.
  */
-template<class Impl>
+template <class Impl>
 class ROB
 {
   public:
     //Typedefs from the Impl.
     typedef typename Impl::FullCPU FullCPU;
-    typedef typename Impl::DynInst DynInst;
+    typedef typename Impl::DynInstPtr DynInstPtr;
 
-    typedef pair<RegIndex, PhysRegIndex> UnmapInfo;
-    typedef typename list<DynInst *>::iterator InstIt;
+    typedef pair<RegIndex, PhysRegIndex> UnmapInfo_t;
+    typedef typename list<DynInstPtr>::iterator InstIt_t;
 
   public:
     /** ROB constructor.
@@ -56,15 +52,15 @@ class ROB
      *  @params inst The instruction being inserted into the ROB.
      *  @todo Remove the parameter once correctness is ensured.
      */
-    void insertInst(DynInst *inst);
+    void insertInst(DynInstPtr &inst);
 
     /** Returns pointer to the head instruction within the ROB.  There is
      *  no guarantee as to the return value if the ROB is empty.
      *  @retval Pointer to the DynInst that is at the head of the ROB.
      */
-    DynInst *readHeadInst() { return cpu->instList.front(); }
+    DynInstPtr readHeadInst() { return cpu->instList.front(); }
 
-    DynInst *readTailInst() { return (*tail); }
+    DynInstPtr readTailInst() { return (*tail); }
 
     void retireHead();
 
@@ -108,15 +104,28 @@ class ROB
     /** Pointer to the CPU. */
     FullCPU *cpu;
 
+    /** Maximum number of instructions the ROB can hold. */
     unsigned numEntries;
 
     /** Number of instructions that can be squashed in a single cycle. */
     unsigned squashWidth;
 
-    InstIt tail;
-
-    InstIt squashIt;
+    /** Iterator pointing to the instruction which is the last instruction
+     *  in the ROB.  This may at times be invalid (ie when the ROB is empty),
+     *  however it should never be incorrect.
+     */
+    InstIt_t tail;
+
+    /** Iterator used for walking through the list of instructions when
+     *  squashing.  Used so that there is persistent state between cycles;
+     *  when squashing, the instructions are marked as squashed but not
+     *  immediately removed, meaning the tail iterator remains the same before
+     *  and after a squash.
+     *  This will always be set to cpu->instList.end() if it is invalid.
+     */
+    InstIt_t squashIt;
 
+    /** Number of instructions in the ROB. */
     int numInstsInROB;
 
     /** The sequence number of the squashed instruction. */
diff --git a/cpu/beta_cpu/rob_impl.hh b/cpu/beta_cpu/rob_impl.hh
index 308a8010f..862008429 100644
--- a/cpu/beta_cpu/rob_impl.hh
+++ b/cpu/beta_cpu/rob_impl.hh
@@ -3,7 +3,7 @@
 
 #include "cpu/beta_cpu/rob.hh"
 
-template<class Impl>
+template <class Impl>
 ROB<Impl>::ROB(unsigned _numEntries, unsigned _squashWidth)
     : numEntries(_numEntries),
       squashWidth(_squashWidth),
@@ -13,43 +13,60 @@ ROB<Impl>::ROB(unsigned _numEntries, unsigned _squashWidth)
     doneSquashing = true;
 }
 
-template<class Impl>
+template <class Impl>
 void
 ROB<Impl>::setCPU(FullCPU *cpu_ptr)
 {
     cpu = cpu_ptr;
 
+    // Set the tail to the beginning of the CPU instruction list so that
+    // upon the first instruction being inserted into the ROB, the tail
+    // iterator can simply be incremented.
     tail = cpu->instList.begin();
 
+    // Set the squash iterator to the end of the instruction list.
     squashIt = cpu->instList.end();
 }
 
-template<class Impl>
+template <class Impl>
 int
 ROB<Impl>::countInsts()
 {
-/*
-    int return_val = 0;
+    // Start at 1; if the tail matches cpu->instList.begin(), then there is
+    // one inst in the ROB.
+    int return_val = 1;
+
+    // There are quite a few special cases.  Do not use this function other
+    // than for debugging purposes.
+    if (cpu->instList.begin() == cpu->instList.end()) {
+        // In this case there are no instructions in the list.  The ROB
+        // must be empty.
+        return 0;
+    } else if (tail == cpu->instList.end()) {
+        // In this case, the tail is not yet pointing to anything valid.
+        // The ROB must be empty.
+        return 0;
+    }
 
     // Iterate through the ROB from the head to the tail, counting the
     // entries.
-    for (InstIt i = cpu->instList.begin(); i != tail; i++)
+    for (InstIt_t i = cpu->instList.begin(); i != tail; ++i)
     {
         assert(i != cpu->instList.end());
-        return_val++;
+        ++return_val;
     }
 
     return return_val;
-*/
+
     // Because the head won't be tracked properly until the ROB gets the
     // first instruction, and any time that the ROB is empty and has not
     // yet gotten the instruction, this function doesn't work.
-    return numInstsInROB;
+//    return numInstsInROB;
 }
 
-template<class Impl>
+template <class Impl>
 void
-ROB<Impl>::insertInst(DynInst *inst)
+ROB<Impl>::insertInst(DynInstPtr &inst)
 {
     // Make sure we have the right number of instructions.
     assert(numInstsInROB == countInsts());
@@ -68,7 +85,7 @@ ROB<Impl>::insertInst(DynInst *inst)
     // in which case the tail will be pointing at instList.end().  If that
     // happens, then reset the tail to the beginning of the list.
     if (tail != cpu->instList.end()) {
-        tail++;
+        ++tail;
     } else {
         tail = cpu->instList.begin();
     }
@@ -83,13 +100,14 @@ ROB<Impl>::insertInst(DynInst *inst)
 
 // Whatever calls this function needs to ensure that it properly frees up
 // registers prior to this function.
-template<class Impl>
+template <class Impl>
 void
 ROB<Impl>::retireHead()
 {
     assert(numInstsInROB == countInsts());
+    assert(numInstsInROB > 0);
 
-    DynInst *head_inst;
+    DynInstPtr head_inst;
 
     // Get the head ROB instruction.
     head_inst = cpu->instList.front();
@@ -116,12 +134,12 @@ ROB<Impl>::retireHead()
     }
 }
 
-template<class Impl>
+template <class Impl>
 bool
 ROB<Impl>::isHeadReady()
 {
     if (numInstsInROB != 0) {
-        DynInst *head_inst = cpu->instList.front();
+        DynInstPtr head_inst = cpu->instList.front();
 
         return head_inst->readyToCommit();
     }
@@ -129,7 +147,7 @@ ROB<Impl>::isHeadReady()
     return false;
 }
 
-template<class Impl>
+template <class Impl>
 unsigned
 ROB<Impl>::numFreeEntries()
 {
@@ -138,7 +156,7 @@ ROB<Impl>::numFreeEntries()
     return numEntries - numInstsInROB;
 }
 
-template<class Impl>
+template <class Impl>
 void
 ROB<Impl>::doSquash()
 {
@@ -162,6 +180,12 @@ ROB<Impl>::doSquash()
 
         (*squashIt)->setCanCommit();
 
+        // Special case for when squashing due to a syscall.  It's possible
+        // that the squash happened after the head instruction was already
+        // committed, meaning that (*squashIt)->seqNum != squashedSeqNum
+        // will never be false.  Normally the squash would never be able
+        // to go past the head of the ROB; in this case it might, so it
+        // must be handled otherwise it will segfault.
 #ifndef FULL_SYSTEM
         if (squashIt == cpu->instList.begin()) {
             DPRINTF(ROB, "ROB: Reached head of instruction list while "
@@ -190,7 +214,7 @@ ROB<Impl>::doSquash()
     }
 }
 
-template<class Impl>
+template <class Impl>
 void
 ROB<Impl>::squash(InstSeqNum squash_num)
 {
@@ -206,41 +230,41 @@ ROB<Impl>::squash(InstSeqNum squash_num)
     doSquash();
 }
 
-template<class Impl>
+template <class Impl>
 uint64_t
 ROB<Impl>::readHeadPC()
 {
     assert(numInstsInROB == countInsts());
 
-    DynInst *head_inst = cpu->instList.front();
+    DynInstPtr head_inst = cpu->instList.front();
 
     return head_inst->readPC();
 }
 
-template<class Impl>
+template <class Impl>
 uint64_t
 ROB<Impl>::readHeadNextPC()
 {
     assert(numInstsInROB == countInsts());
 
-    DynInst *head_inst = cpu->instList.front();
+    DynInstPtr head_inst = cpu->instList.front();
 
     return head_inst->readNextPC();
 }
 
-template<class Impl>
+template <class Impl>
 InstSeqNum
 ROB<Impl>::readHeadSeqNum()
 {
     // Return the last sequence number that has not been squashed.  Other
     // stages can use it to squash any instructions younger than the current
     // tail.
-    DynInst *head_inst = cpu->instList.front();
+    DynInstPtr head_inst = cpu->instList.front();
 
     return head_inst->seqNum;
 }
 
-template<class Impl>
+template <class Impl>
 uint64_t
 ROB<Impl>::readTailPC()
 {
@@ -251,7 +275,7 @@ ROB<Impl>::readTailPC()
     return (*tail)->readPC();
 }
 
-template<class Impl>
+template <class Impl>
 InstSeqNum
 ROB<Impl>::readTailSeqNum()
 {
diff --git a/cpu/beta_cpu/store_set.cc b/cpu/beta_cpu/store_set.cc
new file mode 100644
index 000000000..46d763d37
--- /dev/null
+++ b/cpu/beta_cpu/store_set.cc
@@ -0,0 +1,192 @@
+#include "cpu/beta_cpu/store_set.hh"
+#include "base/trace.hh"
+
+StoreSet::StoreSet(int _SSIT_size, int _LFST_size)
+    : SSIT_size(_SSIT_size), LFST_size(_LFST_size)
+{
+    DPRINTF(StoreSet, "StoreSet: Creating store set object.\n");
+
+    // Store Set ID Table: indexed by calcIndex(PC); every entry starts
+    // out invalid.
+    SSIT = new SSID[SSIT_size];
+    validSSIT.resize(SSIT_size);
+    for (int i = 0; i < SSIT_size; ++i)
+        validSSIT[i] = false;
+
+    // Last Fetched Store Table and the per-set member counters: indexed
+    // by SSID.  The sequence numbers are zeroed as well so that no code
+    // path can ever read an indeterminate LFST entry.
+    LFST = new InstSeqNum[LFST_size];
+    validLFST.resize(LFST_size);
+    SSCounters = new int[LFST_size];
+    for (int i = 0; i < LFST_size; ++i) {
+        LFST[i] = 0;
+        validLFST[i] = false;
+        SSCounters[i] = 0;
+    }
+
+    // The mask only selects every SSIT entry if SSIT_size is a power of 2.
+    index_mask = SSIT_size - 1;
+    offset_bits = 2;  // Low PC bits dropped by calcIndex(); hardcoded.
+}
+
+void
+StoreSet::violation(Addr load_PC, Addr store_PC)
+{
+    const int ld_idx = calcIndex(load_PC);
+    const int st_idx = calcIndex(store_PC);
+
+    // Sample both valid bits up front; the branches below modify them.
+    const bool ld_has_set = validSSIT[ld_idx];
+    const bool st_has_set = validSSIT[st_idx];
+
+    if (ld_has_set && st_has_set) {
+        // Both already belong to a store set: move one of them into the
+        // other's set.
+        const SSID ld_set = SSIT[ld_idx];
+        const SSID st_set = SSIT[st_idx];
+
+        // The set with strictly more members wins; on a tie the load
+        // joins the store's set.
+        if (SSCounters[ld_set] > SSCounters[st_set]) {
+            SSIT[st_idx] = ld_set;
+            ++SSCounters[ld_set];
+            --SSCounters[st_set];
+        } else {
+            SSIT[ld_idx] = st_set;
+            ++SSCounters[st_set];
+            --SSCounters[ld_set];
+        }
+
+        return;
+    }
+
+    if (ld_has_set) {
+        // Only the load has a set: the store joins it, growing that set.
+        const SSID ld_set = SSIT[ld_idx];
+
+        validSSIT[st_idx] = true;
+        SSIT[st_idx] = ld_set;
+        ++SSCounters[ld_set];
+
+        return;
+    }
+
+    if (st_has_set) {
+        // Only the store has a set: the load joins it.  A load joining
+        // an existing set does not change that set's counter.
+        validSSIT[ld_idx] = true;
+        SSIT[ld_idx] = SSIT[st_idx];
+
+        return;
+    }
+
+    // Neither has a set yet: create one derived from the load's PC and
+    // place both instructions in it.
+    const SSID new_set = calcSSID(load_PC);
+
+    validSSIT[ld_idx] = true;
+    SSIT[ld_idx] = new_set;
+    validSSIT[st_idx] = true;
+    SSIT[st_idx] = new_set;
+    ++SSCounters[new_set];
+}
+
+void
+StoreSet::insertLoad(Addr load_PC, InstSeqNum load_seq_num)
+{
+    // Intentionally empty: nothing is recorded when a load is inserted;
+    // both arguments are ignored.
+}
+
+void
+StoreSet::insertStore(Addr store_PC, InstSeqNum store_seq_num)
+{
+    const int index = calcIndex(store_PC);
+
+    // A store that belongs to no store set has nothing to record.
+    if (!validSSIT[index]) {
+        return;
+    }
+
+    const int store_SSID = SSIT[index];
+
+    assert(store_SSID < LFST_size);
+
+    // Record this store as the last fetched store of its set and mark the
+    // entry valid; checkInst() ignores LFST entries whose valid bit is clear.
+    LFST[store_SSID] = store_seq_num;
+    validLFST[store_SSID] = true;
+}
+
+InstSeqNum
+StoreSet::checkInst(Addr PC)
+{
+    // Returns the sequence number stored in the LFST for the store set
+    // that PC maps to, or 0 when there is no valid SSIT or LFST entry.
+    const int ssit_idx = calcIndex(PC);
+
+    // No store set has been assigned to this PC.
+    if (!validSSIT[ssit_idx]) {
+        return 0;
+    }
+
+    const int set_id = SSIT[ssit_idx];
+    assert(set_id < LFST_size);
+
+    // A set whose LFST entry is invalid has nothing to report.
+    if (validLFST[set_id]) {
+        return LFST[set_id];
+    }
+
+    return 0;
+}
+
+void
+StoreSet::issued(Addr issued_PC, InstSeqNum issued_seq_num, bool is_store)
+{
+    // Only issued stores update the LFST.
+    if (!is_store)
+        return;
+
+    const int index = calcIndex(issued_PC);
+
+    // A store with no valid SSIT entry is untracked: insertStore() never
+    // recorded it, or clear() dropped the entry.  Nothing to invalidate.
+    if (!validSSIT[index])
+        return;
+
+    const int store_SSID = SSIT[index];
+    assert(store_SSID < LFST_size);
+
+    // If the last fetched store of the set is the store that was just
+    // issued, invalidate the entry.
+    if (validLFST[store_SSID] && LFST[store_SSID] == issued_seq_num) {
+        validLFST[store_SSID] = false;
+    }
+}
+
+void
+StoreSet::squash(InstSeqNum squashed_num)
+{
+    // Invalidate LFST entries of stores younger than squashed_num (larger
+    // sequence number); assumes squashed_num itself survives -- confirm.
+    for (int i = 0; i < LFST_size; ++i) {
+        if (validLFST[i] && LFST[i] > squashed_num) {
+            validLFST[i] = false;
+        }
+    }
+}
+
+void
+StoreSet::clear()
+{
+    // Invalidate all SSIT and LFST entries; emptied sets also get size 0.
+    for (int i = 0; i < SSIT_size; ++i)
+        validSSIT[i] = false;
+    for (int i = 0; i < LFST_size; ++i) {
+        validLFST[i] = false;
+        SSCounters[i] = 0;
+    }
+}
+
diff --git a/cpu/beta_cpu/store_set.hh b/cpu/beta_cpu/store_set.hh
new file mode 100644
index 000000000..701c60a2d
--- /dev/null
+++ b/cpu/beta_cpu/store_set.hh
@@ -0,0 +1,58 @@
+#ifndef __STORE_SET_HH__
+#define __STORE_SET_HH__
+
+#include <vector>
+
+#include "arch/alpha/isa_traits.hh"
+#include "cpu/inst_seq.hh"
+
+class StoreSet
+{
+  public:
+    typedef unsigned SSID;
+
+    /** Allocates an SSIT of SSIT_size and an LFST of LFST_size entries. */
+    StoreSet(int SSIT_size, int LFST_size);
+
+    /** Frees the tables that the constructor allocated with new[]. */
+    ~StoreSet() { delete [] SSIT; delete [] LFST; delete [] SSCounters; }
+
+    /** Puts the load and the store into the same store set. */
+    void violation(Addr load_PC, Addr store_PC);
+
+    void insertLoad(Addr load_PC, InstSeqNum load_seq_num);
+
+    /** Records the store as the last fetched store of its set, if any. */
+    void insertStore(Addr store_PC, InstSeqNum store_seq_num);
+
+    /** Returns the set's last fetched store, or 0 if there is none. */
+    InstSeqNum checkInst(Addr PC);
+
+    void issued(Addr issued_PC, InstSeqNum issued_seq_num, bool is_store);
+    void squash(InstSeqNum squashed_num);
+    void clear();
+
+  private:
+    // Not copyable: a copy would delete [] the same tables twice.
+    StoreSet(const StoreSet &);
+    StoreSet &operator=(const StoreSet &);
+
+    inline int calcIndex(Addr PC)
+    { return (PC >> offset_bits) & index_mask; }
+
+    inline SSID calcSSID(Addr PC)
+    { return ((PC ^ (PC >> 10)) % LFST_size); }
+
+    SSID *SSIT;
+    std::vector<bool> validSSIT;
+    InstSeqNum *LFST;
+    std::vector<bool> validLFST;
+    int *SSCounters;
+    int SSIT_size;
+    int LFST_size;
+    int index_mask;
+    // HACK: Hardcoded for now.
+    int offset_bits;
+};
+
+#endif // __STORE_SET_HH__
diff --git a/cpu/static_inst.hh b/cpu/static_inst.hh
index 7a707c86a..71e9ef441 100644
--- a/cpu/static_inst.hh
+++ b/cpu/static_inst.hh
@@ -40,9 +40,12 @@
 #include "targetarch/isa_traits.hh"
 
 // forward declarations
+struct AlphaSimpleImpl;
 class ExecContext;
-class AlphaDynInst;
 class DynInst;
+template <class Impl>
+class AlphaDynInst;
+
 class FastCPU;
 class SimpleCPU;
 class SymbolTable;
-- 
2.30.2