Port: Move retry from port base class to Master/SlavePort
[gem5.git] / src / cpu / simple / atomic.cc
index 86deb84e61d28ea218a73a5fa101dc914e9a7c29..0886b276f7d52ac424e573267de0c863dac8e733 100644 (file)
@@ -1,4 +1,16 @@
 /*
+ * Copyright (c) 2012 ARM Limited
+ * All rights reserved.
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder.  You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
  * Copyright (c) 2002-2005 The Regents of The University of Michigan
  * All rights reserved.
  *
  */
 
 #include "arch/locked_mem.hh"
-#include "arch/mmaped_ipr.hh"
+#include "arch/mmapped_ipr.hh"
 #include "arch/utility.hh"
 #include "base/bigint.hh"
-#include "cpu/exetrace.hh"
+#include "config/the_isa.hh"
 #include "cpu/simple/atomic.hh"
+#include "cpu/exetrace.hh"
+#include "debug/ExecFaulting.hh"
+#include "debug/SimpleCPU.hh"
 #include "mem/packet.hh"
 #include "mem/packet_access.hh"
+#include "mem/physical.hh"
 #include "params/AtomicSimpleCPU.hh"
+#include "sim/faults.hh"
 #include "sim/system.hh"
+#include "sim/full_system.hh"
 
 using namespace std;
 using namespace TheISA;
 
 AtomicSimpleCPU::TickEvent::TickEvent(AtomicSimpleCPU *c)
-    : Event(&mainEventQueue, CPU_Tick_Pri), cpu(c)
+    : Event(CPU_Tick_Pri), cpu(c)
 {
 }
 
@@ -55,118 +73,50 @@ AtomicSimpleCPU::TickEvent::process()
 }
 
 const char *
-AtomicSimpleCPU::TickEvent::description()
+AtomicSimpleCPU::TickEvent::description() const
 {
     return "AtomicSimpleCPU tick";
 }
 
-Port *
-AtomicSimpleCPU::getPort(const std::string &if_name, int idx)
-{
-    if (if_name == "dcache_port")
-        return &dcachePort;
-    else if (if_name == "icache_port")
-        return &icachePort;
-    else if (if_name == "physmem_port") {
-        hasPhysMemPort = true;
-        return &physmemPort;
-    }
-    else
-        panic("No Such Port\n");
-}
-
 void
 AtomicSimpleCPU::init()
 {
     BaseCPU::init();
-#if FULL_SYSTEM
-    for (int i = 0; i < threadContexts.size(); ++i) {
-        ThreadContext *tc = threadContexts[i];
-
-        // initialize CPU, including PC
-        TheISA::initCPU(tc, tc->readCpuId());
-    }
-#endif
-    if (hasPhysMemPort) {
-        bool snoop = false;
-        AddrRangeList pmAddrList;
-        physmemPort.getPeerAddressRanges(pmAddrList, snoop);
-        physMemAddr = *pmAddrList.begin();
-    }
-}
-
-bool
-AtomicSimpleCPU::CpuPort::recvTiming(PacketPtr pkt)
-{
-    panic("AtomicSimpleCPU doesn't expect recvTiming callback!");
-    return true;
-}
-
-Tick
-AtomicSimpleCPU::CpuPort::recvAtomic(PacketPtr pkt)
-{
-    //Snooping a coherence request, just return
-    return 0;
-}
 
-void
-AtomicSimpleCPU::CpuPort::recvFunctional(PacketPtr pkt)
-{
-    //No internal storage to update, just return
-    return;
-}
+    // Initialise the ThreadContext's memory proxies
+    tcBase()->initMemProxies(tcBase());
 
-void
-AtomicSimpleCPU::CpuPort::recvStatusChange(Status status)
-{
-    if (status == RangeChange) {
-        if (!snoopRangeSent) {
-            snoopRangeSent = true;
-            sendStatusChange(Port::RangeChange);
+    if (FullSystem && !params()->defer_registration) {
+        ThreadID size = threadContexts.size();
+        for (ThreadID i = 0; i < size; ++i) {
+            ThreadContext *tc = threadContexts[i];
+            // initialize CPU, including PC
+            TheISA::initCPU(tc, tc->contextId());
         }
-        return;
     }
 
-    panic("AtomicSimpleCPU doesn't expect recvStatusChange callback!");
-}
-
-void
-AtomicSimpleCPU::CpuPort::recvRetry()
-{
-    panic("AtomicSimpleCPU doesn't expect recvRetry callback!");
+    // Atomic doesn't do MT right now, so contextId == threadId
+    ifetch_req.setThreadContext(_cpuId, 0); // Add thread ID if we add MT
+    data_read_req.setThreadContext(_cpuId, 0); // Add thread ID here too
+    data_write_req.setThreadContext(_cpuId, 0); // Add thread ID here too
 }
 
-void
-AtomicSimpleCPU::DcachePort::setPeer(Port *port)
-{
-    Port::setPeer(port);
-
-#if FULL_SYSTEM
-    // Update the ThreadContext's memory ports (Functional/Virtual
-    // Ports)
-    cpu->tcBase()->connectMemPorts();
-#endif
-}
-
-AtomicSimpleCPU::AtomicSimpleCPU(Params *p)
-    : BaseSimpleCPU(p), tickEvent(this),
-      width(p->width), simulate_stalls(p->simulate_stalls),
+AtomicSimpleCPU::AtomicSimpleCPU(AtomicSimpleCPUParams *p)
+    : BaseSimpleCPU(p), tickEvent(this), width(p->width), locked(false),
+      simulate_data_stalls(p->simulate_data_stalls),
+      simulate_inst_stalls(p->simulate_inst_stalls),
       icachePort(name() + "-iport", this), dcachePort(name() + "-iport", this),
-      physmemPort(name() + "-iport", this), hasPhysMemPort(false)
+      fastmem(p->fastmem)
 {
     _status = Idle;
-
-    icachePort.snoopRangeSent = false;
-    dcachePort.snoopRangeSent = false;
-
-    ifetch_req.setThreadContext(p->cpu_id, 0); // Add thread ID if we add MT
-    data_read_req.setThreadContext(p->cpu_id, 0); // Add thread ID here too
-    data_write_req.setThreadContext(p->cpu_id, 0); // Add thread ID here too
 }
 
 
 AtomicSimpleCPU::~AtomicSimpleCPU()
 {
+    if (tickEvent.scheduled()) {
+        deschedule(tickEvent);
+    }
 }
 
 void
@@ -174,8 +124,7 @@ AtomicSimpleCPU::serialize(ostream &os)
 {
     SimObject::State so_state = SimObject::getState();
     SERIALIZE_ENUM(so_state);
-    Status _status = status();
-    SERIALIZE_ENUM(_status);
+    SERIALIZE_SCALAR(locked);
     BaseSimpleCPU::serialize(os);
     nameOut(os, csprintf("%s.tickEvent", name()));
     tickEvent.serialize(os);
@@ -186,7 +135,7 @@ AtomicSimpleCPU::unserialize(Checkpoint *cp, const string &section)
 {
     SimObject::State so_state;
     UNSERIALIZE_ENUM(so_state);
-    UNSERIALIZE_ENUM(_status);
+    UNSERIALIZE_SCALAR(locked);
     BaseSimpleCPU::unserialize(cp, section);
     tickEvent.unserialize(cp, csprintf("%s.tickEvent", section));
 }
@@ -194,23 +143,24 @@ AtomicSimpleCPU::unserialize(Checkpoint *cp, const string &section)
 void
 AtomicSimpleCPU::resume()
 {
+    if (_status == Idle || _status == SwitchedOut)
+        return;
+
     DPRINTF(SimpleCPU, "Resume\n");
-    if (_status != SwitchedOut && _status != Idle) {
-        assert(system->getMemoryMode() == Enums::atomic);
+    assert(system->getMemoryMode() == Enums::atomic);
 
-        changeState(SimObject::Running);
-        if (thread->status() == ThreadContext::Active) {
-            if (!tickEvent.scheduled()) {
-                tickEvent.schedule(nextCycle());
-            }
-        }
+    changeState(SimObject::Running);
+    if (thread->status() == ThreadContext::Active) {
+        if (!tickEvent.scheduled())
+            schedule(tickEvent, nextCycle());
     }
+    system->totalNumInsts = 0;
 }
 
 void
 AtomicSimpleCPU::switchOut()
 {
-    assert(status() == Running || status() == Idle);
+    assert(_status == Running || _status == Idle);
     _status = SwitchedOut;
 
     tickEvent.squash();
@@ -220,28 +170,33 @@ AtomicSimpleCPU::switchOut()
 void
 AtomicSimpleCPU::takeOverFrom(BaseCPU *oldCPU)
 {
-    BaseCPU::takeOverFrom(oldCPU, &icachePort, &dcachePort);
+    BaseCPU::takeOverFrom(oldCPU);
 
     assert(!tickEvent.scheduled());
 
     // if any of this CPU's ThreadContexts are active, mark the CPU as
     // running and schedule its tick event.
-    for (int i = 0; i < threadContexts.size(); ++i) {
+    ThreadID size = threadContexts.size();
+    for (ThreadID i = 0; i < size; ++i) {
         ThreadContext *tc = threadContexts[i];
         if (tc->status() == ThreadContext::Active && _status != Running) {
             _status = Running;
-            tickEvent.schedule(nextCycle());
+            schedule(tickEvent, nextCycle());
             break;
         }
     }
     if (_status != Running) {
         _status = Idle;
     }
+    assert(threadContexts.size() == 1);
+    ifetch_req.setThreadContext(_cpuId, 0); // Add thread ID if we add MT
+    data_read_req.setThreadContext(_cpuId, 0); // Add thread ID here too
+    data_write_req.setThreadContext(_cpuId, 0); // Add thread ID here too
 }
 
 
 void
-AtomicSimpleCPU::activateContext(int thread_num, int delay)
+AtomicSimpleCPU::activateContext(ThreadID thread_num, int delay)
 {
     DPRINTF(SimpleCPU, "ActivateContext %d (%d cycles)\n", thread_num, delay);
 
@@ -252,254 +207,225 @@ AtomicSimpleCPU::activateContext(int thread_num, int delay)
     assert(!tickEvent.scheduled());
 
     notIdleFraction++;
+    numCycles += tickToCycles(thread->lastActivate - thread->lastSuspend);
 
     //Make sure ticks are still on multiples of cycles
-    tickEvent.schedule(nextCycle(curTick + cycles(delay)));
+    schedule(tickEvent, nextCycle(curTick() + ticks(delay)));
     _status = Running;
 }
 
 
 void
-AtomicSimpleCPU::suspendContext(int thread_num)
+AtomicSimpleCPU::suspendContext(ThreadID thread_num)
 {
     DPRINTF(SimpleCPU, "SuspendContext %d\n", thread_num);
 
     assert(thread_num == 0);
     assert(thread);
 
+    if (_status == Idle)
+        return;
+
     assert(_status == Running);
 
     // tick event may not be scheduled if this gets called from inside
     // an instruction's execution, e.g. "quiesce"
     if (tickEvent.scheduled())
-        tickEvent.deschedule();
+        deschedule(tickEvent);
 
     notIdleFraction--;
     _status = Idle;
 }
 
 
-template <class T>
 Fault
-AtomicSimpleCPU::read(Addr addr, T &data, unsigned flags)
+AtomicSimpleCPU::readMem(Addr addr, uint8_t * data,
+                         unsigned size, unsigned flags)
 {
     // use the CPU's statically allocated read request and packet objects
     Request *req = &data_read_req;
-    req->setVirt(0, addr, sizeof(T), flags, thread->readPC());
 
     if (traceData) {
         traceData->setAddr(addr);
     }
 
-    // translate to physical address
-    Fault fault = thread->translateDataReadReq(req);
-
-    // Now do the access.
-    if (fault == NoFault) {
-        Packet pkt =
-            Packet(req,
-                   req->isLocked() ? MemCmd::LoadLockedReq : MemCmd::ReadReq,
-                   Packet::Broadcast);
-        pkt.dataStatic(&data);
-
-        if (req->isMmapedIpr())
-            dcache_latency = TheISA::handleIprRead(thread->getTC(), &pkt);
-        else {
-            if (hasPhysMemPort && pkt.getAddr() == physMemAddr)
-                dcache_latency = physmemPort.sendAtomic(&pkt);
-            else
-                dcache_latency = dcachePort.sendAtomic(&pkt);
-        }
-        dcache_access = true;
-        assert(!pkt.isError());
-
-        data = gtoh(data);
+    //The block size of our peer.
+    unsigned blockSize = dcachePort.peerBlockSize();
+    //The size of the data we're trying to read.
+    int fullSize = size;
 
-        if (req->isLocked()) {
-            TheISA::handleLockedRead(thread, req);
-        }
-    }
+    //The address of the second part of this access if it needs to be split
+    //across a cache line boundary.
+    Addr secondAddr = roundDown(addr + size - 1, blockSize);
 
-    // This will need a new way to tell if it has a dcache attached.
-    if (req->isUncacheable())
-        recordEvent("Uncached Read");
+    if (secondAddr > addr)
+        size = secondAddr - addr;
 
-    return fault;
-}
+    dcache_latency = 0;
 
-#ifndef DOXYGEN_SHOULD_SKIP_THIS
+    while (1) {
+        req->setVirt(0, addr, size, flags, dataMasterId(), thread->pcState().instAddr());
 
-template
-Fault
-AtomicSimpleCPU::read(Addr addr, Twin32_t &data, unsigned flags);
+        // translate to physical address
+        Fault fault = thread->dtb->translateAtomic(req, tc, BaseTLB::Read);
 
-template
-Fault
-AtomicSimpleCPU::read(Addr addr, Twin64_t &data, unsigned flags);
-
-template
-Fault
-AtomicSimpleCPU::read(Addr addr, uint64_t &data, unsigned flags);
+        // Now do the access.
+        if (fault == NoFault && !req->getFlags().isSet(Request::NO_ACCESS)) {
+            Packet pkt = Packet(req,
+                                req->isLLSC() ? MemCmd::LoadLockedReq :
+                                MemCmd::ReadReq);
+            pkt.dataStatic(data);
 
-template
-Fault
-AtomicSimpleCPU::read(Addr addr, uint32_t &data, unsigned flags);
-
-template
-Fault
-AtomicSimpleCPU::read(Addr addr, uint16_t &data, unsigned flags);
+            if (req->isMmappedIpr())
+                dcache_latency += TheISA::handleIprRead(thread->getTC(), &pkt);
+            else {
+                if (fastmem && system->isMemAddr(pkt.getAddr()))
+                    system->getPhysMem().access(&pkt);
+                else
+                    dcache_latency += dcachePort.sendAtomic(&pkt);
+            }
+            dcache_access = true;
 
-template
-Fault
-AtomicSimpleCPU::read(Addr addr, uint8_t &data, unsigned flags);
+            assert(!pkt.isError());
 
-#endif //DOXYGEN_SHOULD_SKIP_THIS
+            if (req->isLLSC()) {
+                TheISA::handleLockedRead(thread, req);
+            }
+        }
 
-template<>
-Fault
-AtomicSimpleCPU::read(Addr addr, double &data, unsigned flags)
-{
-    return read(addr, *(uint64_t*)&data, flags);
-}
+        //If there's a fault, return it
+        if (fault != NoFault) {
+            if (req->isPrefetch()) {
+                return NoFault;
+            } else {
+                return fault;
+            }
+        }
 
-template<>
-Fault
-AtomicSimpleCPU::read(Addr addr, float &data, unsigned flags)
-{
-    return read(addr, *(uint32_t*)&data, flags);
-}
+        //If we don't need to access a second cache line, stop now.
+        if (secondAddr <= addr)
+        {
+            if (req->isLocked() && fault == NoFault) {
+                assert(!locked);
+                locked = true;
+            }
+            return fault;
+        }
 
+        /*
+         * Set up for accessing the second cache line.
+         */
 
-template<>
-Fault
-AtomicSimpleCPU::read(Addr addr, int32_t &data, unsigned flags)
-{
-    return read(addr, (uint32_t&)data, flags);
+        //Move the pointer we're reading into to the correct location.
+        data += size;
+        //Adjust the size to get the remaining bytes.
+        size = addr + fullSize - secondAddr;
+        //And access the right address.
+        addr = secondAddr;
+    }
 }
 
 
-template <class T>
 Fault
-AtomicSimpleCPU::write(T data, Addr addr, unsigned flags, uint64_t *res)
+AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size,
+                          Addr addr, unsigned flags, uint64_t *res)
 {
     // use the CPU's statically allocated write request and packet objects
     Request *req = &data_write_req;
-    req->setVirt(0, addr, sizeof(T), flags, thread->readPC());
 
     if (traceData) {
         traceData->setAddr(addr);
     }
 
-    // translate to physical address
-    Fault fault = thread->translateDataWriteReq(req);
-
-    // Now do the access.
-    if (fault == NoFault) {
-        MemCmd cmd = MemCmd::WriteReq; // default
-        bool do_access = true;  // flag to suppress cache access
-
-        if (req->isLocked()) {
-            cmd = MemCmd::StoreCondReq;
-            do_access = TheISA::handleLockedWrite(thread, req);
-        } else if (req->isSwap()) {
-            cmd = MemCmd::SwapReq;
-            if (req->isCondSwap()) {
-                assert(res);
-                req->setExtraData(*res);
-            }
-        }
-
-        if (do_access) {
-            Packet pkt = Packet(req, cmd, Packet::Broadcast);
-            pkt.dataStatic(&data);
-
-            if (req->isMmapedIpr()) {
-                dcache_latency = TheISA::handleIprWrite(thread->getTC(), &pkt);
-            } else {
-                data = htog(data);
-                if (hasPhysMemPort && pkt.getAddr() == physMemAddr)
-                    dcache_latency = physmemPort.sendAtomic(&pkt);
-                else
-                    dcache_latency = dcachePort.sendAtomic(&pkt);
-            }
-            dcache_access = true;
-            assert(!pkt.isError());
-
-            if (req->isSwap()) {
-                assert(res);
-                *res = pkt.get<T>();
-            }
-        }
-
-        if (res && !req->isSwap()) {
-            *res = req->getExtraData();
-        }
-    }
-
-    // This will need a new way to tell if it's hooked up to a cache or not.
-    if (req->isUncacheable())
-        recordEvent("Uncached Write");
+    //The block size of our peer.
+    unsigned blockSize = dcachePort.peerBlockSize();
+    //The size of the data we're trying to write.
+    int fullSize = size;
 
-    // If the write needs to have a fault on the access, consider calling
-    // changeStatus() and changing it to "bad addr write" or something.
-    return fault;
-}
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS
-
-template
-Fault
-AtomicSimpleCPU::write(Twin32_t data, Addr addr,
-                       unsigned flags, uint64_t *res);
+    //The address of the second part of this access if it needs to be split
+    //across a cache line boundary.
+    Addr secondAddr = roundDown(addr + size - 1, blockSize);
 
-template
-Fault
-AtomicSimpleCPU::write(Twin64_t data, Addr addr,
-                       unsigned flags, uint64_t *res);
+    if(secondAddr > addr)
+        size = secondAddr - addr;
 
-template
-Fault
-AtomicSimpleCPU::write(uint64_t data, Addr addr,
-                       unsigned flags, uint64_t *res);
+    dcache_latency = 0;
 
-template
-Fault
-AtomicSimpleCPU::write(uint32_t data, Addr addr,
-                       unsigned flags, uint64_t *res);
+    while(1) {
+        req->setVirt(0, addr, size, flags, dataMasterId(), thread->pcState().instAddr());
 
-template
-Fault
-AtomicSimpleCPU::write(uint16_t data, Addr addr,
-                       unsigned flags, uint64_t *res);
+        // translate to physical address
+        Fault fault = thread->dtb->translateAtomic(req, tc, BaseTLB::Write);
 
-template
-Fault
-AtomicSimpleCPU::write(uint8_t data, Addr addr,
-                       unsigned flags, uint64_t *res);
+        // Now do the access.
+        if (fault == NoFault) {
+            MemCmd cmd = MemCmd::WriteReq; // default
+            bool do_access = true;  // flag to suppress cache access
+
+            if (req->isLLSC()) {
+                cmd = MemCmd::StoreCondReq;
+                do_access = TheISA::handleLockedWrite(thread, req);
+            } else if (req->isSwap()) {
+                cmd = MemCmd::SwapReq;
+                if (req->isCondSwap()) {
+                    assert(res);
+                    req->setExtraData(*res);
+                }
+            }
 
-#endif //DOXYGEN_SHOULD_SKIP_THIS
+            if (do_access && !req->getFlags().isSet(Request::NO_ACCESS)) {
+                Packet pkt = Packet(req, cmd);
+                pkt.dataStatic(data);
+
+                if (req->isMmappedIpr()) {
+                    dcache_latency +=
+                        TheISA::handleIprWrite(thread->getTC(), &pkt);
+                } else {
+                    if (fastmem && system->isMemAddr(pkt.getAddr()))
+                        system->getPhysMem().access(&pkt);
+                    else
+                        dcache_latency += dcachePort.sendAtomic(&pkt);
+                }
+                dcache_access = true;
+                assert(!pkt.isError());
+
+                if (req->isSwap()) {
+                    assert(res);
+                    memcpy(res, pkt.getPtr<uint8_t>(), fullSize);
+                }
+            }
 
-template<>
-Fault
-AtomicSimpleCPU::write(double data, Addr addr, unsigned flags, uint64_t *res)
-{
-    return write(*(uint64_t*)&data, addr, flags, res);
-}
+            if (res && !req->isSwap()) {
+                *res = req->getExtraData();
+            }
+        }
 
-template<>
-Fault
-AtomicSimpleCPU::write(float data, Addr addr, unsigned flags, uint64_t *res)
-{
-    return write(*(uint32_t*)&data, addr, flags, res);
-}
+        //If there's a fault or we don't need to access a second cache line,
+        //stop now.
+        if (fault != NoFault || secondAddr <= addr)
+        {
+            if (req->isLocked() && fault == NoFault) {
+                assert(locked);
+                locked = false;
+            }
+            if (fault != NoFault && req->isPrefetch()) {
+                return NoFault;
+            } else {
+                return fault;
+            }
+        }
 
+        /*
+         * Set up for accessing the second cache line.
+         */
 
-template<>
-Fault
-AtomicSimpleCPU::write(int32_t data, Addr addr, unsigned flags, uint64_t *res)
-{
-    return write((uint32_t)data, addr, flags, res);
+        //Move the pointer we're writing from to the correct location.
+        data += size;
+        //Adjust the size to get the remaining bytes.
+        size = addr + fullSize - secondAddr;
+        //And access the right address.
+        addr = secondAddr;
+    }
 }
 
 
@@ -508,48 +434,72 @@ AtomicSimpleCPU::tick()
 {
     DPRINTF(SimpleCPU, "Tick\n");
 
-    Tick latency = cycles(1); // instruction takes one cycle by default
+    Tick latency = 0;
 
-    for (int i = 0; i < width; ++i) {
+    for (int i = 0; i < width || locked; ++i) {
         numCycles++;
 
         if (!curStaticInst || !curStaticInst->isDelayedCommit())
             checkForInterrupts();
 
-        Fault fault = setupFetchRequest(&ifetch_req);
+        checkPcEventQueue();
+        // We must have just got suspended by a PC event
+        if (_status == Idle)
+            return;
+
+        Fault fault = NoFault;
+
+        TheISA::PCState pcState = thread->pcState();
+
+        bool needToFetch = !isRomMicroPC(pcState.microPC()) &&
+                           !curMacroStaticInst;
+        if (needToFetch) {
+            setupFetchRequest(&ifetch_req);
+            fault = thread->itb->translateAtomic(&ifetch_req, tc,
+                                                 BaseTLB::Execute);
+        }
 
         if (fault == NoFault) {
             Tick icache_latency = 0;
             bool icache_access = false;
             dcache_access = false; // assume no dcache access
 
-            //Fetch more instruction memory if necessary
-            //if(predecoder.needMoreBytes())
-            //{
-                icache_access = true;
-                Packet ifetch_pkt = Packet(&ifetch_req, MemCmd::ReadReq,
-                                           Packet::Broadcast);
-                ifetch_pkt.dataStatic(&inst);
-
-                if (hasPhysMemPort && ifetch_pkt.getAddr() == physMemAddr)
-                    icache_latency = physmemPort.sendAtomic(&ifetch_pkt);
-                else
-                    icache_latency = icachePort.sendAtomic(&ifetch_pkt);
-
-
-                // ifetch_req is initialized to read the instruction directly
-                // into the CPU object's inst field.
-            //}
+            if (needToFetch) {
+                // This is commented out because the decoder would act like
+                // a tiny cache otherwise. It wouldn't be flushed when needed
+                // like the I cache. It should be flushed, and when that works
+                // this code should be uncommented.
+                //Fetch more instruction memory if necessary
+                //if(decoder.needMoreBytes())
+                //{
+                    icache_access = true;
+                    Packet ifetch_pkt = Packet(&ifetch_req, MemCmd::ReadReq);
+                    ifetch_pkt.dataStatic(&inst);
+
+                    if (fastmem && system->isMemAddr(ifetch_pkt.getAddr()))
+                        system->getPhysMem().access(&ifetch_pkt);
+                    else
+                        icache_latency = icachePort.sendAtomic(&ifetch_pkt);
+
+                    assert(!ifetch_pkt.isError());
+
+                    // ifetch_req is initialized to read the instruction directly
+                    // into the CPU object's inst field.
+                //}
+            }
 
             preExecute();
 
-            if(curStaticInst)
-            {
+            if (curStaticInst) {
                 fault = curStaticInst->execute(this, traceData);
 
                 // keep an instruction count
                 if (fault == NoFault)
                     countInst();
+                else if (traceData && !DTRACE(ExecFaulting)) {
+                    delete traceData;
+                    traceData = NULL;
+                }
 
                 postExecute();
             }
@@ -559,16 +509,21 @@ AtomicSimpleCPU::tick()
                         curStaticInst->isFirstMicroop()))
                 instCnt++;
 
-            if (simulate_stalls) {
-                Tick icache_stall =
-                    icache_access ? icache_latency - cycles(1) : 0;
-                Tick dcache_stall =
-                    dcache_access ? dcache_latency - cycles(1) : 0;
-                Tick stall_cycles = (icache_stall + dcache_stall) / cycles(1);
-                if (cycles(stall_cycles) < (icache_stall + dcache_stall))
-                    latency += cycles(stall_cycles+1);
-                else
-                    latency += cycles(stall_cycles);
+            Tick stall_ticks = 0;
+            if (simulate_inst_stalls && icache_access)
+                stall_ticks += icache_latency;
+
+            if (simulate_data_stalls && dcache_access)
+                stall_ticks += dcache_latency;
+
+            if (stall_ticks) {
+                Tick stall_cycles = stall_ticks / ticks(1);
+                Tick aligned_stall_ticks = ticks(stall_cycles);
+
+                if (aligned_stall_ticks < stall_ticks)
+                    aligned_stall_ticks += 1;
+
+                latency += aligned_stall_ticks;
             }
 
         }
@@ -576,8 +531,19 @@ AtomicSimpleCPU::tick()
             advancePC(fault);
     }
 
+    // instruction takes at least one cycle
+    if (latency < ticks(1))
+        latency = ticks(1);
+
     if (_status != Idle)
-        tickEvent.schedule(curTick + latency);
+        schedule(tickEvent, curTick() + latency);
+}
+
+
+void
+AtomicSimpleCPU::printAddr(Addr a)
+{
+    dcachePort.printAddr(a);
 }
 
 
@@ -588,38 +554,8 @@ AtomicSimpleCPU::tick()
 AtomicSimpleCPU *
 AtomicSimpleCPUParams::create()
 {
-    AtomicSimpleCPU::Params *params = new AtomicSimpleCPU::Params();
-    params->name = name;
-    params->numberOfThreads = 1;
-    params->max_insts_any_thread = max_insts_any_thread;
-    params->max_insts_all_threads = max_insts_all_threads;
-    params->max_loads_any_thread = max_loads_any_thread;
-    params->max_loads_all_threads = max_loads_all_threads;
-    params->progress_interval = progress_interval;
-    params->deferRegistration = defer_registration;
-    params->phase = phase;
-    params->clock = clock;
-    params->functionTrace = function_trace;
-    params->functionTraceStart = function_trace_start;
-    params->width = width;
-    params->simulate_stalls = simulate_stalls;
-    params->system = system;
-    params->cpu_id = cpu_id;
-    params->tracer = tracer;
-
-    params->itb = itb;
-    params->dtb = dtb;
-#if FULL_SYSTEM
-    params->profile = profile;
-    params->do_quiesce = do_quiesce;
-    params->do_checkpoint_insts = do_checkpoint_insts;
-    params->do_statistics_insts = do_statistics_insts;
-#else
-    if (workload.size() != 1)
+    numThreads = 1;
+    if (!FullSystem && workload.size() != 1)
         panic("only one workload allowed");
-    params->process = workload[0];
-#endif
-
-    AtomicSimpleCPU *cpu = new AtomicSimpleCPU(params);
-    return cpu;
+    return new AtomicSimpleCPU(this);
 }