Revert power patch sets with unexpected interactions
diff --git a/src/cpu/simple/atomic.cc b/src/cpu/simple/atomic.cc
index c092b5b1fde8fe1fe3da7065eebbe8ac1b13facf..f3e14d40199044bebc3e2917d3182d0e8ba98afc 100644
@@ -1,4 +1,17 @@
 /*
+ * Copyright 2014 Google, Inc.
+ * Copyright (c) 2012-2013,2015 ARM Limited
+ * All rights reserved.
+ *
+ * The license below extends only to copyright in the software and shall
+ * not be construed as granting a license to any other intellectual
+ * property including but not limited to intellectual property relating
+ * to a hardware implementation of the functionality of the software
+ * licensed hereunder.  You may use the software subject to the license
+ * terms below provided that you ensure that this notice is replicated
+ * unmodified and in its entirety in all distributions of the software,
+ * modified or unmodified, in source code or in binary form.
+ *
  * Copyright (c) 2002-2005 The Regents of The University of Michigan
  * All rights reserved.
  *
  */
 
 #include "arch/locked_mem.hh"
-#include "arch/mmaped_ipr.hh"
+#include "arch/mmapped_ipr.hh"
 #include "arch/utility.hh"
 #include "base/bigint.hh"
+#include "base/output.hh"
 #include "config/the_isa.hh"
-#include "cpu/exetrace.hh"
 #include "cpu/simple/atomic.hh"
+#include "cpu/exetrace.hh"
+#include "debug/Drain.hh"
+#include "debug/ExecFaulting.hh"
+#include "debug/SimpleCPU.hh"
 #include "mem/packet.hh"
 #include "mem/packet_access.hh"
+#include "mem/physical.hh"
 #include "params/AtomicSimpleCPU.hh"
+#include "sim/faults.hh"
 #include "sim/system.hh"
+#include "sim/full_system.hh"
 
 using namespace std;
 using namespace TheISA;
@@ -61,281 +81,298 @@ AtomicSimpleCPU::TickEvent::description() const
     return "AtomicSimpleCPU tick";
 }
 
-Port *
-AtomicSimpleCPU::getPort(const string &if_name, int idx)
-{
-    if (if_name == "dcache_port")
-        return &dcachePort;
-    else if (if_name == "icache_port")
-        return &icachePort;
-    else if (if_name == "physmem_port") {
-        hasPhysMemPort = true;
-        return &physmemPort;
-    }
-    else
-        panic("No Such Port\n");
-}
-
 void
 AtomicSimpleCPU::init()
 {
-    BaseCPU::init();
-#if FULL_SYSTEM
-    ThreadID size = threadContexts.size();
-    for (ThreadID i = 0; i < size; ++i) {
-        ThreadContext *tc = threadContexts[i];
-
-        // initialize CPU, including PC
-        TheISA::initCPU(tc, tc->contextId());
-    }
-#endif
-    if (hasPhysMemPort) {
-        bool snoop = false;
-        AddrRangeList pmAddrList;
-        physmemPort.getPeerAddressRanges(pmAddrList, snoop);
-        physMemAddr = *pmAddrList.begin();
-    }
-    // Atomic doesn't do MT right now, so contextId == threadId
-    ifetch_req.setThreadContext(_cpuId, 0); // Add thread ID if we add MT
-    data_read_req.setThreadContext(_cpuId, 0); // Add thread ID here too
-    data_write_req.setThreadContext(_cpuId, 0); // Add thread ID here too
+    BaseSimpleCPU::init();
+
+    int cid = threadContexts[0]->contextId();
+    ifetch_req.setThreadContext(cid, 0);
+    data_read_req.setThreadContext(cid, 0);
+    data_write_req.setThreadContext(cid, 0);
 }
 
-bool
-AtomicSimpleCPU::CpuPort::recvTiming(PacketPtr pkt)
+AtomicSimpleCPU::AtomicSimpleCPU(AtomicSimpleCPUParams *p)
+    : BaseSimpleCPU(p), tickEvent(this), width(p->width), locked(false),
+      simulate_data_stalls(p->simulate_data_stalls),
+      simulate_inst_stalls(p->simulate_inst_stalls),
+      icachePort(name() + ".icache_port", this),
+      dcachePort(name() + ".dcache_port", this),
+      fastmem(p->fastmem), dcache_access(false), dcache_latency(0),
+      ppCommit(nullptr)
 {
-    panic("AtomicSimpleCPU doesn't expect recvTiming callback!");
-    return true;
+    _status = Idle;
 }
 
-Tick
-AtomicSimpleCPU::CpuPort::recvAtomic(PacketPtr pkt)
+
+AtomicSimpleCPU::~AtomicSimpleCPU()
 {
-    //Snooping a coherence request, just return
-    return 0;
+    if (tickEvent.scheduled()) {
+        deschedule(tickEvent);
+    }
 }
 
-void
-AtomicSimpleCPU::CpuPort::recvFunctional(PacketPtr pkt)
+DrainState
+AtomicSimpleCPU::drain()
 {
-    //No internal storage to update, just return
-    return;
+    if (switchedOut())
+        return DrainState::Drained;
+
+    if (!isDrained()) {
+        DPRINTF(Drain, "Requesting drain.\n");
+        return DrainState::Draining;
+    } else {
+        if (tickEvent.scheduled())
+            deschedule(tickEvent);
+
+        activeThreads.clear();
+        DPRINTF(Drain, "Not executing microcode, no need to drain.\n");
+        return DrainState::Drained;
+    }
 }
 
 void
-AtomicSimpleCPU::CpuPort::recvStatusChange(Status status)
+AtomicSimpleCPU::threadSnoop(PacketPtr pkt, ThreadID sender)
 {
-    if (status == RangeChange) {
-        if (!snoopRangeSent) {
-            snoopRangeSent = true;
-            sendStatusChange(Port::RangeChange);
+    DPRINTF(SimpleCPU, "received snoop pkt for addr:%#x %s\n", pkt->getAddr(),
+            pkt->cmdString());
+
+    for (ThreadID tid = 0; tid < numThreads; tid++) {
+        if (tid != sender) {
+            if (getCpuAddrMonitor(tid)->doMonitor(pkt)) {
+                wakeup(tid);
+            }
+
+            TheISA::handleLockedSnoop(threadInfo[tid]->thread,
+                                      pkt, dcachePort.cacheBlockMask);
         }
-        return;
     }
-
-    panic("AtomicSimpleCPU doesn't expect recvStatusChange callback!");
 }
 
 void
-AtomicSimpleCPU::CpuPort::recvRetry()
+AtomicSimpleCPU::drainResume()
 {
-    panic("AtomicSimpleCPU doesn't expect recvRetry callback!");
-}
+    assert(!tickEvent.scheduled());
+    if (switchedOut())
+        return;
 
-void
-AtomicSimpleCPU::DcachePort::setPeer(Port *port)
-{
-    Port::setPeer(port);
+    DPRINTF(SimpleCPU, "Resume\n");
+    verifyMemoryMode();
+
+    assert(!threadContexts.empty());
 
-#if FULL_SYSTEM
-    // Update the ThreadContext's memory ports (Functional/Virtual
-    // Ports)
-    cpu->tcBase()->connectMemPorts(cpu->tcBase());
-#endif
+    _status = BaseSimpleCPU::Idle;
+
+    for (ThreadID tid = 0; tid < numThreads; tid++) {
+        if (threadInfo[tid]->thread->status() == ThreadContext::Active) {
+            threadInfo[tid]->notIdleFraction = 1;
+            activeThreads.push_back(tid);
+            _status = BaseSimpleCPU::Running;
+
+            // Tick if any threads active
+            if (!tickEvent.scheduled()) {
+                schedule(tickEvent, nextCycle());
+            }
+        } else {
+            threadInfo[tid]->notIdleFraction = 0;
+        }
+    }
 }
 
-AtomicSimpleCPU::AtomicSimpleCPU(AtomicSimpleCPUParams *p)
-    : BaseSimpleCPU(p), tickEvent(this), width(p->width), locked(false),
-      simulate_data_stalls(p->simulate_data_stalls),
-      simulate_inst_stalls(p->simulate_inst_stalls),
-      icachePort(name() + "-iport", this), dcachePort(name() + "-iport", this),
-      physmemPort(name() + "-iport", this), hasPhysMemPort(false)
+bool
+AtomicSimpleCPU::tryCompleteDrain()
 {
-    _status = Idle;
+    if (drainState() != DrainState::Draining)
+        return false;
 
-    icachePort.snoopRangeSent = false;
-    dcachePort.snoopRangeSent = false;
+    DPRINTF(Drain, "tryCompleteDrain.\n");
+    if (!isDrained())
+        return false;
 
+    DPRINTF(Drain, "CPU done draining, processing drain event\n");
+    signalDrainDone();
+
+    return true;
 }
 
 
-AtomicSimpleCPU::~AtomicSimpleCPU()
+void
+AtomicSimpleCPU::switchOut()
 {
+    BaseSimpleCPU::switchOut();
+
+    assert(!tickEvent.scheduled());
+    assert(_status == BaseSimpleCPU::Running || _status == Idle);
+    assert(isDrained());
 }
 
+
 void
-AtomicSimpleCPU::serialize(ostream &os)
+AtomicSimpleCPU::takeOverFrom(BaseCPU *oldCPU)
 {
-    SimObject::State so_state = SimObject::getState();
-    SERIALIZE_ENUM(so_state);
-    SERIALIZE_SCALAR(locked);
-    BaseSimpleCPU::serialize(os);
-    nameOut(os, csprintf("%s.tickEvent", name()));
-    tickEvent.serialize(os);
+    BaseSimpleCPU::takeOverFrom(oldCPU);
+
+    // The tick event should have been descheduled by drain()
+    assert(!tickEvent.scheduled());
 }
 
 void
-AtomicSimpleCPU::unserialize(Checkpoint *cp, const string &section)
+AtomicSimpleCPU::verifyMemoryMode() const
 {
-    SimObject::State so_state;
-    UNSERIALIZE_ENUM(so_state);
-    UNSERIALIZE_SCALAR(locked);
-    BaseSimpleCPU::unserialize(cp, section);
-    tickEvent.unserialize(cp, csprintf("%s.tickEvent", section));
+    if (!system->isAtomicMode()) {
+        fatal("The atomic CPU requires the memory system to be in "
+              "'atomic' mode.\n");
+    }
 }
 
 void
-AtomicSimpleCPU::resume()
+AtomicSimpleCPU::activateContext(ThreadID thread_num)
 {
-    if (_status == Idle || _status == SwitchedOut)
-        return;
+    DPRINTF(SimpleCPU, "ActivateContext %d\n", thread_num);
 
-    DPRINTF(SimpleCPU, "Resume\n");
-    assert(system->getMemoryMode() == Enums::atomic);
+    assert(thread_num < numThreads);
+
+    threadInfo[thread_num]->notIdleFraction = 1;
+    Cycles delta = ticksToCycles(threadInfo[thread_num]->thread->lastActivate -
+                                 threadInfo[thread_num]->thread->lastSuspend);
+    numCycles += delta;
+    ppCycles->notify(delta);
 
-    changeState(SimObject::Running);
-    if (thread->status() == ThreadContext::Active) {
-        if (!tickEvent.scheduled())
-            schedule(tickEvent, nextCycle());
+    if (!tickEvent.scheduled()) {
+        //Make sure ticks are still on multiples of cycles
+        schedule(tickEvent, clockEdge(Cycles(0)));
+    }
+    _status = BaseSimpleCPU::Running;
+    if (std::find(activeThreads.begin(), activeThreads.end(), thread_num)
+        == activeThreads.end()) {
+        activeThreads.push_back(thread_num);
     }
 }
 
+
 void
-AtomicSimpleCPU::switchOut()
+AtomicSimpleCPU::suspendContext(ThreadID thread_num)
 {
-    assert(_status == Running || _status == Idle);
-    _status = SwitchedOut;
+    DPRINTF(SimpleCPU, "SuspendContext %d\n", thread_num);
 
-    tickEvent.squash();
-}
+    assert(thread_num < numThreads);
+    activeThreads.remove(thread_num);
 
+    if (_status == Idle)
+        return;
 
-void
-AtomicSimpleCPU::takeOverFrom(BaseCPU *oldCPU)
-{
-    BaseCPU::takeOverFrom(oldCPU, &icachePort, &dcachePort);
+    assert(_status == BaseSimpleCPU::Running);
 
-    assert(!tickEvent.scheduled());
+    threadInfo[thread_num]->notIdleFraction = 0;
 
-    // if any of this CPU's ThreadContexts are active, mark the CPU as
-    // running and schedule its tick event.
-    ThreadID size = threadContexts.size();
-    for (ThreadID i = 0; i < size; ++i) {
-        ThreadContext *tc = threadContexts[i];
-        if (tc->status() == ThreadContext::Active && _status != Running) {
-            _status = Running;
-            schedule(tickEvent, nextCycle());
-            break;
-        }
-    }
-    if (_status != Running) {
+    if (activeThreads.empty()) {
         _status = Idle;
+
+        if (tickEvent.scheduled()) {
+            deschedule(tickEvent);
+        }
     }
-    assert(threadContexts.size() == 1);
-    ifetch_req.setThreadContext(_cpuId, 0); // Add thread ID if we add MT
-    data_read_req.setThreadContext(_cpuId, 0); // Add thread ID here too
-    data_write_req.setThreadContext(_cpuId, 0); // Add thread ID here too
+
 }
 
 
-void
-AtomicSimpleCPU::activateContext(int thread_num, int delay)
+Tick
+AtomicSimpleCPU::AtomicCPUDPort::recvAtomicSnoop(PacketPtr pkt)
 {
-    DPRINTF(SimpleCPU, "ActivateContext %d (%d cycles)\n", thread_num, delay);
+    DPRINTF(SimpleCPU, "received snoop pkt for addr:%#x %s\n", pkt->getAddr(),
+            pkt->cmdString());
 
-    assert(thread_num == 0);
-    assert(thread);
+    // X86 ISA: Snooping an invalidation for monitor/mwait
+    AtomicSimpleCPU *cpu = (AtomicSimpleCPU *)(&owner);
 
-    assert(_status == Idle);
-    assert(!tickEvent.scheduled());
+    for (ThreadID tid = 0; tid < cpu->numThreads; tid++) {
+        if (cpu->getCpuAddrMonitor(tid)->doMonitor(pkt)) {
+            cpu->wakeup(tid);
+        }
+    }
 
-    notIdleFraction++;
-    numCycles += tickToCycles(thread->lastActivate - thread->lastSuspend);
+    // if snoop invalidates, release any associated locks
+    // When running without caches, invalidation packets will not be
+    // received; hence we must check whether the incoming packets are
+    // writes and wake up the processor accordingly.
+    if (pkt->isInvalidate() || pkt->isWrite()) {
+        DPRINTF(SimpleCPU, "received invalidation for addr:%#x\n",
+                pkt->getAddr());
+        for (auto &t_info : cpu->threadInfo) {
+            TheISA::handleLockedSnoop(t_info->thread, pkt, cacheBlockMask);
+        }
+    }
 
-    //Make sure ticks are still on multiples of cycles
-    schedule(tickEvent, nextCycle(curTick + ticks(delay)));
-    _status = Running;
+    return 0;
 }
 
-
 void
-AtomicSimpleCPU::suspendContext(int thread_num)
+AtomicSimpleCPU::AtomicCPUDPort::recvFunctionalSnoop(PacketPtr pkt)
 {
-    DPRINTF(SimpleCPU, "SuspendContext %d\n", thread_num);
-
-    assert(thread_num == 0);
-    assert(thread);
-
-    if (_status == Idle)
-        return;
-
-    assert(_status == Running);
-
-    // tick event may not be scheduled if this gets called from inside
-    // an instruction's execution, e.g. "quiesce"
-    if (tickEvent.scheduled())
-        deschedule(tickEvent);
+    DPRINTF(SimpleCPU, "received snoop pkt for addr:%#x %s\n", pkt->getAddr(),
+            pkt->cmdString());
+
+    // X86 ISA: Snooping an invalidation for monitor/mwait
+    AtomicSimpleCPU *cpu = (AtomicSimpleCPU *)(&owner);
+    for (ThreadID tid = 0; tid < cpu->numThreads; tid++) {
+        if (cpu->getCpuAddrMonitor(tid)->doMonitor(pkt)) {
+            cpu->wakeup(tid);
+        }
+    }
 
-    notIdleFraction--;
-    _status = Idle;
+    // if snoop invalidates, release any associated locks
+    if (pkt->isInvalidate()) {
+        DPRINTF(SimpleCPU, "received invalidation for addr:%#x\n",
+                pkt->getAddr());
+        for (auto &t_info : cpu->threadInfo) {
+            TheISA::handleLockedSnoop(t_info->thread, pkt, cacheBlockMask);
+        }
+    }
 }
 
-
-template <class T>
 Fault
-AtomicSimpleCPU::read(Addr addr, T &data, unsigned flags)
+AtomicSimpleCPU::readMem(Addr addr, uint8_t * data,
+                         unsigned size, unsigned flags)
 {
+    SimpleExecContext& t_info = *threadInfo[curThread];
+    SimpleThread* thread = t_info.thread;
+
     // use the CPU's statically allocated read request and packet objects
     Request *req = &data_read_req;
 
-    if (traceData) {
-        traceData->setAddr(addr);
-    }
+    if (traceData)
+        traceData->setMem(addr, size, flags);
 
-    //The block size of our peer.
-    unsigned blockSize = dcachePort.peerBlockSize();
     //The size of the data we're trying to read.
-    int dataSize = sizeof(T);
-
-    uint8_t * dataPtr = (uint8_t *)&data;
+    int fullSize = size;
 
     //The address of the second part of this access if it needs to be split
     //across a cache line boundary.
-    Addr secondAddr = roundDown(addr + dataSize - 1, blockSize);
+    Addr secondAddr = roundDown(addr + size - 1, cacheLineSize());
 
-    if(secondAddr > addr)
-        dataSize = secondAddr - addr;
+    if (secondAddr > addr)
+        size = secondAddr - addr;
 
     dcache_latency = 0;
 
-    while(1) {
-        req->setVirt(0, addr, dataSize, flags, thread->readPC());
+    req->taskId(taskId());
+    while (1) {
+        req->setVirt(0, addr, size, flags, dataMasterId(),
+                     thread->pcState().instAddr());
 
         // translate to physical address
-        Fault fault = thread->dtb->translateAtomic(req, tc, BaseTLB::Read);
+        Fault fault = thread->dtb->translateAtomic(req, thread->getTC(),
+                                                   BaseTLB::Read);
 
         // Now do the access.
         if (fault == NoFault && !req->getFlags().isSet(Request::NO_ACCESS)) {
-            Packet pkt = Packet(req,
-                    req->isLLSC() ? MemCmd::LoadLockedReq : MemCmd::ReadReq,
-                    Packet::Broadcast);
-            pkt.dataStatic(dataPtr);
+            Packet pkt(req, Packet::makeReadCmd(req));
+            pkt.dataStatic(data);
 
-            if (req->isMmapedIpr())
+            if (req->isMmappedIpr())
                 dcache_latency += TheISA::handleIprRead(thread->getTC(), &pkt);
             else {
-                if (hasPhysMemPort && pkt.getAddr() == physMemAddr)
-                    dcache_latency += physmemPort.sendAtomic(&pkt);
+                if (fastmem && system->isMemAddr(pkt.getAddr()))
+                    system->getPhysMem().access(&pkt);
                 else
                     dcache_latency += dcachePort.sendAtomic(&pkt);
             }
@@ -348,10 +385,6 @@ AtomicSimpleCPU::read(Addr addr, T &data, unsigned flags)
             }
         }
 
-        // This will need a new way to tell if it has a dcache attached.
-        if (req->isUncacheable())
-            recordEvent("Uncached Read");
-
         //If there's a fault, return it
         if (fault != NoFault) {
             if (req->isPrefetch()) {
@@ -364,14 +397,11 @@ AtomicSimpleCPU::read(Addr addr, T &data, unsigned flags)
         //If we don't need to access a second cache line, stop now.
         if (secondAddr <= addr)
         {
-            data = gtoh(data);
-            if (traceData) {
-                traceData->setData(data);
-            }
-            if (req->isLocked() && fault == NoFault) {
+            if (req->isLockedRMW() && fault == NoFault) {
                 assert(!locked);
                 locked = true;
             }
+
             return fault;
         }
 
@@ -380,97 +410,60 @@ AtomicSimpleCPU::read(Addr addr, T &data, unsigned flags)
          */
 
         //Move the pointer we're reading into to the correct location.
-        dataPtr += dataSize;
+        data += size;
         //Adjust the size to get the remaining bytes.
-        dataSize = addr + sizeof(T) - secondAddr;
+        size = addr + fullSize - secondAddr;
         //And access the right address.
         addr = secondAddr;
     }
 }
 
-#ifndef DOXYGEN_SHOULD_SKIP_THIS
-
-template
-Fault
-AtomicSimpleCPU::read(Addr addr, Twin32_t &data, unsigned flags);
-
-template
-Fault
-AtomicSimpleCPU::read(Addr addr, Twin64_t &data, unsigned flags);
-
-template
-Fault
-AtomicSimpleCPU::read(Addr addr, uint64_t &data, unsigned flags);
-
-template
-Fault
-AtomicSimpleCPU::read(Addr addr, uint32_t &data, unsigned flags);
-
-template
-Fault
-AtomicSimpleCPU::read(Addr addr, uint16_t &data, unsigned flags);
-
-template
-Fault
-AtomicSimpleCPU::read(Addr addr, uint8_t &data, unsigned flags);
-
-#endif //DOXYGEN_SHOULD_SKIP_THIS
-
-template<>
-Fault
-AtomicSimpleCPU::read(Addr addr, double &data, unsigned flags)
-{
-    return read(addr, *(uint64_t*)&data, flags);
-}
-
-template<>
 Fault
-AtomicSimpleCPU::read(Addr addr, float &data, unsigned flags)
+AtomicSimpleCPU::initiateMemRead(Addr addr, unsigned size, unsigned flags)
 {
-    return read(addr, *(uint32_t*)&data, flags);
+    panic("initiateMemRead() is for timing accesses, and should "
+          "never be called on AtomicSimpleCPU.\n");
 }
 
-
-template<>
 Fault
-AtomicSimpleCPU::read(Addr addr, int32_t &data, unsigned flags)
+AtomicSimpleCPU::writeMem(uint8_t *data, unsigned size,
+                          Addr addr, unsigned flags, uint64_t *res)
 {
-    return read(addr, (uint32_t&)data, flags);
-}
-
+    SimpleExecContext& t_info = *threadInfo[curThread];
+    SimpleThread* thread = t_info.thread;
+    static uint8_t zero_array[64] = {};
+
+    if (data == NULL) {
+        assert(size <= 64);
+        assert(flags & Request::CACHE_BLOCK_ZERO);
+        // This must be a cache block cleaning request
+        data = zero_array;
+    }
 
-template <class T>
-Fault
-AtomicSimpleCPU::write(T data, Addr addr, unsigned flags, uint64_t *res)
-{
     // use the CPU's statically allocated write request and packet objects
     Request *req = &data_write_req;
 
-    if (traceData) {
-        traceData->setAddr(addr);
-    }
+    if (traceData)
+        traceData->setMem(addr, size, flags);
 
-    //The block size of our peer.
-    unsigned blockSize = dcachePort.peerBlockSize();
     //The size of the data we're trying to write.
-    int dataSize = sizeof(T);
-
-    uint8_t * dataPtr = (uint8_t *)&data;
+    int fullSize = size;
 
     //The address of the second part of this access if it needs to be split
     //across a cache line boundary.
-    Addr secondAddr = roundDown(addr + dataSize - 1, blockSize);
+    Addr secondAddr = roundDown(addr + size - 1, cacheLineSize());
 
-    if(secondAddr > addr)
-        dataSize = secondAddr - addr;
+    if (secondAddr > addr)
+        size = secondAddr - addr;
 
     dcache_latency = 0;
 
-    while(1) {
-        req->setVirt(0, addr, dataSize, flags, thread->readPC());
+    req->taskId(taskId());
+    while (1) {
+        req->setVirt(0, addr, size, flags, dataMasterId(),
+                     thread->pcState().instAddr());
 
         // translate to physical address
-        Fault fault = thread->dtb->translateAtomic(req, tc, BaseTLB::Write);
+        Fault fault = thread->dtb->translateAtomic(req, thread->getTC(),
+                                                   BaseTLB::Write);
 
         // Now do the access.
         if (fault == NoFault) {
@@ -479,7 +472,7 @@ AtomicSimpleCPU::write(T data, Addr addr, unsigned flags, uint64_t *res)
 
             if (req->isLLSC()) {
                 cmd = MemCmd::StoreCondReq;
-                do_access = TheISA::handleLockedWrite(thread, req);
+                do_access = TheISA::handleLockedWrite(thread, req,
+                        dcachePort.cacheBlockMask);
             } else if (req->isSwap()) {
                 cmd = MemCmd::SwapReq;
                 if (req->isCondSwap()) {
@@ -489,28 +482,27 @@ AtomicSimpleCPU::write(T data, Addr addr, unsigned flags, uint64_t *res)
             }
 
             if (do_access && !req->getFlags().isSet(Request::NO_ACCESS)) {
-                Packet pkt = Packet(req, cmd, Packet::Broadcast);
-                pkt.dataStatic(dataPtr);
+                Packet pkt = Packet(req, cmd);
+                pkt.dataStatic(data);
 
-                if (req->isMmapedIpr()) {
+                if (req->isMmappedIpr()) {
                     dcache_latency +=
                         TheISA::handleIprWrite(thread->getTC(), &pkt);
                 } else {
-                    //XXX This needs to be outside of the loop in order to
-                    //work properly for cache line boundary crossing
-                    //accesses in transendian simulations.
-                    data = htog(data);
-                    if (hasPhysMemPort && pkt.getAddr() == physMemAddr)
-                        dcache_latency += physmemPort.sendAtomic(&pkt);
+                    if (fastmem && system->isMemAddr(pkt.getAddr()))
+                        system->getPhysMem().access(&pkt);
                     else
                         dcache_latency += dcachePort.sendAtomic(&pkt);
+
+                    // Notify other threads on this CPU of write
+                    threadSnoop(&pkt, curThread);
                 }
                 dcache_access = true;
                 assert(!pkt.isError());
 
                 if (req->isSwap()) {
                     assert(res);
-                    *res = pkt.get<T>();
+                    memcpy(res, pkt.getConstPtr<uint8_t>(), fullSize);
                 }
             }
 
@@ -519,24 +511,16 @@ AtomicSimpleCPU::write(T data, Addr addr, unsigned flags, uint64_t *res)
             }
         }
 
-        // This will need a new way to tell if it's hooked up to a cache or not.
-        if (req->isUncacheable())
-            recordEvent("Uncached Write");
-
         //If there's a fault or we don't need to access a second cache line,
         //stop now.
         if (fault != NoFault || secondAddr <= addr)
         {
-            // If the write needs to have a fault on the access, consider
-            // calling changeStatus() and changing it to "bad addr write"
-            // or something.
-            if (traceData) {
-                traceData->setData(gtoh(data));
-            }
-            if (req->isLocked() && fault == NoFault) {
+            if (req->isLockedRMW() && fault == NoFault) {
                 assert(locked);
                 locked = false;
             }
+
             if (fault != NoFault && req->isPrefetch()) {
                 return NoFault;
             } else {
@@ -549,93 +533,62 @@ AtomicSimpleCPU::write(T data, Addr addr, unsigned flags, uint64_t *res)
          */
 
         //Move the pointer we're reading into to the correct location.
-        dataPtr += dataSize;
+        data += size;
         //Adjust the size to get the remaining bytes.
-        dataSize = addr + sizeof(T) - secondAddr;
+        size = addr + fullSize - secondAddr;
         //And access the right address.
         addr = secondAddr;
     }
 }
 
 
-#ifndef DOXYGEN_SHOULD_SKIP_THIS
-
-template
-Fault
-AtomicSimpleCPU::write(Twin32_t data, Addr addr,
-                       unsigned flags, uint64_t *res);
-
-template
-Fault
-AtomicSimpleCPU::write(Twin64_t data, Addr addr,
-                       unsigned flags, uint64_t *res);
-
-template
-Fault
-AtomicSimpleCPU::write(uint64_t data, Addr addr,
-                       unsigned flags, uint64_t *res);
-
-template
-Fault
-AtomicSimpleCPU::write(uint32_t data, Addr addr,
-                       unsigned flags, uint64_t *res);
-
-template
-Fault
-AtomicSimpleCPU::write(uint16_t data, Addr addr,
-                       unsigned flags, uint64_t *res);
-
-template
-Fault
-AtomicSimpleCPU::write(uint8_t data, Addr addr,
-                       unsigned flags, uint64_t *res);
-
-#endif //DOXYGEN_SHOULD_SKIP_THIS
-
-template<>
-Fault
-AtomicSimpleCPU::write(double data, Addr addr, unsigned flags, uint64_t *res)
-{
-    return write(*(uint64_t*)&data, addr, flags, res);
-}
-
-template<>
-Fault
-AtomicSimpleCPU::write(float data, Addr addr, unsigned flags, uint64_t *res)
+void
+AtomicSimpleCPU::tick()
 {
-    return write(*(uint32_t*)&data, addr, flags, res);
-}
+    DPRINTF(SimpleCPU, "Tick\n");
 
+    // Change thread if multi-threaded
+    swapActiveThread();
 
-template<>
-Fault
-AtomicSimpleCPU::write(int32_t data, Addr addr, unsigned flags, uint64_t *res)
-{
-    return write((uint32_t)data, addr, flags, res);
-}
+    // Set memory request ids to current thread
+    if (numThreads > 1) {
+        ContextID cid = threadContexts[curThread]->contextId();
 
+        ifetch_req.setThreadContext(cid, curThread);
+        data_read_req.setThreadContext(cid, curThread);
+        data_write_req.setThreadContext(cid, curThread);
+    }
 
-void
-AtomicSimpleCPU::tick()
-{
-    DPRINTF(SimpleCPU, "Tick\n");
+    SimpleExecContext& t_info = *threadInfo[curThread];
+    SimpleThread* thread = t_info.thread;
 
     Tick latency = 0;
 
     for (int i = 0; i < width || locked; ++i) {
         numCycles++;
+        ppCycles->notify(1);
 
-        if (!curStaticInst || !curStaticInst->isDelayedCommit())
+        if (!curStaticInst || !curStaticInst->isDelayedCommit()) {
             checkForInterrupts();
+            checkPcEventQueue();
+        }
 
-        checkPcEventQueue();
+        // We must have just got suspended by a PC event
+        if (_status == Idle) {
+            tryCompleteDrain();
+            return;
+        }
 
         Fault fault = NoFault;
 
-        bool fromRom = isRomMicroPC(thread->readMicroPC());
-        if (!fromRom && !curMacroStaticInst) {
+        TheISA::PCState pcState = thread->pcState();
+
+        bool needToFetch = !isRomMicroPC(pcState.microPC()) &&
+                           !curMacroStaticInst;
+        if (needToFetch) {
+            ifetch_req.taskId(taskId());
             setupFetchRequest(&ifetch_req);
-            fault = thread->itb->translateAtomic(&ifetch_req, tc,
+            fault = thread->itb->translateAtomic(&ifetch_req, thread->getTC(),
                                                  BaseTLB::Execute);
         }
 
@@ -644,21 +597,20 @@ AtomicSimpleCPU::tick()
             bool icache_access = false;
             dcache_access = false; // assume no dcache access
 
-            if (!fromRom && !curMacroStaticInst) {
-                // This is commented out because the predecoder would act like
+            if (needToFetch) {
+                // This is commented out because the decoder would act like
                 // a tiny cache otherwise. It wouldn't be flushed when needed
                 // like the I cache. It should be flushed, and when that works
                 // this code should be uncommented.
                 //Fetch more instruction memory if necessary
-                //if(predecoder.needMoreBytes())
+                //if (decoder.needMoreBytes())
                 //{
                     icache_access = true;
-                    Packet ifetch_pkt = Packet(&ifetch_req, MemCmd::ReadReq,
-                                               Packet::Broadcast);
+                    Packet ifetch_pkt = Packet(&ifetch_req, MemCmd::ReadReq);
                     ifetch_pkt.dataStatic(&inst);
 
-                    if (hasPhysMemPort && ifetch_pkt.getAddr() == physMemAddr)
-                        icache_latency = physmemPort.sendAtomic(&ifetch_pkt);
+                    if (fastmem && system->isMemAddr(ifetch_pkt.getAddr()))
+                        system->getPhysMem().access(&ifetch_pkt);
                     else
                         icache_latency = icachePort.sendAtomic(&ifetch_pkt);
 
@@ -672,13 +624,14 @@ AtomicSimpleCPU::tick()
             preExecute();
 
             if (curStaticInst) {
-                fault = curStaticInst->execute(this, traceData);
+                fault = curStaticInst->execute(&t_info, traceData);
 
                 // keep an instruction count
-                if (fault == NoFault)
+                if (fault == NoFault) {
                     countInst();
-                else if (traceData) {
-                    // If there was a fault, we should trace this instruction.
+                    ppCommit->notify(std::make_pair(thread, curStaticInst));
+                }
+                else if (traceData && !DTRACE(ExecFaulting)) {
                     delete traceData;
                     traceData = NULL;
                 }
@@ -699,28 +652,37 @@ AtomicSimpleCPU::tick()
                 stall_ticks += dcache_latency;
 
             if (stall_ticks) {
-                Tick stall_cycles = stall_ticks / ticks(1);
-                Tick aligned_stall_ticks = ticks(stall_cycles);
-
-                if (aligned_stall_ticks < stall_ticks)
-                    aligned_stall_ticks += 1;
-
-                latency += aligned_stall_ticks;
+                // the atomic cpu does its accounting in ticks, so
+                // keep counting in ticks but round to the clock
+                // period
+                latency += divCeil(stall_ticks, clockPeriod()) *
+                    clockPeriod();
             }
 
         }
-        if(fault != NoFault || !stayAtPC)
+        if (fault != NoFault || !t_info.stayAtPC)
             advancePC(fault);
     }
 
+    if (tryCompleteDrain())
+        return;
+
     // instruction takes at least one cycle
-    if (latency < ticks(1))
-        latency = ticks(1);
+    if (latency < clockPeriod())
+        latency = clockPeriod();
 
     if (_status != Idle)
-        schedule(tickEvent, curTick + latency);
+        reschedule(tickEvent, curTick() + latency, true);
 }
 
+void
+AtomicSimpleCPU::regProbePoints()
+{
+    BaseCPU::regProbePoints();
+
+    ppCommit = new ProbePointArg<pair<SimpleThread*, const StaticInstPtr>>
+                                (getProbeManager(), "Commit");
+}
 
 void
 AtomicSimpleCPU::printAddr(Addr a)
@@ -728,7 +690,6 @@ AtomicSimpleCPU::printAddr(Addr a)
     dcachePort.printAddr(a);
 }
 
-
 ////////////////////////////////////////////////////////////////////////
 //
 //  AtomicSimpleCPU Simulation Object
@@ -736,10 +697,5 @@ AtomicSimpleCPU::printAddr(Addr a)
 AtomicSimpleCPU *
 AtomicSimpleCPUParams::create()
 {
-    numThreads = 1;
-#if !FULL_SYSTEM
-    if (workload.size() != 1)
-        panic("only one workload allowed");
-#endif
     return new AtomicSimpleCPU(this);
 }