From d658b6e1cc22de852fef611e28f448257acc298a Mon Sep 17 00:00:00 2001
From: Tony Gutierrez
Date: Tue, 19 Jan 2016 13:57:50 -0500
Subject: [PATCH] mem: support for gpu-style RMWs in ruby

This patch adds support for GPU-style read-modify-write (RMW) operations
in ruby. Such atomic operations are traditionally executed at the memory
controller (instead of through an L1 cache using cache-line locking).
Currently, this patch works by propagating operation functors through the
memory system.
---
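Note (illustration only, not part of the patch): the sketch below shows how
the new functor types are meant to be used. It re-declares AtomicOpFunctor
and TypedAtomicOpFunctor locally (after this patch they live in
src/base/types.hh) and applies a functor to a plain buffer, the same way
AbstractMemory::access() applies one to hostAddr. The AtomicAdd type and the
standalone main() are hypothetical and exist only for this example; in the
real system the functor is handed to the new Request constructor that takes
an AtomicOpFunctor *, and the Request owns it (it is deleted in ~Request()).

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Local copies of the interfaces this patch adds to src/base/types.hh.
    struct AtomicOpFunctor
    {
        virtual void operator()(uint8_t *p) = 0;
        virtual ~AtomicOpFunctor() {}
    };

    template <typename T>
    struct TypedAtomicOpFunctor : public AtomicOpFunctor
    {
        void operator()(uint8_t *p) { execute((T *)p); }
        virtual void execute(T *p) = 0;
    };

    // Hypothetical GPU-style atomic add: adds a constant to a 32-bit word.
    struct AtomicAdd : public TypedAtomicOpFunctor<uint32_t>
    {
        uint32_t a;
        AtomicAdd(uint32_t _a) : a(_a) {}
        void execute(uint32_t *p) { *p += a; }
    };

    int main()
    {
        // Stand-in for the backing-store location that hostAddr points to
        // in AbstractMemory::access().
        uint32_t mem_word = 40;
        uint8_t *host_addr = reinterpret_cast<uint8_t *>(&mem_word);

        // Memory-side step: for a returning atomic (ATOMIC_RETURN_OP) the
        // old value is first copied back, then the functor is applied in
        // place.
        uint32_t old_val;
        std::memcpy(&old_val, host_addr, sizeof(old_val));

        AtomicOpFunctor *op = new AtomicAdd(2);
        (*op)(host_addr);   // mem_word is now 42
        delete op;          // in gem5, ~Request() performs this delete

        assert(old_val == 40 && mem_word == 42);
        return 0;
    }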
null=False)"); + + bool overwrite_mem = true; + // keep a copy of our possible write value, and copy what is at the + // memory address into the packet + std::memcpy(&overwrite_val[0], pkt->getConstPtr(), + pkt->getSize()); + std::memcpy(pkt->getPtr(), hostAddr, pkt->getSize()); + + if (pkt->req->isCondSwap()) { + if (pkt->getSize() == sizeof(uint64_t)) { + condition_val64 = pkt->req->getExtraData(); + overwrite_mem = !std::memcmp(&condition_val64, hostAddr, + sizeof(uint64_t)); + } else if (pkt->getSize() == sizeof(uint32_t)) { + condition_val32 = (uint32_t)pkt->req->getExtraData(); + overwrite_mem = !std::memcmp(&condition_val32, hostAddr, + sizeof(uint32_t)); + } else + panic("Invalid size for conditional read/write\n"); + } - if (overwrite_mem) - std::memcpy(hostAddr, &overwrite_val[0], pkt->getSize()); + if (overwrite_mem) + std::memcpy(hostAddr, &overwrite_val[0], pkt->getSize()); - assert(!pkt->req->isInstFetch()); - TRACE_PACKET("Read/Write"); - numOther[pkt->req->masterId()]++; + assert(!pkt->req->isInstFetch()); + TRACE_PACKET("Read/Write"); + numOther[pkt->req->masterId()]++; + } } else if (pkt->isRead()) { assert(!pkt->isWrite()); if (pkt->isLLSC()) { diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 0e7135d73..19c7e6397 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -660,6 +660,12 @@ class Packet : public Printable return _isSecure; } + /** + * Accessor function to atomic op. + */ + AtomicOpFunctor *getAtomicOp() const { return req->getAtomicOpFunctor(); } + bool isAtomicOp() const { return req->isAtomic(); } + /** * It has been determined that the SC packet should successfully update * memory. Therefore, convert this SC packet to a normal write. diff --git a/src/mem/protocol/RubySlicc_Exports.sm b/src/mem/protocol/RubySlicc_Exports.sm index 6fedfeb2d..882102923 100644 --- a/src/mem/protocol/RubySlicc_Exports.sm +++ b/src/mem/protocol/RubySlicc_Exports.sm @@ -56,6 +56,7 @@ bool testAndWrite(Addr addr, DataBlock datablk, Packet *pkt); enumeration(AccessPermission, desc="...", default="AccessPermission_NotPresent") { // Valid data Read_Only, desc="block is Read Only (modulo functional writes)"; + Write_Only, desc="block is Write Only"; Read_Write, desc="block is Read/Write"; // Possibly Invalid data @@ -144,7 +145,9 @@ enumeration(TransitionResult, desc="...") { enumeration(RubyRequestType, desc="...", default="RubyRequestType_NULL") { LD, desc="Load"; ST, desc="Store"; - ATOMIC, desc="Atomic Load/Store"; + ATOMIC, desc="Atomic Load/Store -- depricated. 
use ATOMIC_RETURN or ATOMIC_NO_RETURN"; + ATOMIC_RETURN, desc="Atomic Load/Store, return data"; + ATOMIC_NO_RETURN, desc="Atomic Load/Store, do not return data"; IFETCH, desc="Instruction fetch"; IO, desc="I/O"; REPLACEMENT, desc="Replacement"; @@ -166,6 +169,8 @@ enumeration(SequencerRequestType, desc="...", default="SequencerRequestType_NULL Default, desc="Replace this with access_types passed to the DMA Ruby object"; LD, desc="Load"; ST, desc="Store"; + ATOMIC, desc="Atomic Load/Store"; + REPLACEMENT, desc="Replacement"; FLUSH, desc="Flush request type"; NULL, desc="Invalid request type"; } diff --git a/src/mem/protocol/RubySlicc_Types.sm b/src/mem/protocol/RubySlicc_Types.sm index c7479089b..95fa1db17 100644 --- a/src/mem/protocol/RubySlicc_Types.sm +++ b/src/mem/protocol/RubySlicc_Types.sm @@ -126,6 +126,7 @@ structure(RubyRequest, desc="...", interface="Message", external="yes") { int Size, desc="size in bytes of access"; PrefetchBit Prefetch, desc="Is this a prefetch request"; int contextId, desc="this goes away but must be replace with Nilay"; + int wfid, desc="Writethrough wavefront"; HSAScope scope, desc="HSA scope"; HSASegment segment, desc="HSA segment"; } diff --git a/src/mem/request.hh b/src/mem/request.hh index bb5e5d59c..0d2750a16 100644 --- a/src/mem/request.hh +++ b/src/mem/request.hh @@ -160,6 +160,11 @@ class Request /** The request should be marked with RELEASE. */ RELEASE = 0x00040000, + /** The request is an atomic that returns data. */ + ATOMIC_RETURN_OP = 0x40000000, + /** The request is an atomic that does not return data. */ + ATOMIC_NO_RETURN_OP = 0x80000000, + /** The request should be marked with KERNEL. * Used to indicate the synchronization associated with a GPU kernel * launch or completion. @@ -345,6 +350,9 @@ class Request /** Sequence number of the instruction that creates the request */ InstSeqNum _reqInstSeqNum; + /** A pointer to an atomic operation */ + AtomicOpFunctor *atomicOpFunctor; + public: /** @@ -356,7 +364,8 @@ class Request : _paddr(0), _size(0), _masterId(invldMasterId), _time(0), _taskId(ContextSwitchTaskId::Unknown), _asid(0), _vaddr(0), _extraData(0), _contextId(0), _threadId(0), _pc(0), - _reqInstSeqNum(0), translateDelta(0), accessDelta(0), depth(0) + _reqInstSeqNum(0), atomicOpFunctor(nullptr), translateDelta(0), + accessDelta(0), depth(0) {} Request(Addr paddr, unsigned size, Flags flags, MasterID mid, @@ -364,7 +373,8 @@ class Request : _paddr(0), _size(0), _masterId(invldMasterId), _time(0), _taskId(ContextSwitchTaskId::Unknown), _asid(0), _vaddr(0), _extraData(0), _contextId(0), _threadId(0), _pc(0), - _reqInstSeqNum(seq_num), translateDelta(0), accessDelta(0), depth(0) + _reqInstSeqNum(seq_num), atomicOpFunctor(nullptr), translateDelta(0), + accessDelta(0), depth(0) { setPhys(paddr, size, flags, mid, curTick()); setThreadContext(cid, tid); @@ -380,7 +390,8 @@ class Request : _paddr(0), _size(0), _masterId(invldMasterId), _time(0), _taskId(ContextSwitchTaskId::Unknown), _asid(0), _vaddr(0), _extraData(0), _contextId(0), _threadId(0), _pc(0), - _reqInstSeqNum(0), translateDelta(0), accessDelta(0), depth(0) + _reqInstSeqNum(0), atomicOpFunctor(nullptr), translateDelta(0), + accessDelta(0), depth(0) { setPhys(paddr, size, flags, mid, curTick()); } @@ -389,7 +400,8 @@ class Request : _paddr(0), _size(0), _masterId(invldMasterId), _time(0), _taskId(ContextSwitchTaskId::Unknown), _asid(0), _vaddr(0), _extraData(0), _contextId(0), _threadId(0), _pc(0), - _reqInstSeqNum(0), translateDelta(0), accessDelta(0), depth(0) + _reqInstSeqNum(0), 
+          _reqInstSeqNum(0), atomicOpFunctor(nullptr), translateDelta(0),
+          accessDelta(0), depth(0)
     {
         setPhys(paddr, size, flags, mid, time);
     }
@@ -398,12 +410,12 @@ class Request
             Addr pc)
         : _paddr(0), _size(0), _masterId(invldMasterId), _time(0),
           _taskId(ContextSwitchTaskId::Unknown), _asid(0), _vaddr(0),
-          _extraData(0), _contextId(0), _threadId(0), _pc(0),
-          _reqInstSeqNum(0), translateDelta(0), accessDelta(0), depth(0)
+          _extraData(0), _contextId(0), _threadId(0), _pc(pc),
+          _reqInstSeqNum(0), atomicOpFunctor(nullptr), translateDelta(0),
+          accessDelta(0), depth(0)
     {
         setPhys(paddr, size, flags, mid, time);
         privateFlags.set(VALID_PC);
-        _pc = pc;
     }
 
     Request(int asid, Addr vaddr, unsigned size, Flags flags, MasterID mid,
@@ -411,13 +423,27 @@ class Request
         : _paddr(0), _size(0), _masterId(invldMasterId), _time(0),
           _taskId(ContextSwitchTaskId::Unknown), _asid(0), _vaddr(0),
           _extraData(0), _contextId(0), _threadId(0), _pc(0),
-          _reqInstSeqNum(0), translateDelta(0), accessDelta(0), depth(0)
+          _reqInstSeqNum(0), atomicOpFunctor(nullptr), translateDelta(0),
+          accessDelta(0), depth(0)
+    {
+        setVirt(asid, vaddr, size, flags, mid, pc);
+        setThreadContext(cid, tid);
+    }
+
+    Request(int asid, Addr vaddr, int size, Flags flags, MasterID mid, Addr pc,
+            int cid, ThreadID tid, AtomicOpFunctor *atomic_op)
+        : atomicOpFunctor(atomic_op)
     {
         setVirt(asid, vaddr, size, flags, mid, pc);
         setThreadContext(cid, tid);
     }
 
-    ~Request() {}
+    ~Request()
+    {
+        if (hasAtomicOpFunctor()) {
+            delete atomicOpFunctor;
+        }
+    }
 
     /**
      * Set up CPU and thread numbers.
@@ -541,6 +567,22 @@ class Request
         return _time;
     }
 
+    /**
+     * Accessor for atomic-op functor.
+     */
+    bool
+    hasAtomicOpFunctor()
+    {
+        return atomicOpFunctor != NULL;
+    }
+
+    AtomicOpFunctor *
+    getAtomicOpFunctor()
+    {
+        assert(atomicOpFunctor != NULL);
+        return atomicOpFunctor;
+    }
+
     /** Accessor for flags. */
     Flags
     getFlags()
@@ -749,6 +791,15 @@ class Request
     bool isAcquire() const { return _flags.isSet(ACQUIRE); }
     bool isRelease() const { return _flags.isSet(RELEASE); }
     bool isKernel() const { return _flags.isSet(KERNEL); }
+    bool isAtomicReturn() const { return _flags.isSet(ATOMIC_RETURN_OP); }
+    bool isAtomicNoReturn() const { return _flags.isSet(ATOMIC_NO_RETURN_OP); }
+
+    bool
+    isAtomic() const
+    {
+        return _flags.isSet(ATOMIC_RETURN_OP) ||
+               _flags.isSet(ATOMIC_NO_RETURN_OP);
+    }
 
     /**
      * Accessor functions for the memory space configuration flags and used by
-- 
2.30.2