From 46d7fdf1b6672ca00dcca2bce30997b95774b906 Mon Sep 17 00:00:00 2001 From: Timothy Hayes Date: Wed, 2 Sep 2020 11:28:33 +0100 Subject: [PATCH] cpu: HTM Implementation for O3CPU JIRA: https://gem5.atlassian.net/browse/GEM5-587 Change-Id: I83787f4594963a15d856b81ad283b4f032d1c007 Signed-off-by: Giacomo Travaglini Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/30328 Reviewed-by: Jason Lowe-Power Maintainer: Jason Lowe-Power Tested-by: kokoro --- src/cpu/base_dyn_inst.hh | 45 ++++++-- src/cpu/base_dyn_inst_impl.hh | 3 + src/cpu/o3/commit.hh | 13 ++- src/cpu/o3/commit_impl.hh | 100 +++++++++++++++- src/cpu/o3/cpu.cc | 36 +++++- src/cpu/o3/cpu.hh | 1 - src/cpu/o3/iew.hh | 12 +- src/cpu/o3/iew_impl.hh | 16 ++- src/cpu/o3/lsq.hh | 70 ++++++++++++ src/cpu/o3/lsq_impl.hh | 115 ++++++++++++++++++- src/cpu/o3/lsq_unit.hh | 93 +++++++++++++++ src/cpu/o3/lsq_unit_impl.hh | 183 +++++++++++++++++++++++++++++- src/cpu/o3/mem_dep_unit_impl.hh | 13 ++- src/cpu/o3/thread_context_impl.hh | 9 +- src/cpu/o3/thread_state.hh | 5 +- 15 files changed, 683 insertions(+), 31 deletions(-) diff --git a/src/cpu/base_dyn_inst.hh b/src/cpu/base_dyn_inst.hh index 31dee6c50..b0e9ef267 100644 --- a/src/cpu/base_dyn_inst.hh +++ b/src/cpu/base_dyn_inst.hh @@ -61,6 +61,7 @@ #include "cpu/op_class.hh" #include "cpu/static_inst.hh" #include "cpu/translation.hh" +#include "debug/HtmCpu.hh" #include "mem/packet.hh" #include "mem/request.hh" #include "sim/byteswap.hh" @@ -140,6 +141,7 @@ class BaseDynInst : public ExecContext, public RefCounted IsStrictlyOrdered, ReqMade, MemOpDone, + HtmFromTransaction, MaxFlags }; @@ -240,6 +242,11 @@ class BaseDynInst : public ExecContext, public RefCounted // Need a copy of main request pointer to verify on writes. RequestPtr reqToVerify; + private: + // hardware transactional memory + uint64_t htmUid; + uint64_t htmDepth; + protected: /** Flattened register index of the destination registers of this * instruction. 
@@ -548,8 +555,8 @@ class BaseDynInst : public ExecContext, public RefCounted uint64_t getHtmTransactionUid() const override { - panic("Not yet implemented\n"); - return 0; + assert(instFlags[HtmFromTransaction]); + return this->htmUid; } uint64_t newHtmTransactionUid() const override @@ -560,14 +567,35 @@ class BaseDynInst : public ExecContext, public RefCounted bool inHtmTransactionalState() const override { - panic("Not yet implemented\n"); - return false; + return instFlags[HtmFromTransaction]; } uint64_t getHtmTransactionalDepth() const override { - panic("Not yet implemented\n"); - return 0; + if (inHtmTransactionalState()) + return this->htmDepth; + else + return 0; + } + + void setHtmTransactionalState(uint64_t htm_uid, uint64_t htm_depth) + { + instFlags.set(HtmFromTransaction); + htmUid = htm_uid; + htmDepth = htm_depth; + } + + void clearHtmTransactionalState() + { + if (inHtmTransactionalState()) { + DPRINTF(HtmCpu, + "clearing instuction's transactional state htmUid=%u\n", + getHtmTransactionUid()); + + instFlags.reset(HtmFromTransaction); + htmUid = -1; + htmDepth = 0; + } } /** Temporarily sets this instruction as a serialize before instruction. 
*/ @@ -997,8 +1025,9 @@ template Fault BaseDynInst::initiateHtmCmd(Request::Flags flags) { - panic("Not yet implemented\n"); - return NoFault; + return cpu->pushRequest( + dynamic_cast(this), + /* ld */ true, nullptr, 8, 0x0ul, flags, nullptr, nullptr); } template diff --git a/src/cpu/base_dyn_inst_impl.hh b/src/cpu/base_dyn_inst_impl.hh index 45b938d4c..bfe8ff5ba 100644 --- a/src/cpu/base_dyn_inst_impl.hh +++ b/src/cpu/base_dyn_inst_impl.hh @@ -95,6 +95,9 @@ BaseDynInst::initVars() physEffAddr = 0; readyRegs = 0; memReqFlags = 0; + // hardware transactional memory + htmUid = -1; + htmDepth = 0; status.reset(); diff --git a/src/cpu/o3/commit.hh b/src/cpu/o3/commit.hh index 69d1c86ce..85d00a9fd 100644 --- a/src/cpu/o3/commit.hh +++ b/src/cpu/o3/commit.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2012, 2014 ARM Limited + * Copyright (c) 2010-2012, 2014, 2019 ARM Limited * All rights reserved. * * The license below extends only to copyright in the software and shall @@ -205,6 +205,12 @@ class DefaultCommit /** Deschedules a thread from scheduling */ void deactivateThread(ThreadID tid); + /** Is the CPU currently processing a HTM transaction? */ + bool executingHtmTransaction(ThreadID) const; + + /* Reset HTM tracking, e.g. after an abort */ + void resetHtmStartsStops(ThreadID); + /** Ticks the commit stage, which tries to commit instructions. */ void tick(); @@ -473,6 +479,11 @@ class DefaultCommit /** Updates commit stats based on this instruction. */ void updateComInstStats(const DynInstPtr &inst); + + // HTM + int htmStarts[Impl::MaxThreads]; + int htmStops[Impl::MaxThreads]; + /** Stat for the total number of squashed instructions discarded by commit. 
*/ Stats::Scalar commitSquashedInsts; diff --git a/src/cpu/o3/commit_impl.hh b/src/cpu/o3/commit_impl.hh index de792060d..73041ba04 100644 --- a/src/cpu/o3/commit_impl.hh +++ b/src/cpu/o3/commit_impl.hh @@ -60,6 +60,7 @@ #include "debug/CommitRate.hh" #include "debug/Drain.hh" #include "debug/ExecFaulting.hh" +#include "debug/HtmCpu.hh" #include "debug/O3PipeView.hh" #include "params/DerivO3CPU.hh" #include "sim/faults.hh" @@ -121,6 +122,8 @@ DefaultCommit::DefaultCommit(O3CPU *_cpu, DerivO3CPUParams *params) committedStores[tid] = false; checkEmptyROB[tid] = false; renameMap[tid] = nullptr; + htmStarts[tid] = 0; + htmStops[tid] = 0; } interrupt = NoFault; } @@ -404,6 +407,14 @@ DefaultCommit::drainSanityCheck() const { assert(isDrained()); rob->drainSanityCheck(); + + // hardware transactional memory + // cannot drain partially through a transaction + for (ThreadID tid = 0; tid < numThreads; tid++) { + if (executingHtmTransaction(tid)) { + panic("cannot drain partially through a HTM transaction"); + } + } } template @@ -462,6 +473,27 @@ DefaultCommit::deactivateThread(ThreadID tid) } } +template +bool +DefaultCommit::executingHtmTransaction(ThreadID tid) const +{ + if (tid == InvalidThreadID) + return false; + else + return (htmStarts[tid] > htmStops[tid]); +} + +template +void +DefaultCommit::resetHtmStartsStops(ThreadID tid) +{ + if (tid != InvalidThreadID) + { + htmStarts[tid] = 0; + htmStops[tid] = 0; + } +} + template void @@ -532,6 +564,14 @@ DefaultCommit::generateTrapEvent(ThreadID tid, Fault inst_fault) Cycles latency = dynamic_pointer_cast(inst_fault) ? 
cpu->syscallRetryLatency : trapLatency; + // hardware transactional memory + if (inst_fault != nullptr && + std::dynamic_pointer_cast(inst_fault)) { + // TODO + // latency = default abort/restore latency + // could also do some kind of exponential back off if desired + } + cpu->schedule(trap, cpu->clockEdge(latency)); trapInFlight[tid] = true; thread[tid]->trapPending = true; @@ -991,12 +1031,27 @@ DefaultCommit::commitInsts() // Commit as many instructions as possible until the commit bandwidth // limit is reached, or it becomes impossible to commit any more. while (num_committed < commitWidth) { + // hardware transactional memory + // If executing within a transaction, + // need to handle interrupts specially + + ThreadID commit_thread = getCommittingThread(); + // Check for any interrupt that we've already squashed for // and start processing it. - if (interrupt != NoFault) - handleInterrupt(); + if (interrupt != NoFault) { + // If inside a transaction, postpone interrupts + if (executingHtmTransaction(commit_thread)) { + cpu->clearInterrupts(0); + toIEW->commitInfo[0].clearInterrupt = true; + interrupt = NoFault; + avoidQuiesceLiveLock = true; + } else { + handleInterrupt(); + } + } - ThreadID commit_thread = getCommittingThread(); + // ThreadID commit_thread = getCommittingThread(); if (commit_thread == -1 || !rob->isHeadReady(commit_thread)) break; @@ -1044,6 +1099,23 @@ DefaultCommit::commitInsts() statCommittedInstType[tid][head_inst->opClass()]++; ppCommit->notify(head_inst); + // hardware transactional memory + + // update nesting depth + if (head_inst->isHtmStart()) + htmStarts[tid]++; + + // sanity check + if (head_inst->inHtmTransactionalState()) { + assert(executingHtmTransaction(tid)); + } else { + assert(!executingHtmTransaction(tid)); + } + + // update nesting depth + if (head_inst->isHtmStop()) + htmStops[tid]++; + changedROBNumEntries[tid] = true; // Set the doneSeqNum to the youngest committed instruction.
@@ -1206,6 +1278,23 @@ DefaultCommit::commitHead(const DynInstPtr &head_inst, unsigned inst_num) // Check if the instruction caused a fault. If so, trap. Fault inst_fault = head_inst->getFault(); + // hardware transactional memory + // if a fault occurred within a HTM transaction + // ensure that the transaction aborts + if (inst_fault != NoFault && head_inst->inHtmTransactionalState()) { + // There exists a generic HTM fault common to all ISAs + if (!std::dynamic_pointer_cast(inst_fault)) { + DPRINTF(HtmCpu, "%s - fault (%s) encountered within transaction" + " - converting to GenericHtmFailureFault\n", + head_inst->staticInst->getName(), inst_fault->name()); + inst_fault = std::make_shared( + head_inst->getHtmTransactionUid(), + HtmFailureFaultCause::EXCEPTION); + } + // If this point is reached and the fault inherits from the HTM fault, + // then there is no need to raise a new fault + } + // Stores mark themselves as completed. if (!head_inst->isStore() && inst_fault == NoFault) { head_inst->setCompleted(); @@ -1301,6 +1390,11 @@ DefaultCommit::commitHead(const DynInstPtr &head_inst, unsigned inst_num) head_inst->renamedDestRegIdx(i)); } + // hardware transactional memory + // the HTM UID is purely for correctness and debugging purposes + if (head_inst->isHtmStart()) + iewStage->setLastRetiredHtmUid(tid, head_inst->getHtmTransactionUid()); + // Finally clear the head ROB entry. 
rob->retireHead(tid); diff --git a/src/cpu/o3/cpu.cc b/src/cpu/o3/cpu.cc index 613ffd19d..ed69b1a65 100644 --- a/src/cpu/o3/cpu.cc +++ b/src/cpu/o3/cpu.cc @@ -623,6 +623,10 @@ template void FullO3CPU::deactivateThread(ThreadID tid) { + // hardware transactional memory + // shouldn't deactivate thread in the middle of a transaction + assert(!commit.executingHtmTransaction(tid)); + //Remove From Active List, if Active list::iterator thread_it = std::find(activeThreads.begin(), activeThreads.end(), tid); @@ -1829,10 +1833,38 @@ FullO3CPU::exitThreads() template void -FullO3CPU::htmSendAbortSignal(ThreadID tid, uint64_t htmUid, +FullO3CPU::htmSendAbortSignal(ThreadID tid, uint64_t htm_uid, HtmFailureFaultCause cause) { - panic("not yet supported!"); + const Addr addr = 0x0ul; + const int size = 8; + const Request::Flags flags = + Request::PHYSICAL|Request::STRICT_ORDER|Request::HTM_ABORT; + + // O3-specific actions + this->iew.ldstQueue.resetHtmStartsStops(tid); + this->commit.resetHtmStartsStops(tid); + + // notify l1 d-cache (ruby) that core has aborted transaction + RequestPtr req = + std::make_shared(addr, size, flags, _dataMasterId); + + req->taskId(taskId()); + req->setContext(this->thread[tid]->contextId()); + req->setHtmAbortCause(cause); + + assert(req->isHTMAbort()); + + PacketPtr abort_pkt = Packet::createRead(req); + uint8_t *memData = new uint8_t[8]; + assert(memData); + abort_pkt->dataStatic(memData); + abort_pkt->setHtmTransactional(htm_uid); + + // TODO include correct error handling here + if (!this->iew.ldstQueue.getDataPort().sendTimingReq(abort_pkt)) { + panic("HTM abort signal was not sent to the memory subsystem."); + } } // Forward declaration of FullO3CPU. 
diff --git a/src/cpu/o3/cpu.hh b/src/cpu/o3/cpu.hh index 137fbc89b..04472752c 100644 --- a/src/cpu/o3/cpu.hh +++ b/src/cpu/o3/cpu.hh @@ -61,7 +61,6 @@ #include "cpu/base.hh" #include "cpu/simple_thread.hh" #include "cpu/timebuf.hh" -//#include "cpu/o3/thread_context.hh" #include "params/DerivO3CPU.hh" #include "sim/process.hh" diff --git a/src/cpu/o3/iew.hh b/src/cpu/o3/iew.hh index 7f3409ede..4dbb9efb4 100644 --- a/src/cpu/o3/iew.hh +++ b/src/cpu/o3/iew.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2012, 2014 ARM Limited + * Copyright (c) 2010-2012, 2014, 2019 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -233,6 +233,16 @@ class DefaultIEW /** Check misprediction */ void checkMisprediction(const DynInstPtr &inst); + // hardware transactional memory + // For debugging purposes, it is useful to keep track of the most recent + // htmUid that has been committed (architecturally, not transactionally) + // to ensure that the core and the memory subsystem are observing + // correct ordering constraints. + void setLastRetiredHtmUid(ThreadID tid, uint64_t htmUid) + { + ldstQueue.setLastRetiredHtmUid(tid, htmUid); + } + private: /** Sends commit proper information for a squash due to a branch * mispredict. diff --git a/src/cpu/o3/iew_impl.hh b/src/cpu/o3/iew_impl.hh index 99dfd19c3..9a04fe662 100644 --- a/src/cpu/o3/iew_impl.hh +++ b/src/cpu/o3/iew_impl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2013, 2018 ARM Limited + * Copyright (c) 2010-2013, 2018-2019 ARM Limited * Copyright (c) 2013 Advanced Micro Devices, Inc. * All rights reserved. * @@ -1051,6 +1051,20 @@ DefaultIEW::dispatchInsts(ThreadID tid) break; } + // hardware transactional memory + // CPU needs to track transactional state in program order. 
+ const int numHtmStarts = ldstQueue.numHtmStarts(tid); + const int numHtmStops = ldstQueue.numHtmStops(tid); + const int htmDepth = numHtmStarts - numHtmStops; + + if (htmDepth > 0) { + inst->setHtmTransactionalState(ldstQueue.getLatestHtmUid(tid), + htmDepth); + } else { + inst->clearHtmTransactionalState(); + } + + // Otherwise issue the instruction just fine. if (inst->isAtomic()) { DPRINTF(IEW, "[tid:%i] Issue: Memory instruction " diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index 9ef3b0ce8..35c28739e 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -687,6 +687,8 @@ class LSQ { flags.set(Flag::Complete); } + + virtual std::string name() const { return "LSQRequest"; } }; class SingleDataRequest : public LSQRequest @@ -739,6 +741,35 @@ class LSQ virtual void buildPackets(); virtual Cycles handleLocalAccess(ThreadContext *thread, PacketPtr pkt); virtual bool isCacheBlockHit(Addr blockAddr, Addr cacheBlockMask); + virtual std::string name() const { return "SingleDataRequest"; } + }; + + // hardware transactional memory + // This class extends SingleDataRequest for the sole purpose + // of encapsulating hardware transactional memory command requests + class HtmCmdRequest : public SingleDataRequest + { + protected: + /* Given that we are inside templates, children need explicit + * declaration of the names in the parent class. 
*/ + using Flag = typename LSQRequest::Flag; + using State = typename LSQRequest::State; + using LSQRequest::_addr; + using LSQRequest::_size; + using LSQRequest::_byteEnable; + using LSQRequest::_requests; + using LSQRequest::_inst; + using LSQRequest::_taskId; + using LSQRequest::flags; + using LSQRequest::setState; + public: + HtmCmdRequest(LSQUnit* port, const DynInstPtr& inst, + const Request::Flags& flags_); + inline virtual ~HtmCmdRequest() {} + virtual void initiateTranslation(); + virtual void finish(const Fault &fault, const RequestPtr &req, + ThreadContext* tc, BaseTLB::Mode mode); + virtual std::string name() const { return "HtmCmdRequest"; } }; class SplitDataRequest : public LSQRequest @@ -815,6 +846,7 @@ class LSQ virtual RequestPtr mainRequest(); virtual PacketPtr mainPacket(); + virtual std::string name() const { return "SplitDataRequest"; } }; /** Constructs an LSQ with the given parameters. */ @@ -933,6 +965,44 @@ class LSQ /** Returns the total number of stores for a single thread. */ int numStores(ThreadID tid) { return thread.at(tid).numStores(); } + + // hardware transactional memory + + int numHtmStarts(ThreadID tid) const + { + if (tid == InvalidThreadID) + return 0; + else + return thread[tid].numHtmStarts(); + } + int numHtmStops(ThreadID tid) const + { + if (tid == InvalidThreadID) + return 0; + else + return thread[tid].numHtmStops(); + } + + void resetHtmStartsStops(ThreadID tid) + { + if (tid != InvalidThreadID) + thread[tid].resetHtmStartsStops(); + } + + uint64_t getLatestHtmUid(ThreadID tid) const + { + if (tid == InvalidThreadID) + return 0; + else + return thread[tid].getLatestHtmUid(); + } + + void setLastRetiredHtmUid(ThreadID tid, uint64_t htmUid) + { + if (tid != InvalidThreadID) + thread[tid].setLastRetiredHtmUid(htmUid); + } + /** Returns the number of free load entries. 
*/ unsigned numFreeLoadEntries(); diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh index 1ca7d53ec..a535dcc89 100644 --- a/src/cpu/o3/lsq_impl.hh +++ b/src/cpu/o3/lsq_impl.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011-2012, 2014, 2017-2018 ARM Limited + * Copyright (c) 2011-2012, 2014, 2017-2019 ARM Limited * Copyright (c) 2013 Advanced Micro Devices, Inc. * All rights reserved * @@ -51,6 +51,7 @@ #include "cpu/o3/lsq.hh" #include "debug/Drain.hh" #include "debug/Fetch.hh" +#include "debug/HtmCpu.hh" #include "debug/LSQ.hh" #include "debug/Writeback.hh" #include "params/DerivO3CPU.hh" @@ -706,11 +707,17 @@ LSQ::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, // lines. For now, such cross-line update is not supported. assert(!isAtomic || (isAtomic && !needs_burst)); + const bool htm_cmd = isLoad && (flags & Request::HTM_CMD); + if (inst->translationStarted()) { req = inst->savedReq; assert(req); } else { - if (needs_burst) { + if (htm_cmd) { + assert(addr == 0x0lu); + assert(size == 8); + req = new HtmCmdRequest(&thread[tid], inst, flags); + } else if (needs_burst) { req = new SplitDataRequest(&thread[tid], inst, isLoad, addr, size, flags, data, res); } else { @@ -1033,6 +1040,23 @@ LSQ::SingleDataRequest::buildPackets() : Packet::createWrite(request())); _packets.back()->dataStatic(_inst->memData); _packets.back()->senderState = _senderState; + + // hardware transactional memory + // If request originates in a transaction (not necessarily a HtmCmd), + // then the packet should be marked as such. + if (_inst->inHtmTransactionalState()) { + _packets.back()->setHtmTransactional( + _inst->getHtmTransactionUid()); + + DPRINTF(HtmCpu, + "HTM %s pc=0x%lx - vaddr=0x%lx - paddr=0x%lx - htmUid=%u\n", + isLoad() ? "LD" : "ST", + _inst->instAddr(), + _packets.back()->req->hasVaddr() ? 
+ _packets.back()->req->getVaddr() : 0lu, + _packets.back()->getAddr(), + _inst->getHtmTransactionUid()); + } } assert(_packets.size() == 1); } @@ -1049,6 +1073,21 @@ LSQ::SplitDataRequest::buildPackets() if (isLoad()) { _mainPacket = Packet::createRead(mainReq); _mainPacket->dataStatic(_inst->memData); + + // hardware transactional memory + // If request originates in a transaction, + // packet should be marked as such + if (_inst->inHtmTransactionalState()) { + _mainPacket->setHtmTransactional( + _inst->getHtmTransactionUid()); + DPRINTF(HtmCpu, + "HTM LD.0 pc=0x%lx-vaddr=0x%lx-paddr=0x%lx-htmUid=%u\n", + _inst->instAddr(), + _mainPacket->req->hasVaddr() ? + _mainPacket->req->getVaddr() : 0lu, + _mainPacket->getAddr(), + _inst->getHtmTransactionUid()); + } } for (int i = 0; i < _requests.size() && _fault[i] == NoFault; i++) { RequestPtr r = _requests[i]; @@ -1066,6 +1105,23 @@ LSQ::SplitDataRequest::buildPackets() } pkt->senderState = _senderState; _packets.push_back(pkt); + + // hardware transactional memory + // If request originates in a transaction, + // packet should be marked as such + if (_inst->inHtmTransactionalState()) { + _packets.back()->setHtmTransactional( + _inst->getHtmTransactionUid()); + DPRINTF(HtmCpu, + "HTM %s.%d pc=0x%lx-vaddr=0x%lx-paddr=0x%lx-htmUid=%u\n", + isLoad() ? "LD" : "ST", + i+1, + _inst->instAddr(), + _packets.back()->req->hasVaddr() ? 
+ _packets.back()->req->getVaddr() : 0lu, + _packets.back()->getAddr(), + _inst->getHtmTransactionUid()); + } } } assert(_packets.size() > 0); @@ -1192,4 +1248,59 @@ LSQ::DcachePort::recvReqRetry() lsq->recvReqRetry(); } +template +LSQ::HtmCmdRequest::HtmCmdRequest(LSQUnit* port, + const DynInstPtr& inst, + const Request::Flags& flags_) : + SingleDataRequest(port, inst, true, 0x0lu, 8, flags_, + nullptr, nullptr, nullptr) +{ + assert(_requests.size() == 0); + + this->addRequest(_addr, _size, _byteEnable); + + if (_requests.size() > 0) { + _requests.back()->setReqInstSeqNum(_inst->seqNum); + _requests.back()->taskId(_taskId); + _requests.back()->setPaddr(_addr); + _requests.back()->setInstCount(_inst->getCpuPtr()->totalInsts()); + + _inst->strictlyOrdered(_requests.back()->isStrictlyOrdered()); + _inst->fault = NoFault; + _inst->physEffAddr = _requests.back()->getPaddr(); + _inst->memReqFlags = _requests.back()->getFlags(); + _inst->savedReq = this; + + setState(State::Translation); + } else { + panic("unexpected behaviour"); + } +} + +template +void +LSQ::HtmCmdRequest::initiateTranslation() +{ + // Transaction commands are implemented as loads to avoid significant + // changes to the cpu and memory interfaces + // The virtual and physical address uses a dummy value of 0x00 + // Address translation does not really occur thus the code below + + flags.set(Flag::TranslationStarted); + flags.set(Flag::TranslationFinished); + + _inst->translationStarted(true); + _inst->translationCompleted(true); + + setState(State::Request); +} + +template +void +LSQ::HtmCmdRequest::finish(const Fault &fault, const RequestPtr &req, + ThreadContext* tc, BaseTLB::Mode mode) +{ + panic("unexpected behaviour"); +} + #endif//__CPU_O3_LSQ_IMPL_HH__ diff --git a/src/cpu/o3/lsq_unit.hh b/src/cpu/o3/lsq_unit.hh index 06d43eeae..70995d61f 100644 --- a/src/cpu/o3/lsq_unit.hh +++ b/src/cpu/o3/lsq_unit.hh @@ -53,6 +53,7 @@ #include "config/the_isa.hh" #include "cpu/inst_seq.hh" #include 
"cpu/timebuf.hh" +#include "debug/HtmCpu.hh" #include "debug/LSQUnit.hh" #include "mem/packet.hh" #include "mem/port.hh" @@ -312,6 +313,21 @@ class LSQUnit /** Returns the number of stores in the SQ. */ int numStores() { return stores; } + // hardware transactional memory + int numHtmStarts() const { return htmStarts; } + int numHtmStops() const { return htmStops; } + void resetHtmStartsStops() { htmStarts = htmStops = 0; } + uint64_t getLatestHtmUid() const + { + const auto& htm_cpt = cpu->tcBase(lsqID)->getHtmCheckpointPtr(); + return htm_cpt->getHtmUid(); + } + void setLastRetiredHtmUid(uint64_t htm_uid) + { + assert(htm_uid >= lastRetiredHtmUid); + lastRetiredHtmUid = htm_uid; + } + /** Returns if either the LQ or SQ is full. */ bool isFull() { return lqFull() || sqFull(); } @@ -496,6 +512,13 @@ class LSQUnit /** The number of store instructions in the SQ waiting to writeback. */ int storesToWB; + // hardware transactional memory + // nesting depth + int htmStarts; + int htmStops; + // sanity checks and debugging + uint64_t lastRetiredHtmUid; + /** The index of the first instruction that may be ready to be * written back, and has not yet been written back. */ @@ -665,6 +688,7 @@ LSQUnit::read(LSQRequest *req, int load_idx) if (req->mainRequest()->isLocalAccess()) { assert(!load_inst->memData); + assert(!load_inst->inHtmTransactionalState()); load_inst->memData = new uint8_t[MaxDataBytes]; ThreadContext *thread = cpu->tcBase(lsqID); @@ -679,6 +703,37 @@ LSQUnit::read(LSQRequest *req, int load_idx) return NoFault; } + // hardware transactional memory + if (req->mainRequest()->isHTMStart() || req->mainRequest()->isHTMCommit()) + { + // don't want to send nested transactionStarts and + // transactionStops outside of core, e.g. to Ruby + if (req->mainRequest()->getFlags().isSet(Request::NO_ACCESS)) { + Cycles delay(0); + PacketPtr data_pkt = + new Packet(req->mainRequest(), MemCmd::ReadReq); + + // Allocate memory if this is the first time a load is issued. 
+ if (!load_inst->memData) { + load_inst->memData = + new uint8_t[req->mainRequest()->getSize()]; + // sanity checks expect zero in request's data + memset(load_inst->memData, 0, req->mainRequest()->getSize()); + } + + data_pkt->dataStatic(load_inst->memData); + if (load_inst->inHtmTransactionalState()) { + data_pkt->setHtmTransactional( + load_inst->getHtmTransactionUid()); + } + data_pkt->makeResponse(); + + WritebackEvent *wb = new WritebackEvent(load_inst, data_pkt, this); + cpu->schedule(wb, cpu->clockEdge(delay)); + return NoFault; + } + } + // Check the SQ for any previous stores that might lead to forwarding auto store_it = load_inst->sqIt; assert (store_it >= storeWBIt); @@ -771,6 +826,35 @@ LSQUnit::read(LSQRequest *req, int load_idx) MemCmd::ReadReq); data_pkt->dataStatic(load_inst->memData); + // hardware transactional memory + // Store to load forwarding within a transaction + // This should be okay because the store will be sent to + // the memory subsystem and subsequently get added to the + // write set of the transaction. The write set has a stronger + // property than the read set, so the load doesn't necessarily + // have to be there. + assert(!req->mainRequest()->isHTMCmd()); + if (load_inst->inHtmTransactionalState()) { + assert (!storeQueue[store_it._idx].completed()); + assert ( + storeQueue[store_it._idx].instruction()-> + inHtmTransactionalState()); + assert ( + load_inst->getHtmTransactionUid() == + storeQueue[store_it._idx].instruction()-> + getHtmTransactionUid()); + data_pkt->setHtmTransactional( + load_inst->getHtmTransactionUid()); + DPRINTF(HtmCpu, "HTM LD (ST2LDF) " + "pc=0x%lx - vaddr=0x%lx - " + "paddr=0x%lx - htmUid=%u\n", + load_inst->instAddr(), + data_pkt->req->hasVaddr() ? + data_pkt->req->getVaddr() : 0lu, + data_pkt->getAddr(), + load_inst->getHtmTransactionUid()); + } + if (req->isAnyOutstandingRequest()) { assert(req->_numOutstandingPackets > 0); // There are memory requests packets in flight already.
@@ -841,6 +925,15 @@ LSQUnit::read(LSQRequest *req, int load_idx) load_inst->memData = new uint8_t[req->mainRequest()->getSize()]; } + + // hardware transactional memory + if (req->mainRequest()->isHTMCmd()) { + // this is a simple sanity check + // the Ruby cache controller will set + // memData to 0x0ul if successful. + *load_inst->memData = (uint64_t) 0x1ull; + } + // For now, load throughput is constrained by the number of // load FUs only, and loads do not consume a cache port (only // stores do). diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh index c39f8943d..fcbfc9cd2 100644 --- a/src/cpu/o3/lsq_unit_impl.hh +++ b/src/cpu/o3/lsq_unit_impl.hh @@ -51,6 +51,7 @@ #include "cpu/o3/lsq.hh" #include "cpu/o3/lsq_unit.hh" #include "debug/Activity.hh" +#include "debug/HtmCpu.hh" #include "debug/IEW.hh" #include "debug/LSQUnit.hh" #include "debug/O3PipeView.hh" @@ -112,6 +113,59 @@ LSQUnit::completeDataAccess(PacketPtr pkt) LSQSenderState *state = dynamic_cast(pkt->senderState); DynInstPtr inst = state->inst; + // hardware transactional memory + // sanity check + if (pkt->isHtmTransactional() && !inst->isSquashed()) { + assert(inst->getHtmTransactionUid() == pkt->getHtmTransactionUid()); + } + + // if in a HTM transaction, it's possible + // to abort within the cache hierarchy. + // This is signalled back to the processor + // through responses to memory requests. 
+ if (pkt->htmTransactionFailedInCache()) { + // cannot do this for write requests because + // they cannot tolerate faults + const HtmCacheFailure htm_rc = + pkt->getHtmTransactionFailedInCacheRC(); + if(pkt->isWrite()) { + DPRINTF(HtmCpu, + "store notification (ignored) of HTM transaction failure " + "in cache - addr=0x%lx - rc=%s - htmUid=%d\n", + pkt->getAddr(), htmFailureToStr(htm_rc), + pkt->getHtmTransactionUid()); + } else { + HtmFailureFaultCause fail_reason = + HtmFailureFaultCause::INVALID; + + if (htm_rc == HtmCacheFailure::FAIL_SELF) { + fail_reason = HtmFailureFaultCause::SIZE; + } else if (htm_rc == HtmCacheFailure::FAIL_REMOTE) { + fail_reason = HtmFailureFaultCause::MEMORY; + } else if (htm_rc == HtmCacheFailure::FAIL_OTHER) { + // these are likely loads that were issued out of order + // they are faulted here, but it's unlikely that these will + // ever reach the commit head. + fail_reason = HtmFailureFaultCause::OTHER; + } else { + panic("HTM error - unhandled return code from cache (%s)", + htmFailureToStr(htm_rc)); + } + + inst->fault = + std::make_shared( + inst->getHtmTransactionUid(), + fail_reason); + + DPRINTF(HtmCpu, + "load notification of HTM transaction failure " + "in cache - pc=%s - addr=0x%lx - " + "rc=%u - htmUid=%d\n", + inst->pcState(), pkt->getAddr(), + htmFailureToStr(htm_rc), pkt->getHtmTransactionUid()); + } + } + cpu->ppDataAccessComplete->notify(std::make_pair(inst, pkt)); /* Notify the sender state that the access is complete (for ownership @@ -125,6 +179,13 @@ LSQUnit::completeDataAccess(PacketPtr pkt) // after receving the response from the memory assert(inst->isLoad() || inst->isStoreConditional() || inst->isAtomic()); + + // hardware transactional memory + if (pkt->htmTransactionFailedInCache()) { + state->request()->mainPacket()->setHtmTransactionFailedInCache( + pkt->getHtmTransactionFailedInCacheRC() ); + } + writeback(inst, state->request()->mainPacket()); if (inst->isStore() || inst->isAtomic()) { auto ss = 
dynamic_cast(state); @@ -142,7 +203,10 @@ LSQUnit::completeDataAccess(PacketPtr pkt) template LSQUnit::LSQUnit(uint32_t lqEntries, uint32_t sqEntries) : lsqID(-1), storeQueue(sqEntries+1), loadQueue(lqEntries+1), - loads(0), stores(0), storesToWB(0), cacheBlockMask(0), stalled(false), + loads(0), stores(0), storesToWB(0), + htmStarts(0), htmStops(0), + lastRetiredHtmUid(0), + cacheBlockMask(0), stalled(false), isStoreBlocked(false), storeInFlight(false), hasPendingRequest(false), pendingRequest(nullptr) { @@ -176,6 +240,9 @@ LSQUnit::resetState() { loads = stores = storesToWB = 0; + // hardware transactional memory + // nesting depth + htmStarts = htmStops = 0; storeWBIt = storeQueue.begin(); @@ -306,6 +373,45 @@ LSQUnit::insertLoad(const DynInstPtr &load_inst) load_inst->lqIt = loadQueue.getIterator(load_inst->lqIdx); ++loads; + + // hardware transactional memory + // transactional state and nesting depth must be tracked + // in the in-order part of the core. + if (load_inst->isHtmStart()) { + htmStarts++; + DPRINTF(HtmCpu, ">> htmStarts++ (%d) : htmStops (%d)\n", + htmStarts, htmStops); + + const int htm_depth = htmStarts - htmStops; + const auto& htm_cpt = cpu->tcBase(lsqID)->getHtmCheckpointPtr(); + auto htm_uid = htm_cpt->getHtmUid(); + + // for debugging purposes + if (!load_inst->inHtmTransactionalState()) { + htm_uid = htm_cpt->newHtmUid(); + DPRINTF(HtmCpu, "generating new htmUid=%u\n", htm_uid); + if (htm_depth != 1) { + DPRINTF(HtmCpu, + "unusual HTM transactional depth (%d)" + " possibly caused by mispeculation - htmUid=%u\n", + htm_depth, htm_uid); + } + } + load_inst->setHtmTransactionalState(htm_uid, htm_depth); + } + + if (load_inst->isHtmStop()) { + htmStops++; + DPRINTF(HtmCpu, ">> htmStarts (%d) : htmStops++ (%d)\n", + htmStarts, htmStops); + + if (htmStops==1 && htmStarts==0) { + DPRINTF(HtmCpu, + "htmStops==1 && htmStarts==0. 
" + "This generally shouldn't happen " + "(unless due to misspeculation)\n"); + } + } } template @@ -831,6 +937,7 @@ LSQUnit::writebackStores() if (req->request()->isLocalAccess()) { assert(!inst->isStoreConditional()); + assert(!inst->inHtmTransactionalState()); ThreadContext *thread = cpu->tcBase(lsqID); PacketPtr main_pkt = new Packet(req->mainRequest(), MemCmd::WriteReq); @@ -876,6 +983,21 @@ LSQUnit::squash(const InstSeqNum &squashed_num) stallingLoadIdx = 0; } + // hardware transactional memory + // Squashing instructions can alter the transaction nesting depth + // and must be corrected before fetching resumes. + if (loadQueue.back().instruction()->isHtmStart()) + { + htmStarts = (--htmStarts < 0) ? 0 : htmStarts; + DPRINTF(HtmCpu, ">> htmStarts-- (%d) : htmStops (%d)\n", + htmStarts, htmStops); + } + if (loadQueue.back().instruction()->isHtmStop()) + { + htmStops = (--htmStops < 0) ? 0 : htmStops; + DPRINTF(HtmCpu, ">> htmStarts (%d) : htmStops-- (%d)\n", + htmStarts, htmStops); + } // Clear the smart pointer to make sure it is decremented. 
loadQueue.back().instruction()->setSquashed(); loadQueue.back().clear(); @@ -886,6 +1008,40 @@ LSQUnit::squash(const InstSeqNum &squashed_num) ++lsqSquashedLoads; } + // hardware transactional memory + // scan load queue (from oldest to youngest) for most recent valid htmUid + auto scan_it = loadQueue.begin(); + uint64_t in_flight_uid = 0; + while (scan_it != loadQueue.end()) { + if (scan_it->instruction()->isHtmStart() && + !scan_it->instruction()->isSquashed()) { + in_flight_uid = scan_it->instruction()->getHtmTransactionUid(); + DPRINTF(HtmCpu, "loadQueue[%d]: found valid HtmStart htmUid=%u\n", + scan_it._idx, in_flight_uid); + } + scan_it++; + } + // If there's a HtmStart in the pipeline then use its htmUid, + // otherwise use the most recently committed uid + const auto& htm_cpt = cpu->tcBase(lsqID)->getHtmCheckpointPtr(); + if (htm_cpt) { + const uint64_t old_local_htm_uid = htm_cpt->getHtmUid(); + uint64_t new_local_htm_uid; + if (in_flight_uid > 0) + new_local_htm_uid = in_flight_uid; + else + new_local_htm_uid = lastRetiredHtmUid; + + if (old_local_htm_uid != new_local_htm_uid) { + DPRINTF(HtmCpu, "flush: lastRetiredHtmUid=%u\n", + lastRetiredHtmUid); + DPRINTF(HtmCpu, "flush: resetting localHtmUid=%u\n", + new_local_htm_uid); + + htm_cpt->setHtmUid(new_local_htm_uid); + } + } + if (memDepViolator && squashed_num < memDepViolator->seqNum) { memDepViolator = NULL; } @@ -965,7 +1121,7 @@ LSQUnit::writeback(const DynInstPtr &inst, PacketPtr pkt) // Squashed instructions do not need to complete their access. 
if (inst->isSquashed()) { - assert(!inst->isStore()); + assert (!inst->isStore() || inst->isStoreConditional()); ++lsqIgnoredResponses; return; } @@ -983,8 +1139,27 @@ LSQUnit::writeback(const DynInstPtr &inst, PacketPtr pkt) // If we have an outstanding fault, the fault should only be of // type ReExec or - in case of a SplitRequest - a partial // translation fault - assert(dynamic_cast(inst->fault.get()) != nullptr || - inst->savedReq->isPartialFault()); + + // Unless it's a hardware transactional memory fault + auto htm_fault = std::dynamic_pointer_cast< + GenericHtmFailureFault>(inst->fault); + + if (!htm_fault) { + assert(dynamic_cast(inst->fault.get()) != nullptr || + inst->savedReq->isPartialFault()); + + } else if (!pkt->htmTransactionFailedInCache()) { + // Situation in which the instruction has a hardware transactional + // memory fault but not the packet itself. This can occur with + // ldp_uop microops since access is spread over multiple packets. + DPRINTF(HtmCpu, + "%s writeback with HTM failure fault, " + "however, completing packet is not aware of " + "transaction failure. cause=%s htmUid=%u\n", + inst->staticInst->getName(), + htmFailureToStr(htm_fault->getHtmFailureFaultCause()), + htm_fault->getHtmUid()); + } DPRINTF(LSQUnit, "Not completing instruction [sn:%lli] access " "due to pending fault.\n", inst->seqNum); diff --git a/src/cpu/o3/mem_dep_unit_impl.hh b/src/cpu/o3/mem_dep_unit_impl.hh index 3a7ad363c..4be98c51d 100644 --- a/src/cpu/o3/mem_dep_unit_impl.hh +++ b/src/cpu/o3/mem_dep_unit_impl.hh @@ -172,7 +172,9 @@ MemDepUnit::insertBarrierSN(const DynInstPtr &barr_inst) { InstSeqNum barr_sn = barr_inst->seqNum; // Memory barriers block loads and stores, write barriers only stores. 
- if (barr_inst->isMemBarrier()) { + // Required also for hardware transactional memory commands which + // can have strict ordering semantics + if (barr_inst->isMemBarrier() || barr_inst->isHtmCmd()) { loadBarrierSNs.insert(barr_sn); storeBarrierSNs.insert(barr_sn); DPRINTF(MemDepUnit, "Inserted a memory barrier %s SN:%lli\n", @@ -182,6 +184,7 @@ MemDepUnit::insertBarrierSN(const DynInstPtr &barr_inst) DPRINTF(MemDepUnit, "Inserted a write barrier %s SN:%lli\n", barr_inst->pcState(), barr_sn); } + if (loadBarrierSNs.size() || storeBarrierSNs.size()) { DPRINTF(MemDepUnit, "Outstanding load barriers = %d; " "store barriers = %d\n", @@ -440,7 +443,8 @@ MemDepUnit::completeInst(const DynInstPtr &inst) wakeDependents(inst); completed(inst); InstSeqNum barr_sn = inst->seqNum; - if (inst->isMemBarrier()) { + + if (inst->isMemBarrier() || inst->isHtmCmd()) { assert(hasLoadBarrier()); assert(hasStoreBarrier()); loadBarrierSNs.erase(barr_sn); @@ -459,9 +463,10 @@ template void MemDepUnit::wakeDependents(const DynInstPtr &inst) { - // Only stores, atomics and barriers have dependents. + // Only stores, atomics, barriers and + // hardware transactional memory commands have dependents. 
if (!inst->isStore() && !inst->isAtomic() && !inst->isMemBarrier() && - !inst->isWriteBarrier()) { + !inst->isWriteBarrier() && !inst->isHtmCmd()) { return; } diff --git a/src/cpu/o3/thread_context_impl.hh b/src/cpu/o3/thread_context_impl.hh index 014b0f588..005aa5716 100644 --- a/src/cpu/o3/thread_context_impl.hh +++ b/src/cpu/o3/thread_context_impl.hh @@ -331,21 +331,24 @@ void O3ThreadContext::htmAbortTransaction(uint64_t htmUid, HtmFailureFaultCause cause) { - panic("function not implemented\n"); + cpu->htmSendAbortSignal(thread->threadId(), htmUid, cause); + + conditionalSquash(); } template BaseHTMCheckpointPtr& O3ThreadContext::getHtmCheckpointPtr() { - panic("function not implemented\n"); + return thread->htmCheckpoint; } template void O3ThreadContext::setHtmCheckpointPtr(BaseHTMCheckpointPtr new_cpt) { - panic("function not implemented\n"); + assert(!thread->htmCheckpoint->valid()); + thread->htmCheckpoint = std::move(new_cpt); } #endif //__CPU_O3_THREAD_CONTEXT_IMPL_HH__ diff --git a/src/cpu/o3/thread_state.hh b/src/cpu/o3/thread_state.hh index 6420da99a..32268325a 100644 --- a/src/cpu/o3/thread_state.hh +++ b/src/cpu/o3/thread_state.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012 ARM Limited + * Copyright (c) 2012, 2019 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -92,6 +92,9 @@ struct O3ThreadState : public ThreadState { */ bool trapPending; + /** Pointer to the hardware transactional memory checkpoint. */ + std::unique_ptr htmCheckpoint; + O3ThreadState(O3CPU *_cpu, int _thread_num, Process *_process) : ThreadState(_cpu, _thread_num, _process), cpu(_cpu), comInstEventQueue("instruction-based event queue"), -- 2.30.2