From 46da8fb805407cdc224abe788e8c666f3b0dadd1 Mon Sep 17 00:00:00 2001 From: Gabor Dozsa Date: Wed, 27 Feb 2019 17:26:56 +0000 Subject: [PATCH] cpu: Add first-/non-faulting load support to Minor and O3 Some architectures allow masking faults of memory load instructions in some specific circumstances (e.g. first-faulting and non-faulting loads in Arm SVE). This patch adds support for such loads in the Minor and O3 CPU models. Change-Id: I264a81a078f049127779aa834e89f0e693ba0bea Signed-off-by: Gabor Dozsa Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/19178 Reviewed-by: Andreas Sandberg Maintainer: Andreas Sandberg Tested-by: kokoro --- src/cpu/minor/dyn_inst.cc | 6 +- src/cpu/minor/dyn_inst.hh | 11 ++-- src/cpu/minor/exec_context.hh | 9 +-- src/cpu/minor/execute.cc | 20 ++++-- src/cpu/minor/lsq.cc | 115 ++++++++++++++++++++++++++-------- src/cpu/minor/lsq.hh | 23 ++++--- src/cpu/o3/lsq.hh | 14 +++++ src/cpu/o3/lsq_impl.hh | 47 +++++++++----- src/cpu/o3/lsq_unit_impl.hh | 10 +++ 9 files changed, 189 insertions(+), 66 deletions(-) diff --git a/src/cpu/minor/dyn_inst.cc b/src/cpu/minor/dyn_inst.cc index 353163758..087b718d3 100644 --- a/src/cpu/minor/dyn_inst.cc +++ b/src/cpu/minor/dyn_inst.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2014, 2016 ARM Limited + * Copyright (c) 2013-2014, 2016,2018 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -108,6 +108,8 @@ MinorDynInst::reportData(std::ostream &os) const os << "-"; else if (isFault()) os << "F;" << id; + else if (translationFault != NoFault) + os << "TF;" << id; else os << id; } @@ -120,6 +122,8 @@ operator <<(std::ostream &os, const MinorDynInst &inst) if (inst.isFault()) os << "fault: \"" << inst.fault->name() << '"'; + else if (inst.translationFault != NoFault) + os << "translation fault: \"" << inst.translationFault->name() << '"'; else if (inst.staticInst) os << inst.staticInst->getName(); else diff --git a/src/cpu/minor/dyn_inst.hh b/src/cpu/minor/dyn_inst.hh index 0a8ff8acf..3eb7f980f 100644 --- a/src/cpu/minor/dyn_inst.hh +++ b/src/cpu/minor/dyn_inst.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2014 ARM Limited + * Copyright (c) 2013-2014,2018 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -194,6 +194,9 @@ class MinorDynInst : public RefCounted /** This instruction is in the LSQ, not a functional unit */ bool inLSQ; + /** Translation fault in case of a mem ref */ + Fault translationFault; + /** The instruction has been sent to the store buffer */ bool inStoreBuffer; @@ -233,9 +236,9 @@ class MinorDynInst : public RefCounted staticInst(NULL), id(id_), traceData(NULL), pc(TheISA::PCState(0)), fault(fault_), triedToPredict(false), predictedTaken(false), - fuIndex(0), inLSQ(false), inStoreBuffer(false), - canEarlyIssue(false), predicate(true), memAccPredicate(true), - instToWaitFor(0), extraCommitDelay(Cycles(0)), + fuIndex(0), inLSQ(false), translationFault(NoFault), + inStoreBuffer(false), canEarlyIssue(false), predicate(true), + memAccPredicate(true), instToWaitFor(0), extraCommitDelay(Cycles(0)), extraCommitDelayExpr(NULL), minimumCommitCycle(Cycles(0)) { } diff --git a/src/cpu/minor/exec_context.hh b/src/cpu/minor/exec_context.hh index 9f6fce4cd..1871e2479 100644 --- a/src/cpu/minor/exec_context.hh +++ b/src/cpu/minor/exec_context.hh @@ -116,9 +116,8 @@ class ExecContext : public ::ExecContext const std::vector& byteEnable = std::vector()) override { - execute.getLSQ().pushRequest(inst, true /* load */, nullptr, + return execute.getLSQ().pushRequest(inst, true /* load */, nullptr, size, addr, flags, nullptr, nullptr, byteEnable); - return NoFault; } Fault @@ -128,9 +127,8 @@ class ExecContext : public ::ExecContext override { assert(byteEnable.empty() || byteEnable.size() == size); - execute.getLSQ().pushRequest(inst, false /* store */, data, + return execute.getLSQ().pushRequest(inst, false /* store */, data, size, addr, flags, res, nullptr, byteEnable); - return NoFault; } Fault @@ -138,9 +136,8 @@ class ExecContext : public ::ExecContext AtomicOpFunctor *amo_op) override { // AMO requests are pushed through the store path - execute.getLSQ().pushRequest(inst, false /* amo */, nullptr, + return execute.getLSQ().pushRequest(inst, false /* amo */, nullptr, size, addr, flags, nullptr, amo_op); - return NoFault; } RegVal diff --git a/src/cpu/minor/execute.cc b/src/cpu/minor/execute.cc index 810ff11c6..c7fda489e 100644 --- a/src/cpu/minor/execute.cc +++ b/src/cpu/minor/execute.cc @@ -337,19 +337,19 @@ Execute::handleMemResponse(MinorDynInstPtr inst, * context predicate, otherwise, it will be set to false */ bool use_context_predicate = true; - if (response->fault != NoFault) { + if (inst->translationFault != NoFault) { /* Invoke memory faults. */ DPRINTF(MinorMem, "Completing fault from DTLB access: %s\n", - response->fault->name()); + inst->translationFault->name()); if (inst->staticInst->isPrefetch()) { DPRINTF(MinorMem, "Not taking fault on prefetch: %s\n", - response->fault->name()); + inst->translationFault->name()); /* Don't assign to fault */ } else { /* Take the fault raised during the TLB/memory access */ - fault = response->fault; + fault = inst->translationFault; fault->invoke(thread, inst->staticInst); } @@ -469,6 +469,18 @@ Execute::executeMemRefInst(MinorDynInstPtr inst, BranchData &branch, Fault init_fault = inst->staticInst->initiateAcc(&context, inst->traceData); + if (inst->inLSQ) { + if (init_fault != NoFault) { + assert(inst->translationFault != NoFault); + // Translation faults are dealt with in handleMemResponse() + init_fault = NoFault; + } else { + // If we have a translation fault then it got suppressed by + // initateAcc() + inst->translationFault = NoFault; + } + } + if (init_fault != NoFault) { DPRINTF(MinorExecute, "Fault on memory inst: %s" " initiateAcc: %s\n", *inst, init_fault->name()); diff --git a/src/cpu/minor/lsq.cc b/src/cpu/minor/lsq.cc index 1d9f17e8d..1e5e89647 100644 --- a/src/cpu/minor/lsq.cc +++ b/src/cpu/minor/lsq.cc @@ -65,15 +65,51 @@ LSQ::LSQRequest::LSQRequest(LSQ &port_, MinorDynInstPtr inst_, bool isLoad_, data(data_), packet(NULL), request(), - fault(NoFault), res(res_), skipped(false), issuedToMemory(false), + isTranslationDelayed(false), state(NotIssued) { request = std::make_shared(); } +void +LSQ::LSQRequest::tryToSuppressFault() +{ + SimpleThread &thread = *port.cpu.threads[inst->id.threadId]; + TheISA::PCState old_pc = thread.pcState(); + ExecContext context(port.cpu, thread, port.execute, inst); + Fault M5_VAR_USED fault = inst->translationFault; + + // Give the instruction a chance to suppress a translation fault + inst->translationFault = inst->staticInst->initiateAcc(&context, nullptr); + if (inst->translationFault == NoFault) { + DPRINTFS(MinorMem, (&port), + "Translation fault suppressed for inst:%s\n", *inst); + } else { + assert(inst->translationFault == fault); + } + thread.pcState(old_pc); +} + +void +LSQ::LSQRequest::completeDisabledMemAccess() +{ + DPRINTFS(MinorMem, (&port), "Complete disabled mem access for inst:%s\n", + *inst); + + SimpleThread &thread = *port.cpu.threads[inst->id.threadId]; + TheISA::PCState old_pc = thread.pcState(); + + ExecContext context(port.cpu, thread, port.execute, inst); + + context.setMemAccPredicate(false); + inst->staticInst->completeAcc(nullptr, &context, inst->traceData); + + thread.pcState(old_pc); +} + void LSQ::LSQRequest::disableMemAccess() { @@ -227,16 +263,26 @@ void LSQ::SingleDataRequest::finish(const Fault &fault_, const RequestPtr &request_, ThreadContext *tc, BaseTLB::Mode mode) { - fault = fault_; - port.numAccessesInDTLB--; DPRINTFS(MinorMem, (&port), "Received translation response for" - " request: %s\n", *inst); - - makePacket(); - - setState(Translated); + " request: %s delayed:%d %s\n", *inst, isTranslationDelayed, + fault_ != NoFault ? fault_->name() : ""); + + if (fault_ != NoFault) { + inst->translationFault = fault_; + if (isTranslationDelayed) { + tryToSuppressFault(); + if (inst->translationFault == NoFault) { + completeDisabledMemAccess(); + setState(Complete); + } + } + setState(Translated); + } else { + setState(Translated); + makePacket(); + } port.tryToSendToTransfers(this); /* Let's try and wake up the processor for the next cycle */ @@ -281,8 +327,6 @@ void LSQ::SplitDataRequest::finish(const Fault &fault_, const RequestPtr &request_, ThreadContext *tc, BaseTLB::Mode mode) { - fault = fault_; - port.numAccessesInDTLB--; unsigned int M5_VAR_USED expected_fragment_index = @@ -292,7 +336,9 @@ LSQ::SplitDataRequest::finish(const Fault &fault_, const RequestPtr &request_, numTranslatedFragments++; DPRINTFS(MinorMem, (&port), "Received translation response for fragment" - " %d of request: %s\n", expected_fragment_index, *inst); + " %d of request: %s delayed:%d %s\n", expected_fragment_index, + *inst, isTranslationDelayed, + fault_ != NoFault ? fault_->name() : ""); assert(request_ == fragmentRequests[expected_fragment_index]); @@ -300,18 +346,33 @@ LSQ::SplitDataRequest::finish(const Fault &fault_, const RequestPtr &request_, * tryToSendToTransfers does take */ port.cpu.wakeupOnEvent(Pipeline::ExecuteStageId); - if (fault != NoFault) { + if (fault_ != NoFault) { /* tryToSendToTransfers will handle the fault */ + inst->translationFault = fault_; DPRINTFS(MinorMem, (&port), "Faulting translation for fragment:" " %d of request: %s\n", expected_fragment_index, *inst); - setState(Translated); + if (expected_fragment_index > 0 || isTranslationDelayed) + tryToSuppressFault(); + if (expected_fragment_index == 0) { + if (isTranslationDelayed && inst->translationFault == NoFault) { + completeDisabledMemAccess(); + setState(Complete); + } else { + setState(Translated); + } + } else if (inst->translationFault == NoFault) { + setState(Translated); + numTranslatedFragments--; + makeFragmentPackets(); + } else { + setState(Translated); + } port.tryToSendToTransfers(this); } else if (numTranslatedFragments == numFragments) { makeFragmentPackets(); - setState(Translated); port.tryToSendToTransfers(this); } else { @@ -562,6 +623,7 @@ LSQ::SplitDataRequest::stepToNextPacket() void LSQ::SplitDataRequest::retireResponse(PacketPtr response) { + assert(inst->translationFault == NoFault); assert(numRetiredFragments < numTranslatedFragments); DPRINTFS(MinorMem, (&port), "Retiring fragment addr: 0x%x size: %d" @@ -950,7 +1012,7 @@ LSQ::tryToSendToTransfers(LSQRequestPtr request) return; } - if (request->fault != NoFault) { + if (request->inst->translationFault != NoFault) { if (request->inst->staticInst->isPrefetch()) { DPRINTF(MinorMem, "Not signalling fault for faulting prefetch\n"); } @@ -1508,12 +1570,18 @@ LSQ::needsToTick() return ret; } -void +Fault LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data, unsigned int size, Addr addr, Request::Flags flags, uint64_t *res, AtomicOpFunctor *amo_op, const std::vector& byteEnable) { + assert(inst->translationFault == NoFault || inst->inLSQ); + + if (inst->inLSQ) { + return inst->translationFault; + } + bool needs_burst = transferNeedsBurst(addr, size, lineWidth); if (needs_burst && inst->staticInst->isAtomic()) { @@ -1568,12 +1636,13 @@ LSQ::pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data, addr, size, flags, cpu.dataMasterId(), /* I've no idea why we need the PC, but give it */ inst->pc.instAddr(), amo_op); - if (!byteEnable.empty()) { - request->request->setByteEnable(byteEnable); - } + request->request->setByteEnable(byteEnable); requests.push(request); + inst->inLSQ = true; request->startAddrTranslation(); + + return inst->translationFault; } void @@ -1642,16 +1711,12 @@ LSQ::issuedMemBarrierInst(MinorDynInstPtr inst) void LSQ::LSQRequest::makePacket() { + assert(inst->translationFault == NoFault); + /* Make the function idempotent */ if (packet) return; - // if the translation faulted, do not create a packet - if (fault != NoFault) { - assert(packet == NULL); - return; - } - packet = makePacketForRequest(request, isLoad, this, data); /* Null the ret data so we know not to deallocate it when the * ret is destroyed. The data now belongs to the ret and diff --git a/src/cpu/minor/lsq.hh b/src/cpu/minor/lsq.hh index 23b47c53c..a7c7cb632 100644 --- a/src/cpu/minor/lsq.hh +++ b/src/cpu/minor/lsq.hh @@ -145,9 +145,6 @@ class LSQ : public Named /** The underlying request of this LSQRequest */ RequestPtr request; - /** Fault generated performing this request */ - Fault fault; - /** Res from pushRequest */ uint64_t *res; @@ -160,6 +157,9 @@ class LSQ : public Named * that's visited the memory system */ bool issuedToMemory; + /** Address translation is delayed due to table walk */ + bool isTranslationDelayed; + enum LSQRequestState { NotIssued, /* Newly created */ @@ -186,9 +186,14 @@ class LSQ : public Named protected: /** BaseTLB::Translation interface */ - void markDelayed() { } + void markDelayed() { isTranslationDelayed = true; } + + /** Instructions may want to suppress translation faults (e.g. + * non-faulting vector loads).*/ + void tryToSuppressFault(); void disableMemAccess(); + void completeDisabledMemAccess(); public: LSQRequest(LSQ &port_, MinorDynInstPtr inst_, bool isLoad_, @@ -701,11 +706,11 @@ class LSQ : public Named /** Single interface for readMem/writeMem/amoMem to issue requests into * the LSQ */ - void pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data, - unsigned int size, Addr addr, Request::Flags flags, - uint64_t *res, AtomicOpFunctor *amo_op, - const std::vector& byteEnable = - std::vector()); + Fault pushRequest(MinorDynInstPtr inst, bool isLoad, uint8_t *data, + unsigned int size, Addr addr, Request::Flags flags, + uint64_t *res, AtomicOpFunctor *amo_op, + const std::vector& byteEnable = + std::vector()); /** Push a predicate failed-representing request into the queues just * to maintain commit order */ diff --git a/src/cpu/o3/lsq.hh b/src/cpu/o3/lsq.hh index 84f1411a5..6f7820113 100644 --- a/src/cpu/o3/lsq.hh +++ b/src/cpu/o3/lsq.hh @@ -226,6 +226,7 @@ class LSQ Complete, Squashed, Fault, + PartialFault, }; State _state; LSQSenderState* _senderState; @@ -564,6 +565,19 @@ class LSQ return flags.isSet(Flag::Sent); } + bool + isPartialFault() + { + return _state == State::PartialFault; + } + + bool + isMemAccessRequired() + { + return (_state == State::Request || + (isPartialFault() && isLoad())); + } + /** * The LSQ entry is cleared */ diff --git a/src/cpu/o3/lsq_impl.hh b/src/cpu/o3/lsq_impl.hh index d4e0a289e..27a563071 100644 --- a/src/cpu/o3/lsq_impl.hh +++ b/src/cpu/o3/lsq_impl.hh @@ -733,7 +733,7 @@ LSQ::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, /* This is the place were instructions get the effAddr. */ if (req->isTranslationComplete()) { - if (inst->getFault() == NoFault) { + if (req->isMemAccessRequired()) { inst->effAddr = req->getVaddr(); inst->effSize = size; inst->effAddrValid(true); @@ -741,10 +741,17 @@ LSQ::pushRequest(const DynInstPtr& inst, bool isLoad, uint8_t *data, if (cpu->checker) { inst->reqToVerify = std::make_shared(*req->request()); } + Fault fault; if (isLoad) - inst->getFault() = cpu->read(req, inst->lqIdx); + fault = cpu->read(req, inst->lqIdx); else - inst->getFault() = cpu->write(req, data, inst->sqIdx); + fault = cpu->write(req, data, inst->sqIdx); + // inst->getFault() may have the first-fault of a + // multi-access split request at this point. + // Overwrite that only if we got another type of fault + // (e.g. re-exec). + if (fault != NoFault) + inst->getFault() = fault; } else if (isLoad) { inst->setMemAccPredicate(false); // Commit will have to clean up whatever happened. Set this @@ -797,13 +804,16 @@ void LSQ::SplitDataRequest::finish(const Fault &fault, const RequestPtr &req, ThreadContext* tc, BaseTLB::Mode mode) { - _fault.push_back(fault); - assert(req == _requests[numTranslatedFragments] || this->isDelayed()); + int i; + for (i = 0; i < _requests.size() && _requests[i] != req; i++); + assert(i < _requests.size()); + _fault[i] = fault; numInTranslationFragments--; numTranslatedFragments++; - mainReq->setFlags(req->getFlags()); + if (fault == NoFault) + mainReq->setFlags(req->getFlags()); if (numTranslatedFragments == _requests.size()) { if (_inst->isSquashed()) { @@ -811,27 +821,30 @@ LSQ::SplitDataRequest::finish(const Fault &fault, const RequestPtr &req, } else { _inst->strictlyOrdered(mainReq->isStrictlyOrdered()); flags.set(Flag::TranslationFinished); - auto fault_it = _fault.begin(); - /* Ffwd to the first NoFault. */ - while (fault_it != _fault.end() && *fault_it == NoFault) - fault_it++; - /* If none of the fragments faulted: */ - if (fault_it == _fault.end()) { - _inst->physEffAddr = request(0)->getPaddr(); + _inst->translationCompleted(true); + for (i = 0; i < _fault.size() && _fault[i] == NoFault; i++); + if (i > 0) { + _inst->physEffAddr = request(0)->getPaddr(); _inst->memReqFlags = mainReq->getFlags(); if (mainReq->isCondSwap()) { + assert (i == _fault.size()); assert(_res); mainReq->setExtraData(*_res); } - setState(State::Request); - _inst->fault = NoFault; + if (i == _fault.size()) { + _inst->fault = NoFault; + setState(State::Request); + } else { + _inst->fault = _fault[i]; + setState(State::PartialFault); + } } else { + _inst->fault = _fault[0]; setState(State::Fault); - _inst->fault = *fault_it; } - _inst->translationCompleted(true); } + } } diff --git a/src/cpu/o3/lsq_unit_impl.hh b/src/cpu/o3/lsq_unit_impl.hh index 21bed99fa..b71ed7f78 100644 --- a/src/cpu/o3/lsq_unit_impl.hh +++ b/src/cpu/o3/lsq_unit_impl.hh @@ -554,6 +554,16 @@ LSQUnit::executeLoad(const DynInstPtr &inst) if (inst->isTranslationDelayed() && load_fault == NoFault) return load_fault; + if (load_fault != NoFault && inst->translationCompleted() && + inst->savedReq->isPartialFault() && !inst->savedReq->isComplete()) { + assert(inst->savedReq->isSplit()); + // If we have a partial fault where the mem access is not complete yet + // then the cache must have been blocked. This load will be re-executed + // when the cache gets unblocked. We will handle the fault when the + // mem access is complete. + return NoFault; + } + // If the instruction faulted or predicated false, then we need to send it // along to commit without the instruction completing. if (load_fault != NoFault || !inst->readPredicate()) { -- 2.30.2