From: Ivan Pizarro Date: Mon, 3 Dec 2018 22:03:01 +0000 (+0100) Subject: mem-cache: A Best-Offset Prefetcher X-Git-Tag: v19.0.0.0~1102 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=d63f735663d710839f79a4b88af67da968995168;p=gem5.git mem-cache: A Best-Offset Prefetcher Michaud, P. (2015, June). A best-offset prefetcher. In 2nd Data Prefetching Championship. Change-Id: I61bb89ca5639356d54aeb04e856d5bf6e8805c22 Reviewed-on: https://gem5-review.googlesource.com/c/14820 Reviewed-by: Nikos Nikoleris Maintainer: Nikos Nikoleris --- diff --git a/src/mem/cache/base.cc b/src/mem/cache/base.cc index 6049ca6a6..5b4307b76 100644 --- a/src/mem/cache/base.cc +++ b/src/mem/cache/base.cc @@ -478,6 +478,7 @@ BaseCache::recvTimingResp(PacketPtr pkt) writeAllocator->allocate() : mshr->allocOnFill(); blk = handleFill(pkt, blk, writebacks, allocate); assert(blk != nullptr); + ppFill->notify(pkt); } if (blk && blk->isValid() && pkt->isClean() && !pkt->isInvalidate()) { @@ -2223,6 +2224,7 @@ BaseCache::regProbePoints() { ppHit = new ProbePointArg(this->getProbeManager(), "Hit"); ppMiss = new ProbePointArg(this->getProbeManager(), "Miss"); + ppFill = new ProbePointArg(this->getProbeManager(), "Fill"); } /////////////// diff --git a/src/mem/cache/base.hh b/src/mem/cache/base.hh index 6144fd584..11372418a 100644 --- a/src/mem/cache/base.hh +++ b/src/mem/cache/base.hh @@ -331,6 +331,9 @@ class BaseCache : public MemObject /** To probe when a cache miss occurs */ ProbePointArg *ppMiss; + /** To probe when a cache fill occurs */ + ProbePointArg *ppFill; + /** * The writeAllocator drive optimizations for streaming writes. * It first determines whether a WriteReq MSHR should be delayed, diff --git a/src/mem/cache/prefetch/Prefetcher.py b/src/mem/cache/prefetch/Prefetcher.py index 36e254151..31d748698 100644 --- a/src/mem/cache/prefetch/Prefetcher.py +++ b/src/mem/cache/prefetch/Prefetcher.py @@ -341,3 +341,25 @@ class SlimAMPMPrefetcher(QueuedPrefetcher): dcpt = Param.DeltaCorrelatingPredictionTables( SlimDeltaCorrelatingPredictionTables(), "Delta Correlating Prediction Tables object") + +class BOPPrefetcher(QueuedPrefetcher): + type = "BOPPrefetcher" + cxx_class = "BOPPrefetcher" + cxx_header = "mem/cache/prefetch/bop.hh" + score_max = Param.Unsigned(31, "Max. score to update the best offset") + round_max = Param.Unsigned(100, "Max. round to update the best offset") + bad_score = Param.Unsigned(10, "Score at which the HWP is disabled") + rr_size = Param.Unsigned(64, "Number of entries of each RR bank") + tag_bits = Param.Unsigned(12, "Bits used to store the tag") + offset_list_size = Param.Unsigned(46, + "Number of entries in the offsets list") + negative_offsets_enable = Param.Bool(True, + "Initialize the offsets list also with negative values \ + (i.e. the table will have half of the entries with positive \ + offsets and the other half with negative ones)") + delay_queue_enable = Param.Bool(True, "Enable the delay queue") + delay_queue_size = Param.Unsigned(15, + "Number of entries in the delay queue") + delay_queue_cycles = Param.Cycles(60, + "Cycles to delay a write in the left RR table from the delay \ + queue") diff --git a/src/mem/cache/prefetch/SConscript b/src/mem/cache/prefetch/SConscript index 48ee218a9..d87b694ed 100644 --- a/src/mem/cache/prefetch/SConscript +++ b/src/mem/cache/prefetch/SConscript @@ -34,6 +34,7 @@ SimObject('Prefetcher.py') Source('access_map_pattern_matching.cc') Source('base.cc') +Source('bop.cc') Source('delta_correlating_prediction_tables.cc') Source('irregular_stream_buffer.cc') Source('queued.cc') diff --git a/src/mem/cache/prefetch/base.cc b/src/mem/cache/prefetch/base.cc index cd3eade09..52f5d1a9a 100644 --- a/src/mem/cache/prefetch/base.cc +++ b/src/mem/cache/prefetch/base.cc @@ -72,7 +72,11 @@ BasePrefetcher::PrefetchInfo::PrefetchInfo(PrefetchInfo const &pfi, Addr addr) void BasePrefetcher::PrefetchListener::notify(const PacketPtr &pkt) { - parent.probeNotify(pkt); + if (isFill) { + parent.notifyFill(pkt); + } else { + parent.probeNotify(pkt); + } } BasePrefetcher::BasePrefetcher(const BasePrefetcherParams *p) @@ -224,6 +228,7 @@ BasePrefetcher::regProbeListeners() if (listeners.empty() && cache != nullptr) { ProbeManager *pm(cache->getProbeManager()); listeners.push_back(new PrefetchListener(*this, pm, "Miss")); + listeners.push_back(new PrefetchListener(*this, pm, "Fill", true)); if (prefetchOnAccess) { listeners.push_back(new PrefetchListener(*this, pm, "Hit")); } diff --git a/src/mem/cache/prefetch/base.hh b/src/mem/cache/prefetch/base.hh index de275f85c..4df8428c6 100644 --- a/src/mem/cache/prefetch/base.hh +++ b/src/mem/cache/prefetch/base.hh @@ -67,12 +67,13 @@ class BasePrefetcher : public ClockedObject { public: PrefetchListener(BasePrefetcher &_parent, ProbeManager *pm, - const std::string &name) + const std::string &name, bool _isFill = false) : ProbeListenerArgBase(pm, name), - parent(_parent) {} + parent(_parent), isFill(_isFill) {} void notify(const PacketPtr &pkt) override; protected: BasePrefetcher &parent; + bool isFill; }; std::vector listeners; @@ -253,6 +254,10 @@ class BasePrefetcher : public ClockedObject */ virtual void notify(const PacketPtr &pkt, const PrefetchInfo &pfi) = 0; + /** Notify prefetcher of cache fill */ + virtual void notifyFill(const PacketPtr &pkt) + {} + virtual PacketPtr getPacket() = 0; virtual Tick nextPrefetchReadyTime() const = 0; diff --git a/src/mem/cache/prefetch/bop.cc b/src/mem/cache/prefetch/bop.cc new file mode 100644 index 000000000..b79082f62 --- /dev/null +++ b/src/mem/cache/prefetch/bop.cc @@ -0,0 +1,267 @@ +/** + * Copyright (c) 2018 Metempsy Technology Consulting + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Ivan Pizarro + */ + +#include "mem/cache/prefetch/bop.hh" + +#include "debug/HWPrefetch.hh" +#include "params/BOPPrefetcher.hh" + +BOPPrefetcher::BOPPrefetcher(const BOPPrefetcherParams *p) + : QueuedPrefetcher(p), + scoreMax(p->score_max), roundMax(p->round_max), + badScore(p->bad_score), rrEntries(p->rr_size), + tagMask((1 << p->tag_bits) - 1), + delayQueueEnabled(p->delay_queue_enable), + delayQueueSize(p->delay_queue_size), + delayTicks(cyclesToTicks(p->delay_queue_cycles)), + delayQueueEvent([this]{ delayQueueEventWrapper(); }, name()), + issuePrefetchRequests(false), bestOffset(1), phaseBestOffset(0), + bestScore(0), round(0) +{ + if (!isPowerOf2(rrEntries)) { + fatal("%s: number of RR entries is not power of 2\n", name()); + } + if (!isPowerOf2(blkSize)) { + fatal("%s: cache line size is not power of 2\n", name()); + } + if (!(p->negative_offsets_enable && (p->offset_list_size % 2 == 0))) { + fatal("%s: negative offsets enabled with odd offset list size\n", + name()); + } + + rrLeft.resize(rrEntries); + rrRight.resize(rrEntries); + + // Following the paper implementation, a list with the specified number + // of offsets which are of the form 2^i * 3^j * 5^k with i,j,k >= 0 + const int factors[] = { 2, 3, 5 }; + unsigned int i = 0; + int64_t offset_i = 1; + + while (i < p->offset_list_size) + { + int64_t offset = offset_i; + + for (int n : factors) { + while ((offset % n) == 0) { + offset /= n; + } + } + + if (offset == 1) { + offsetsList.push_back(OffsetListEntry(offset_i, 0)); + i++; + // If we want to use negative offsets, add also the negative value + // of the offset just calculated + if (p->negative_offsets_enable) { + offsetsList.push_back(OffsetListEntry(-offset_i, 0)); + i++; + } + } + + offset_i++; + } + + offsetsListIterator = offsetsList.begin(); +} + +void +BOPPrefetcher::delayQueueEventWrapper() +{ + while (!delayQueue.empty() && + delayQueue.front().processTick <= curTick()) + { + Addr addr_x = delayQueue.front().baseAddr; + insertIntoRR(addr_x, RRWay::Left); + delayQueue.pop_front(); + } + + // Schedule an event for the next element if there is one + if (!delayQueue.empty()) { + schedule(delayQueueEvent, delayQueue.front().processTick); + } +} + +unsigned int +BOPPrefetcher::hash(Addr addr, unsigned int way) const +{ + Addr hash1 = addr >> way; + Addr hash2 = hash1 >> floorLog2(rrEntries); + return (hash1 ^ hash2) & (Addr)(rrEntries - 1); +} + +void +BOPPrefetcher::insertIntoRR(Addr addr, unsigned int way) +{ + switch (way) { + case RRWay::Left: + rrLeft[hash(addr, RRWay::Left)] = addr; + break; + case RRWay::Right: + rrRight[hash(addr, RRWay::Right)] = addr; + break; + } +} + +void +BOPPrefetcher::insertIntoDelayQueue(Addr x) +{ + if (delayQueue.size() == delayQueueSize) { + return; + } + + // Add the address to the delay queue and schedule an event to process + // it after the specified delay cycles + Tick process_tick = curTick() + delayTicks; + + delayQueue.push_back(DelayQueueEntry(x, process_tick)); + + if (!delayQueueEvent.scheduled()) { + schedule(delayQueueEvent, process_tick); + } +} + +void +BOPPrefetcher::resetScores() +{ + for (auto& it : offsetsList) { + it.second = 0; + } +} + +inline Addr +BOPPrefetcher::tag(Addr addr) const +{ + return (addr >> blkSize) & tagMask; +} + +bool +BOPPrefetcher::testRR(Addr addr) const +{ + for (auto& it : rrLeft) { + if (it == addr) { + return true; + } + } + + for (auto& it : rrRight) { + if (it == addr) { + return true; + } + } + + return false; +} + +void +BOPPrefetcher::bestOffsetLearning(Addr x) +{ + Addr offset_addr = (*offsetsListIterator).first; + Addr lookup_addr = x - offset_addr; + + // There was a hit in the RR table, increment the score for this offset + if (testRR(lookup_addr)) { + DPRINTF(HWPrefetch, "Address %#lx found in the RR table\n", x); + (*offsetsListIterator).second++; + if ((*offsetsListIterator).second > bestScore) { + bestScore = (*offsetsListIterator).second; + phaseBestOffset = (*offsetsListIterator).first; + DPRINTF(HWPrefetch, "New best score is %lu\n", bestScore); + } + } + + offsetsListIterator++; + + // All the offsets in the list were visited meaning that a learning + // phase finished. Check if + if (offsetsListIterator == offsetsList.end()) { + offsetsListIterator = offsetsList.begin(); + round++; + + // Check if the best offset must be updated if: + // (1) One of the scores equals SCORE_MAX + // (2) The number of rounds equals ROUND_MAX + if ((bestScore >= scoreMax) || (round == roundMax)) { + bestOffset = phaseBestOffset; + round = 0; + bestScore = 0; + phaseBestOffset = 0; + resetScores(); + issuePrefetchRequests = true; + } else if (phaseBestOffset <= badScore) { + issuePrefetchRequests = false; + } + } +} + +void +BOPPrefetcher::calculatePrefetch(const PrefetchInfo &pfi, + std::vector &addresses) +{ + Addr addr = pfi.getAddr(); + Addr tag_x = tag(addr); + + if (delayQueueEnabled) { + insertIntoDelayQueue(tag_x); + } else { + insertIntoRR(tag_x, RRWay::Left); + } + + // Go through the nth offset and update the score, the best score and the + // current best offset if a better one is found + bestOffsetLearning(tag_x); + + // This prefetcher is a degree 1 prefetch, so it will only generate one + // prefetch at most per access + if (issuePrefetchRequests) { + Addr prefetch_addr = addr + (bestOffset << lBlkSize); + addresses.push_back(AddrPriority(prefetch_addr, 0)); + DPRINTF(HWPrefetch, "Generated prefetch %#lx\n", prefetch_addr); + } +} + +void +BOPPrefetcher::notifyFill(const PacketPtr& pkt) +{ + // Only insert into the RR right way if it's the pkt is a HWP + if (!pkt->cmd.isHWPrefetch()) return; + + Addr tag_y = tag(pkt->getAddr()); + + if (issuePrefetchRequests) { + insertIntoRR(tag_y - bestOffset, RRWay::Right); + } +} + +BOPPrefetcher* +BOPPrefetcherParams::create() +{ + return new BOPPrefetcher(this); +} diff --git a/src/mem/cache/prefetch/bop.hh b/src/mem/cache/prefetch/bop.hh new file mode 100644 index 000000000..e8fd77057 --- /dev/null +++ b/src/mem/cache/prefetch/bop.hh @@ -0,0 +1,157 @@ +/** + * Copyright (c) 2018 Metempsy Technology Consulting + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Authors: Ivan Pizarro + */ + +/** + * Implementation of the 'A Best-Offset Prefetcher' + * Reference: + * Michaud, P. (2015, June). A best-offset prefetcher. + * In 2nd Data Prefetching Championship. + */ + +#ifndef __MEM_CACHE_PREFETCH_BOP_HH__ +#define __MEM_CACHE_PREFETCH_BOP_HH__ + +#include + +#include "mem/cache/prefetch/queued.hh" +#include "mem/packet.hh" + +struct BOPPrefetcherParams; + +class BOPPrefetcher : public QueuedPrefetcher +{ + private: + + enum RRWay { + Left, + Right + }; + + /** Learning phase parameters */ + const unsigned int scoreMax; + const unsigned int roundMax; + const unsigned int badScore; + /** Recent requests table parameteres */ + const unsigned int rrEntries; + const unsigned int tagMask; + /** Delay queue parameters */ + const bool delayQueueEnabled; + const unsigned int delayQueueSize; + const unsigned int delayTicks; + + std::vector rrLeft; + std::vector rrRight; + + /** Structure to save the offset and the score */ + typedef std::pair OffsetListEntry; + std::vector offsetsList; + + /** In a first implementation of the BO prefetcher, both banks of the + * RR were written simultaneously when a prefetched line is inserted + * into the cache. Adding the delay queue tries to avoid always + * striving for timeless prefetches, which has been found to not + * always being optimal. + */ + struct DelayQueueEntry + { + Addr baseAddr; + Tick processTick; + + DelayQueueEntry(Addr x, Tick t) : baseAddr(x), processTick(t) + {} + }; + + std::deque delayQueue; + + /** Event to handle the delay queue processing */ + void delayQueueEventWrapper(); + EventFunctionWrapper delayQueueEvent; + + /** Hardware prefetcher enabled */ + bool issuePrefetchRequests; + /** Current best offset to issue prefetches */ + Addr bestOffset; + /** Current best offset found in the learning phase */ + Addr phaseBestOffset; + /** Current test offset index */ + std::vector::iterator offsetsListIterator; + /** Max score found so far */ + unsigned int bestScore; + /** Current round */ + unsigned int round; + + /** Generate a hash for the specified address to index the RR table + * @param addr: address to hash + * @param way: RR table to which is addressed (left/right) + */ + unsigned int hash(Addr addr, unsigned int way) const; + + /** Insert the specified address into the RR table + * @param addr: address to insert + * @param way: RR table to which the address will be inserted + */ + void insertIntoRR(Addr addr, unsigned int way); + + /** Insert the specified address into the delay queue. This will + * trigger an event after the delay cycles pass + * @param addr: address to insert into the delay queue + */ + void insertIntoDelayQueue(Addr addr); + + /** Reset all the scores from the offset list */ + void resetScores(); + + /** Generate the tag for the specified address based on the tag bits + * and the block size + * @param addr: address to get the tag from + */ + Addr tag(Addr addr) const; + + /** Test if @X-O is hitting in the RR table to update the + offset score */ + bool testRR(Addr) const; + + /** Learning phase of the BOP. Update the intermediate values of the + round and update the best offset if found */ + void bestOffsetLearning(Addr); + + /** Update the RR right table after a prefetch fill */ + void notifyFill(const PacketPtr& pkt) override; + + public: + + BOPPrefetcher(const BOPPrefetcherParams *p); + ~BOPPrefetcher() {} + + void calculatePrefetch(const PrefetchInfo &pfi, + std::vector &addresses); +}; + +#endif /* __MEM_CACHE_PREFETCH_BOP_HH__ */