X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;ds=sidebyside;f=src%2Fmem%2Fcache%2Fbase.hh;h=cd467c8ad6bb685f4fafc8636e50751966bfa3c1;hb=70dc35a659d024a4362c7b3f08887f04285b34f9;hp=47218f82833ed3d2a5a9624eed34c529141238e1;hpb=e57d8f2d897bc26aade774e090842367e38e974b;p=gem5.git diff --git a/src/mem/cache/base.hh b/src/mem/cache/base.hh index 47218f828..cd467c8ad 100644 --- a/src/mem/cache/base.hh +++ b/src/mem/cache/base.hh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2013, 2015-2016, 2018 ARM Limited + * Copyright (c) 2012-2013, 2015-2016, 2018-2019 ARM Limited * All rights reserved. * * The license below extends only to copyright in the software and shall @@ -64,23 +64,24 @@ #include "debug/CachePort.hh" #include "enums/Clusivity.hh" #include "mem/cache/cache_blk.hh" +#include "mem/cache/compressors/base.hh" #include "mem/cache/mshr_queue.hh" #include "mem/cache/tags/base.hh" #include "mem/cache/write_queue.hh" #include "mem/cache/write_queue_entry.hh" -#include "mem/mem_object.hh" #include "mem/packet.hh" #include "mem/packet_queue.hh" #include "mem/qport.hh" #include "mem/request.hh" +#include "params/WriteAllocator.hh" +#include "sim/clocked_object.hh" #include "sim/eventq.hh" +#include "sim/probe/probe.hh" #include "sim/serialize.hh" #include "sim/sim_exit.hh" #include "sim/system.hh" -class BaseMasterPort; class BasePrefetcher; -class BaseSlavePort; class MSHR; class MasterPort; class QueueEntry; @@ -89,7 +90,7 @@ struct BaseCacheParams; /** * A basic cache interface. Implements some common functions for speed. */ -class BaseCache : public MemObject +class BaseCache : public ClockedObject { protected: /** @@ -187,10 +188,12 @@ class BaseCache : public MemObject * send out, and if so simply stall any requests, and schedule * a send event at the same time as the next snoop response is * being sent out. + * + * @param pkt The packet to check for conflicts against. 
*/ - bool checkConflictingSnoop(Addr addr) + bool checkConflictingSnoop(const PacketPtr pkt) { - if (snoopRespQueue.hasAddr(addr)) { + if (snoopRespQueue.checkConflict(pkt, cache.blkSize)) { DPRINTF(CachePort, "Waiting for snoop response to be " "sent\n"); Tick when = snoopRespQueue.deferredPacketReadyTime(); @@ -320,13 +323,36 @@ class BaseCache : public MemObject /** Tag and data Storage */ BaseTags *tags; + /** Compression method being used. */ + BaseCacheCompressor* compressor; + /** Prefetcher */ BasePrefetcher *prefetcher; + /** To probe when a cache hit occurs */ + ProbePointArg *ppHit; + + /** To probe when a cache miss occurs */ + ProbePointArg *ppMiss; + + /** To probe when a cache fill occurs */ + ProbePointArg *ppFill; + /** - * Notify the prefetcher on every access, not just misses. + * The writeAllocator drive optimizations for streaming writes. + * It first determines whether a WriteReq MSHR should be delayed, + * thus ensuring that we wait longer in cases when we are write + * coalescing and allowing all the bytes of the line to be written + * before the MSHR packet is sent downstream. This works in unison + * with the tracking in the MSHR to check if the entire line is + * written. The write mode also affects the behaviour on filling + * any whole-line writes. Normally the cache allocates the line + * when receiving the InvalidateResp, but after seeing enough + * consecutive lines we switch to using the tempBlock, and thus + * end up not allocating the line, and instead turning the + * whole-line write into a writeback straight away. */ - const bool prefetchOnAccess; + WriteAllocator * const writeAllocator; /** * Temporary cache block for occasional transitory use. We use @@ -399,6 +425,28 @@ class BaseCache : public MemObject */ Addr regenerateBlkAddr(CacheBlk* blk); + /** + * Calculate latency of accesses that only touch the tag array. + * @sa calculateAccessLatency + * + * @param delay The delay until the packet's metadata is present. 
+ * @param lookup_lat Latency of the respective tag lookup. + * @return The number of ticks that pass due to a tag-only access. + */ + Cycles calculateTagOnlyLatency(const uint32_t delay, + const Cycles lookup_lat) const; + /** + * Calculate access latency in ticks given a tag lookup latency, and + * whether access was a hit or miss. + * + * @param blk The cache block that was accessed. + * @param delay The delay until the packet's metadata is present. + * @param lookup_lat Latency of the respective tag lookup. + * @return The number of ticks that pass due to a block access. + */ + Cycles calculateAccessLatency(const CacheBlk* blk, const uint32_t delay, + const Cycles lookup_lat) const; + /** * Does all the processing necessary to perform the provided request. * @param pkt The memory request to perform. @@ -465,16 +513,14 @@ class BaseCache : public MemObject * Service non-deferred MSHR targets using the received response * * Iterates through the list of targets that can be serviced with - * the current response. Any writebacks that need to performed - * must be appended to the writebacks parameter. + * the current response. * * @param mshr The MSHR that corresponds to the reponse * @param pkt The response packet * @param blk The reference block - * @param writebacks List of writebacks that need to be performed */ virtual void serviceMSHRTargets(MSHR *mshr, const PacketPtr pkt, - CacheBlk *blk, PacketList& writebacks) = 0; + CacheBlk *blk) = 0; /** * Handles a response (cache line fill/write ack) from the bus. @@ -611,6 +657,33 @@ class BaseCache : public MemObject */ EventFunctionWrapper writebackTempBlockAtomicEvent; + /** + * When a block is overwritten, its compression information must be updated, + * and it may need to be recompressed.
If the compression size changes, the + * block may either become smaller, in which case there is no side effect, + * or bigger (data expansion; fat write), in which case the block might not + * fit in its current location anymore. If that happens, there are usually + * two options to be taken: + * + * - The co-allocated blocks must be evicted to make room for this block. + * Simpler, but ignores replacement data. + * - The block itself is moved elsewhere (used in policies where the CF + * determines the location of the block). + * + * This implementation uses the first approach. + * + * Notice that this is only called for writebacks, which means that L1 + * caches (which see regular Writes), do not support compression. + * @sa CompressedTags + * + * @param blk The block to be overwritten. + * @param data A pointer to the data to be compressed (blk's new data). + * @param writebacks List for any writebacks that need to be performed. + * @return Whether operation is successful or not. + */ + bool updateCompressionData(CacheBlk *blk, const uint64_t* data, + PacketList &writebacks); + /** * Perform any necessary updates to the block and perform any data * exchange between the packet and the block. The flags of the @@ -688,7 +761,7 @@ class BaseCache : public MemObject * @param blk Block to invalidate * @param writebacks Return a list of packets with writebacks */ - virtual void evictBlock(CacheBlk *blk, PacketList &writebacks) = 0; + void evictBlock(CacheBlk *blk, PacketList &writebacks); /** * Invalidate a cache block. @@ -786,6 +859,11 @@ class BaseCache : public MemObject */ const Cycles responseLatency; + /** + * Whether tags and data are accessed sequentially. + */ + const bool sequentialAccess; + /** The number of targets for each MSHR. */ const int numTarget; @@ -834,143 +912,158 @@ class BaseCache : public MemObject /** System we are currently operating in.
*/ System *system; - // Statistics - /** - * @addtogroup CacheStatistics - * @{ - */ - - /** Number of hits per thread for each type of command. - @sa Packet::Command */ - Stats::Vector hits[MemCmd::NUM_MEM_CMDS]; - /** Number of hits for demand accesses. */ - Stats::Formula demandHits; - /** Number of hit for all accesses. */ - Stats::Formula overallHits; - - /** Number of misses per thread for each type of command. - @sa Packet::Command */ - Stats::Vector misses[MemCmd::NUM_MEM_CMDS]; - /** Number of misses for demand accesses. */ - Stats::Formula demandMisses; - /** Number of misses for all accesses. */ - Stats::Formula overallMisses; - - /** - * Total number of cycles per thread/command spent waiting for a miss. - * Used to calculate the average miss latency. - */ - Stats::Vector missLatency[MemCmd::NUM_MEM_CMDS]; - /** Total number of cycles spent waiting for demand misses. */ - Stats::Formula demandMissLatency; - /** Total number of cycles spent waiting for all misses. */ - Stats::Formula overallMissLatency; - - /** The number of accesses per command and thread. */ - Stats::Formula accesses[MemCmd::NUM_MEM_CMDS]; - /** The number of demand accesses. */ - Stats::Formula demandAccesses; - /** The number of overall accesses. */ - Stats::Formula overallAccesses; - - /** The miss rate per command and thread. */ - Stats::Formula missRate[MemCmd::NUM_MEM_CMDS]; - /** The miss rate of all demand accesses. */ - Stats::Formula demandMissRate; - /** The miss rate for all accesses. */ - Stats::Formula overallMissRate; - - /** The average miss latency per command and thread. */ - Stats::Formula avgMissLatency[MemCmd::NUM_MEM_CMDS]; - /** The average miss latency for demand misses. */ - Stats::Formula demandAvgMissLatency; - /** The average miss latency for all misses. */ - Stats::Formula overallAvgMissLatency; - - /** The total number of cycles blocked for each blocked cause. 
*/ - Stats::Vector blocked_cycles; - /** The number of times this cache blocked for each blocked cause. */ - Stats::Vector blocked_causes; - - /** The average number of cycles blocked for each blocked cause. */ - Stats::Formula avg_blocked; - - /** The number of times a HW-prefetched block is evicted w/o reference. */ - Stats::Scalar unusedPrefetches; - - /** Number of blocks written back per thread. */ - Stats::Vector writebacks; - - /** Number of misses that hit in the MSHRs per command and thread. */ - Stats::Vector mshr_hits[MemCmd::NUM_MEM_CMDS]; - /** Demand misses that hit in the MSHRs. */ - Stats::Formula demandMshrHits; - /** Total number of misses that hit in the MSHRs. */ - Stats::Formula overallMshrHits; - - /** Number of misses that miss in the MSHRs, per command and thread. */ - Stats::Vector mshr_misses[MemCmd::NUM_MEM_CMDS]; - /** Demand misses that miss in the MSHRs. */ - Stats::Formula demandMshrMisses; - /** Total number of misses that miss in the MSHRs. */ - Stats::Formula overallMshrMisses; - - /** Number of misses that miss in the MSHRs, per command and thread. */ - Stats::Vector mshr_uncacheable[MemCmd::NUM_MEM_CMDS]; - /** Total number of misses that miss in the MSHRs. */ - Stats::Formula overallMshrUncacheable; - - /** Total cycle latency of each MSHR miss, per command and thread. */ - Stats::Vector mshr_miss_latency[MemCmd::NUM_MEM_CMDS]; - /** Total cycle latency of demand MSHR misses. */ - Stats::Formula demandMshrMissLatency; - /** Total cycle latency of overall MSHR misses. */ - Stats::Formula overallMshrMissLatency; - - /** Total cycle latency of each MSHR miss, per command and thread. */ - Stats::Vector mshr_uncacheable_lat[MemCmd::NUM_MEM_CMDS]; - /** Total cycle latency of overall MSHR misses. */ - Stats::Formula overallMshrUncacheableLatency; - -#if 0 - /** The total number of MSHR accesses per command and thread. */ - Stats::Formula mshrAccesses[MemCmd::NUM_MEM_CMDS]; - /** The total number of demand MSHR accesses. 
*/ - Stats::Formula demandMshrAccesses; - /** The total number of MSHR accesses. */ - Stats::Formula overallMshrAccesses; -#endif - - /** The miss rate in the MSHRs pre command and thread. */ - Stats::Formula mshrMissRate[MemCmd::NUM_MEM_CMDS]; - /** The demand miss rate in the MSHRs. */ - Stats::Formula demandMshrMissRate; - /** The overall miss rate in the MSHRs. */ - Stats::Formula overallMshrMissRate; - - /** The average latency of an MSHR miss, per command and thread. */ - Stats::Formula avgMshrMissLatency[MemCmd::NUM_MEM_CMDS]; - /** The average latency of a demand MSHR miss. */ - Stats::Formula demandAvgMshrMissLatency; - /** The average overall latency of an MSHR miss. */ - Stats::Formula overallAvgMshrMissLatency; - - /** The average latency of an MSHR miss, per command and thread. */ - Stats::Formula avgMshrUncacheableLatency[MemCmd::NUM_MEM_CMDS]; - /** The average overall latency of an MSHR miss. */ - Stats::Formula overallAvgMshrUncacheableLatency; - - /** Number of replacements of valid blocks. */ - Stats::Scalar replacements; - - /** - * @} - */ - - /** - * Register stats for this object. - */ - void regStats() override; + struct CacheCmdStats : public Stats::Group + { + CacheCmdStats(BaseCache &c, const std::string &name); + + /** + * Callback to register stats from parent + * CacheStats::regStats(). We can't use the normal flow since + * there is no guaranteed order and CacheStats::regStats() + * needs to rely on these stats being initialised. + */ + void regStatsFromParent(); + + const BaseCache &cache; + + /** Number of hits per thread for each type of command. + @sa Packet::Command */ + Stats::Vector hits; + /** Number of misses per thread for each type of command. + @sa Packet::Command */ + Stats::Vector misses; + /** + * Total number of cycles per thread/command spent waiting for a miss. + * Used to calculate the average miss latency. + */ + Stats::Vector missLatency; + /** The number of accesses per command and thread.
*/ + Stats::Formula accesses; + /** The miss rate per command and thread. */ + Stats::Formula missRate; + /** The average miss latency per command and thread. */ + Stats::Formula avgMissLatency; + /** Number of misses that hit in the MSHRs per command and thread. */ + Stats::Vector mshr_hits; + /** Number of misses that miss in the MSHRs, per command and thread. */ + Stats::Vector mshr_misses; + /** Number of misses that miss in the MSHRs, per command and thread. */ + Stats::Vector mshr_uncacheable; + /** Total cycle latency of each MSHR miss, per command and thread. */ + Stats::Vector mshr_miss_latency; + /** Total cycle latency of each MSHR miss, per command and thread. */ + Stats::Vector mshr_uncacheable_lat; + /** The miss rate in the MSHRs per command and thread. */ + Stats::Formula mshrMissRate; + /** The average latency of an MSHR miss, per command and thread. */ + Stats::Formula avgMshrMissLatency; + /** The average latency of an MSHR miss, per command and thread. */ + Stats::Formula avgMshrUncacheableLatency; + }; + + struct CacheStats : public Stats::Group + { + CacheStats(BaseCache &c); + + void regStats() override; + + CacheCmdStats &cmdStats(const PacketPtr p) { + return *cmd[p->cmdToIndex()]; + } + + const BaseCache &cache; + + /** Number of hits for demand accesses. */ + Stats::Formula demandHits; + /** Number of hit for all accesses. */ + Stats::Formula overallHits; + + /** Number of misses for demand accesses. */ + Stats::Formula demandMisses; + /** Number of misses for all accesses. */ + Stats::Formula overallMisses; + + /** Total number of cycles spent waiting for demand misses. */ + Stats::Formula demandMissLatency; + /** Total number of cycles spent waiting for all misses. */ + Stats::Formula overallMissLatency; + + /** The number of demand accesses. */ + Stats::Formula demandAccesses; + /** The number of overall accesses. */ + Stats::Formula overallAccesses; + + /** The miss rate of all demand accesses.
*/ + Stats::Formula demandMissRate; + /** The miss rate for all accesses. */ + Stats::Formula overallMissRate; + + /** The average miss latency for demand misses. */ + Stats::Formula demandAvgMissLatency; + /** The average miss latency for all misses. */ + Stats::Formula overallAvgMissLatency; + + /** The total number of cycles blocked for each blocked cause. */ + Stats::Vector blocked_cycles; + /** The number of times this cache blocked for each blocked cause. */ + Stats::Vector blocked_causes; + + /** The average number of cycles blocked for each blocked cause. */ + Stats::Formula avg_blocked; + + /** The number of times a HW-prefetched block is evicted w/o + * reference. */ + Stats::Scalar unusedPrefetches; + + /** Number of blocks written back per thread. */ + Stats::Vector writebacks; + + /** Demand misses that hit in the MSHRs. */ + Stats::Formula demandMshrHits; + /** Total number of misses that hit in the MSHRs. */ + Stats::Formula overallMshrHits; + + /** Demand misses that miss in the MSHRs. */ + Stats::Formula demandMshrMisses; + /** Total number of misses that miss in the MSHRs. */ + Stats::Formula overallMshrMisses; + + /** Total number of misses that miss in the MSHRs. */ + Stats::Formula overallMshrUncacheable; + + /** Total cycle latency of demand MSHR misses. */ + Stats::Formula demandMshrMissLatency; + /** Total cycle latency of overall MSHR misses. */ + Stats::Formula overallMshrMissLatency; + + /** Total cycle latency of overall MSHR misses. */ + Stats::Formula overallMshrUncacheableLatency; + + /** The demand miss rate in the MSHRs. */ + Stats::Formula demandMshrMissRate; + /** The overall miss rate in the MSHRs. */ + Stats::Formula overallMshrMissRate; + + /** The average latency of a demand MSHR miss. */ + Stats::Formula demandAvgMshrMissLatency; + /** The average overall latency of an MSHR miss. */ + Stats::Formula overallAvgMshrMissLatency; + + /** The average overall latency of an MSHR miss. 
*/ + Stats::Formula overallAvgMshrUncacheableLatency; + + /** Number of replacements of valid blocks. */ + Stats::Scalar replacements; + + /** Number of data expansions. */ + Stats::Scalar dataExpansions; + + /** Per-command statistics */ + std::vector> cmd; + } stats; + + /** Registers probes. */ + void regProbePoints() override; public: BaseCache(const BaseCacheParams *p, unsigned blk_size); @@ -978,10 +1071,8 @@ class BaseCache : public MemObject void init() override; - BaseMasterPort &getMasterPort(const std::string &if_name, - PortID idx = InvalidPortID) override; - BaseSlavePort &getSlavePort(const std::string &if_name, - PortID idx = InvalidPortID) override; + Port &getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; /** * Query block size of a cache. @@ -1020,6 +1111,15 @@ class BaseCache : public MemObject Addr blk_addr = pkt->getBlockAddr(blkSize); + // If using compression, on evictions the block is decompressed and + // the operation's latency is added to the payload delay. 
Consume + // that payload delay here, meaning that the data is always stored + // uncompressed in the writebuffer + if (compressor) { + time += pkt->payloadDelay; + pkt->payloadDelay = 0; + } + WriteQueueEntry *wq_entry = writeBuffer.findMatch(blk_addr, pkt->isSecure()); if (wq_entry && !wq_entry->inService) { @@ -1053,7 +1153,7 @@ class BaseCache : public MemObject { uint8_t flag = 1 << cause; if (blocked == 0) { - blocked_causes[cause]++; + stats.blocked_causes[cause]++; blockedCycle = curCycle(); cpuSidePort.setBlocked(); } @@ -1074,7 +1174,7 @@ class BaseCache : public MemObject blocked &= ~flag; DPRINTF(Cache,"Unblocking for cause %d, mask=%d\n", cause, blocked); if (blocked == 0) { - blocked_cycles[cause] += curCycle() - blockedCycle; + stats.blocked_cycles[cause] += curCycle() - blockedCycle; cpuSidePort.clearBlocked(); } } @@ -1096,6 +1196,15 @@ class BaseCache : public MemObject return tags->findBlock(addr, is_secure); } + bool hasBeenPrefetched(Addr addr, bool is_secure) const { + CacheBlk *block = tags->findBlock(addr, is_secure); + if (block) { + return block->wasPrefetched(); + } else { + return false; + } + } + bool inMissQueue(Addr addr, bool is_secure) const { return mshrQueue.findMatch(addr, is_secure); } @@ -1103,7 +1212,7 @@ class BaseCache : public MemObject void incMissCount(PacketPtr pkt) { assert(pkt->req->masterId() < system->maxMasters()); - misses[pkt->cmdToIndex()][pkt->req->masterId()]++; + stats.cmdStats(pkt).misses[pkt->req->masterId()]++; pkt->req->incAccessDepth(); if (missCount) { --missCount; @@ -1114,10 +1223,17 @@ class BaseCache : public MemObject void incHitCount(PacketPtr pkt) { assert(pkt->req->masterId() < system->maxMasters()); - hits[pkt->cmdToIndex()][pkt->req->masterId()]++; - + stats.cmdStats(pkt).hits[pkt->req->masterId()]++; } + /** + * Checks if the cache is coalescing writes + * + * @return True if the cache is coalescing writes + */ + bool coalesce() const; + + /** * Cache block visitor that writes back dirty cache 
blocks using * functional writes. @@ -1158,7 +1274,138 @@ class BaseCache : public MemObject */ void serialize(CheckpointOut &cp) const override; void unserialize(CheckpointIn &cp) override; +}; + +/** + * The write allocator inspects write packets and detects streaming + * patterns. The write allocator supports a single stream where writes + * are expected to access consecutive locations and keeps track of + * size of the area covered by the consecutive writes in byteCount. + * + * 1) When byteCount has surpassed the coalesceLimit the mode + * switches from ALLOCATE to COALESCE where writes should be delayed + * until the whole block is written at which point a single packet + * (whole line write) can service them. + * + * 2) When byteCount has also exceeded the noAllocateLimit (whole + * line) we switch to NO_ALLOCATE when writes should not allocate in + * the cache but rather send a whole line write to the memory below. + */ +class WriteAllocator : public SimObject { + public: + WriteAllocator(const WriteAllocatorParams *p) : + SimObject(p), + coalesceLimit(p->coalesce_limit * p->block_size), + noAllocateLimit(p->no_allocate_limit * p->block_size), + delayThreshold(p->delay_threshold) + { + reset(); + } + + /** + * Should writes be coalesced? This is true if the mode is set to + * COALESCE or NO_ALLOCATE. + * + * @return return true if the cache should coalesce writes. + */ + bool coalesce() const { + return mode != WriteMode::ALLOCATE; + } + + /** + * Should writes allocate? + * + * @return return true if the cache should allocate for writes. + */ + bool allocate() const { + return mode != WriteMode::NO_ALLOCATE; + } + + /** + * Reset the write allocator state, meaning that it allocates for + * writes and has not recorded any information about qualifying + * writes that might trigger a switch to coalescing and later no + * allocation.
+ */ + void reset() { + mode = WriteMode::ALLOCATE; + byteCount = 0; + nextAddr = 0; + } + + /** + * Assess whether we need to delay the current write. + * + * @param blk_addr The block address the packet writes to + * @return true if the current packet should be delayed + */ + bool delay(Addr blk_addr) { + if (delayCtr[blk_addr] > 0) { + --delayCtr[blk_addr]; + return true; + } else { + return false; + } + } + + /** + * Clear delay counter for the input block + * + * @param blk_addr The accessed cache block + */ + void resetDelay(Addr blk_addr) { + delayCtr.erase(blk_addr); + } + + /** + * Update the write mode based on the current write + * packet. This method compares the packet's address with any + * current stream, and updates the tracking and the mode + * accordingly. + * + * @param write_addr Start address of the write request + * @param write_size Size of the write request + * @param blk_addr The block address that this packet writes to + */ + void updateMode(Addr write_addr, unsigned write_size, Addr blk_addr); + private: + /** + * The current mode for write coalescing and allocation, either + * normal operation (ALLOCATE), write coalescing (COALESCE), or + * write coalescing without allocation (NO_ALLOCATE). + */ + enum class WriteMode : char { + ALLOCATE, + COALESCE, + NO_ALLOCATE, + }; + WriteMode mode; + + /** Address to match writes against to detect streams. */ + Addr nextAddr; + + /** + * Bytes written contiguously. Saturating once we no longer + * allocate. + */ + uint32_t byteCount; + + /** + * Limits for when to switch between the different write modes. + */ + const uint32_t coalesceLimit; + const uint32_t noAllocateLimit; + /** + * The number of times the allocator will delay a WriteReq MSHR. + */ + const uint32_t delayThreshold; + + /** + * Keep track of the number of times the allocator has delayed a + * WriteReq MSHR. + */ + std::unordered_map delayCtr; }; #endif //__MEM_CACHE_BASE_HH__