mem-cache: Make (de)compression latencies params
authorDaniel R. Carvalho <odanrc@yahoo.com.br>
Thu, 22 Oct 2020 16:45:12 +0000 (18:45 +0200)
committerDaniel Carvalho <odanrc@yahoo.com.br>
Fri, 23 Oct 2020 21:49:11 +0000 (21:49 +0000)
Add 4 params to calculate compression and decompression latencies.
A pair of params informs how many chunks are parsed per cycle, and
the other pair informs how many extra cycles are needed after the
chunks are parsed to finish the (de)compression.

Change-Id: Ie67b0c298f06a08011f553789e3a9a1d89dd7c4f
Signed-off-by: Daniel R. Carvalho <odanrc@yahoo.com.br>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/36497
Reviewed-by: Nikos Nikoleris <nikos.nikoleris@arm.com>
Maintainer: Nikos Nikoleris <nikos.nikoleris@arm.com>
Tested-by: kokoro <noreply+kokoro@google.com>
12 files changed:
src/mem/cache/compressors/Compressors.py
src/mem/cache/compressors/base.cc
src/mem/cache/compressors/base.hh
src/mem/cache/compressors/base_delta_impl.hh
src/mem/cache/compressors/cpack.cc
src/mem/cache/compressors/cpack.hh
src/mem/cache/compressors/dictionary_compressor.hh
src/mem/cache/compressors/dictionary_compressor_impl.hh
src/mem/cache/compressors/fpcd.cc
src/mem/cache/compressors/fpcd.hh
src/mem/cache/compressors/multi.cc
src/mem/cache/compressors/perfect.cc

index 689a42e2e97df1d0d85fa52588cc4331c7955d2d..a1f3706fd6aeae9b2485e8fd4a55f07bdcf66193 100644 (file)
@@ -41,6 +41,15 @@ class BaseCacheCompressor(SimObject):
         "Minimum percentage of the block size, a compressed block must "
         "achieve to be stored in compressed format")
 
+    comp_chunks_per_cycle = Param.Unsigned(1,
+        "Number of chunks that can be compressed in parallel per cycle.")
+    comp_extra_latency = Param.Cycles(1, "Number of extra cycles required "
+        "to finish compression (e.g., due to shifting and packaging).")
+    decomp_chunks_per_cycle = Param.Unsigned(1,
+        "Number of chunks that can be decompressed in parallel per cycle.")
+    decomp_extra_latency = Param.Cycles(1, "Number of extra cycles required "
+        "to finish decompression (e.g., due to shifting and packaging).")
+
 class BaseDictionaryCompressor(BaseCacheCompressor):
     type = 'BaseDictionaryCompressor'
     abstract = True
@@ -57,6 +66,12 @@ class Base64Delta8(BaseDictionaryCompressor):
 
     chunk_size_bits = 64
 
+    # Base-delta compressors achieve 1-cycle latencies
+    comp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+    comp_extra_latency = 0
+    decomp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+    decomp_extra_latency = 0
+
 class Base64Delta16(BaseDictionaryCompressor):
     type = 'Base64Delta16'
     cxx_class = 'Compressor::Base64Delta16'
@@ -64,6 +79,12 @@ class Base64Delta16(BaseDictionaryCompressor):
 
     chunk_size_bits = 64
 
+    # Base-delta compressors achieve 1-cycle latencies
+    comp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+    comp_extra_latency = 0
+    decomp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+    decomp_extra_latency = 0
+
 class Base64Delta32(BaseDictionaryCompressor):
     type = 'Base64Delta32'
     cxx_class = 'Compressor::Base64Delta32'
@@ -71,6 +92,12 @@ class Base64Delta32(BaseDictionaryCompressor):
 
     chunk_size_bits = 64
 
+    # Base-delta compressors achieve 1-cycle latencies
+    comp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+    comp_extra_latency = 0
+    decomp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+    decomp_extra_latency = 0
+
 class Base32Delta8(BaseDictionaryCompressor):
     type = 'Base32Delta8'
     cxx_class = 'Compressor::Base32Delta8'
@@ -78,6 +105,12 @@ class Base32Delta8(BaseDictionaryCompressor):
 
     chunk_size_bits = 32
 
+    # Base-delta compressors achieve 1-cycle latencies
+    comp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+    comp_extra_latency = 0
+    decomp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+    decomp_extra_latency = 0
+
 class Base32Delta16(BaseDictionaryCompressor):
     type = 'Base32Delta16'
     cxx_class = 'Compressor::Base32Delta16'
@@ -85,6 +118,12 @@ class Base32Delta16(BaseDictionaryCompressor):
 
     chunk_size_bits = 32
 
+    # Base-delta compressors achieve 1-cycle latencies
+    comp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+    comp_extra_latency = 0
+    decomp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+    decomp_extra_latency = 0
+
 class Base16Delta8(BaseDictionaryCompressor):
     type = 'Base16Delta8'
     cxx_class = 'Compressor::Base16Delta8'
@@ -92,16 +131,36 @@ class Base16Delta8(BaseDictionaryCompressor):
 
     chunk_size_bits = 16
 
+    # Base-delta compressors achieve 1-cycle latencies
+    comp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+    comp_extra_latency = 0
+    decomp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+    decomp_extra_latency = 0
+
 class CPack(BaseDictionaryCompressor):
     type = 'CPack'
     cxx_class = 'Compressor::CPack'
     cxx_header = "mem/cache/compressors/cpack.hh"
 
+    comp_chunks_per_cycle = 2
+    # Accounts for pattern matching, length generation, packaging and shifting
+    comp_extra_latency = 5
+    decomp_chunks_per_cycle = 2
+    decomp_extra_latency = 1
+
 class FPCD(BaseDictionaryCompressor):
     type = 'FPCD'
     cxx_class = 'Compressor::FPCD'
     cxx_header = "mem/cache/compressors/fpcd.hh"
 
+    # Accounts for checking all patterns, selecting patterns, and shifting
+    # The original claim of a decompression latency of 2 cycles would likely
+    # generate an unrealistically complex circuit
+    comp_chunks_per_cycle = 4
+    comp_extra_latency = 1
+    decomp_chunks_per_cycle = 4
+    decomp_extra_latency = 0
+
     dictionary_size = 2
 
 class MultiCompressor(BaseCacheCompressor):
@@ -116,8 +175,17 @@ class MultiCompressor(BaseCacheCompressor):
     encoding_in_tags = Param.Bool(False, "If set the bits to inform which "
         "sub-compressor compressed some data are added to its corresponding "
         "tag entry.")
-    extra_decomp_lat = Param.Unsigned(0, "Extra latency to be added to the "
-        "sub-compressor's decompression latency")
+
+    # Use the sub-compressors' latencies
+    comp_chunks_per_cycle = 0
+    decomp_chunks_per_cycle = 0
+
+    # Assume extra 1 cycle to select the results of the winning sub-compressor
+    comp_extra_latency = 1
+
+    # Multi-compressors may need a couple of extra cycles to select
+    # which sub-compressor should be used to decompress the data
+    decomp_extra_latency = 1
 
 class PerfectCompressor(BaseCacheCompressor):
     type = 'PerfectCompressor'
@@ -128,10 +196,11 @@ class PerfectCompressor(BaseCacheCompressor):
 
     max_compression_ratio = Param.Int("Maximum compression ratio allowed")
 
-    compression_latency = Param.Cycles(1,
-        "Number of cycles to perform data compression")
-    decompression_latency = Param.Cycles(1,
-        "Number of cycles to perform data decompression")
+    # In a perfect world compression and decompression happen in 1 cycle
+    comp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+    comp_extra_latency = 0
+    decomp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+    decomp_extra_latency = 0
 
 class RepeatedQwordsCompressor(BaseDictionaryCompressor):
     type = 'RepeatedQwordsCompressor'
@@ -140,6 +209,12 @@ class RepeatedQwordsCompressor(BaseDictionaryCompressor):
 
     chunk_size_bits = 64
 
+    # Assume 1-cycle latencies
+    comp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+    comp_extra_latency = 0
+    decomp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+    decomp_extra_latency = 0
+
 class ZeroCompressor(BaseDictionaryCompressor):
     type = 'ZeroCompressor'
     cxx_class = 'Compressor::Zero'
@@ -147,8 +222,13 @@ class ZeroCompressor(BaseDictionaryCompressor):
 
     chunk_size_bits = 64
 
+    # Assume 1-cycle latencies
+    comp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+    comp_extra_latency = 0
+    decomp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+    decomp_extra_latency = 0
+
 class BDI(MultiCompressor):
-    encoding_in_tags=True
     compressors = [
         ZeroCompressor(size_threshold_percentage=99),
         RepeatedQwordsCompressor(size_threshold_percentage=99),
@@ -159,3 +239,8 @@ class BDI(MultiCompressor):
         Base32Delta16(size_threshold_percentage=99),
         Base16Delta8(size_threshold_percentage=99),
     ]
+
+    # By default assume that the encoding is stored in the tags, and is
+    # retrieved and decoded while (and ends before) the data is being read.
+    decomp_extra_latency = 0
+    encoding_in_tags=True
index 3e392ae0b5db660f72df68d6c4df076d5d3e6462..9778605bf70b6d052126324bae8dcfe2cfd9fa3e 100644 (file)
@@ -78,11 +78,22 @@ Base::CompressionData::getSize() const
 Base::Base(const Params &p)
   : SimObject(p), blkSize(p.block_size), chunkSizeBits(p.chunk_size_bits),
     sizeThreshold((blkSize * p.size_threshold_percentage) / 100),
+    compChunksPerCycle(p.comp_chunks_per_cycle),
+    compExtraLatency(p.comp_extra_latency),
+    decompChunksPerCycle(p.decomp_chunks_per_cycle),
+    decompExtraLatency(p.decomp_extra_latency),
     stats(*this)
 {
     fatal_if(64 % chunkSizeBits,
         "64 must be a multiple of the chunk granularity.");
 
+    fatal_if(((CHAR_BIT * blkSize) / chunkSizeBits) < compChunksPerCycle,
+        "Compressor processes more chunks per cycle than the number of "
+        "chunks in the input");
+    fatal_if(((CHAR_BIT * blkSize) / chunkSizeBits) < decompChunksPerCycle,
+        "Decompressor processes more chunks per cycle than the number of "
+        "chunks in the input");
+
     fatal_if(blkSize < sizeThreshold, "Compressed data must fit in a block");
 }
 
index 77dcc924824ef43a66690de2823bcf38d0793ec8..767eda5281ecd33e49b43d06691fa8b2a6841d95 100644 (file)
@@ -95,6 +95,30 @@ class Base : public SimObject
      */
     const std::size_t sizeThreshold;
 
+    /**
+     * Degree of parallelization of the compression process. It is the
+     * number of chunks that can be processed in a cycle.
+     */
+    const Cycles compChunksPerCycle;
+
+    /**
+     * Extra latency added to compression due to packaging, shifting or
+     * other operations.
+     */
+    const Cycles compExtraLatency;
+
+    /**
+     * Degree of parallelization of the decompression process. It is the
+     * number of chunks that can be processed in a cycle.
+     */
+    const Cycles decompChunksPerCycle;
+
+    /**
+     * Extra latency added to decompression due to packaging, shifting or
+     * other operations.
+     */
+    const Cycles decompExtraLatency;
+
     struct BaseStats : public Stats::Group
     {
         const Base& compressor;
index 9686cb75b2e3703672e56560e00d357166254c91..5b94f048e97f2957e4e33845fa5fa0ab56a28dd8 100644 (file)
@@ -72,7 +72,7 @@ BaseDelta<BaseType, DeltaSizeBits>::compress(
     Cycles& decomp_lat)
 {
     std::unique_ptr<Base::CompressionData> comp_data =
-        DictionaryCompressor<BaseType>::compress(chunks);
+        DictionaryCompressor<BaseType>::compress(chunks, comp_lat, decomp_lat);
 
     // If there are more bases than the maximum, the compressor failed.
     // Otherwise, we have to take into account all bases that have not
@@ -89,14 +89,6 @@ BaseDelta<BaseType, DeltaSizeBits>::compress(
             8 * sizeof(BaseType) * diff);
     }
 
-    // Set compression latency (Assumes 1 cycle per entry and 1 cycle for
-    // packing)
-    comp_lat = Cycles(1 + (DictionaryCompressor<BaseType>::blkSize /
-        sizeof(BaseType)));
-
-    // Set decompression latency
-    decomp_lat = Cycles(1);
-
     // Return compressed line
     return comp_data;
 }
index fe8af164fe79c60619cb3948aeefd64e367df52a..bf241d0ed8a0ccd319d2503256055198b9b3c71e 100644 (file)
@@ -49,24 +49,6 @@ CPack::addToDictionary(DictionaryEntry data)
     dictionary[numEntries++] = data;
 }
 
-std::unique_ptr<Base::CompressionData>
-CPack::compress(const std::vector<Chunk>& chunks,
-    Cycles& comp_lat, Cycles& decomp_lat)
-{
-    std::unique_ptr<Base::CompressionData> comp_data =
-        DictionaryCompressor<uint32_t>::compress(chunks);
-
-    // Set compression latency (Accounts for pattern matching, length
-    // generation, packaging and shifting)
-    comp_lat = Cycles(blkSize/8+5);
-
-    // Set decompression latency (1 qword per cycle)
-    decomp_lat = Cycles(blkSize/8);
-
-    // Return compressed line
-    return comp_data;
-}
-
 } // namespace Compressor
 
 Compressor::CPack*
index 694ba816f6c4b0feb1b7c3bc15d4996b5f2ed1d1..480c0dce99b84a0e31b5b6aab614ae7d8e3d2290 100644 (file)
@@ -98,10 +98,6 @@ class CPack : public DictionaryCompressor<uint32_t>
 
     void addToDictionary(DictionaryEntry data) override;
 
-    std::unique_ptr<Base::CompressionData> compress(
-        const std::vector<Base::Chunk>& chunks,
-        Cycles& comp_lat, Cycles& decomp_lat) override;
-
   public:
     /** Convenience typedef. */
      typedef CPackParams Params;
index 5b8ca239c1cf0cbc3cfca48190b8f2c0958368b7..e5d67d7ed57220bc97eacec9961f1a0caa1b418a 100644 (file)
@@ -239,14 +239,12 @@ class DictionaryCompressor : public BaseDictionaryCompressor
     std::unique_ptr<Base::CompressionData> compress(
         const std::vector<Chunk>& chunks);
 
+    std::unique_ptr<Base::CompressionData> compress(
+        const std::vector<Chunk>& chunks,
+        Cycles& comp_lat, Cycles& decomp_lat) override;
+
     using BaseDictionaryCompressor::compress;
 
-    /**
-     * Decompress data.
-     *
-     * @param comp_data Compressed cache line.
-     * @param data The cache line to be decompressed.
-     */
     void decompress(const CompressionData* comp_data, uint64_t* data) override;
 
     /**
index 36a58d085e4cf32c30f906854465042cb7ef1ee1..2a228129126b23de76fabb189623a77d96416803 100644 (file)
@@ -144,6 +144,21 @@ DictionaryCompressor<T>::compress(const std::vector<Chunk>& chunks)
     return comp_data;
 }
 
+template <class T>
+std::unique_ptr<Base::CompressionData>
+DictionaryCompressor<T>::compress(const std::vector<Chunk>& chunks,
+    Cycles& comp_lat, Cycles& decomp_lat)
+{
+    // Set latencies based on the degree of parallelization, and any extra
+    // latencies due to shifting or packaging
+    comp_lat = Cycles(compExtraLatency +
+        (chunks.size() / compChunksPerCycle));
+    decomp_lat = Cycles(decompExtraLatency +
+        (chunks.size() / decompChunksPerCycle));
+
+    return compress(chunks);
+}
+
 template <class T>
 T
 DictionaryCompressor<T>::decompressValue(const Pattern* pattern)
index 29ee1d3bd85bdc3e9da1b3aa331802ef53799c77..67a9129ccb10201f0fbc7dd950dcc77212c3bd76 100644 (file)
@@ -54,26 +54,6 @@ FPCD::addToDictionary(DictionaryEntry data)
     }
 }
 
-std::unique_ptr<Base::CompressionData>
-FPCD::compress(const std::vector<Chunk>& chunks,
-    Cycles& comp_lat, Cycles& decomp_lat)
-{
-    std::unique_ptr<Base::CompressionData> comp_data =
-        DictionaryCompressor<uint32_t>::compress(chunks);
-
-    // Set compression latency (Accounts for zero checks, ones check, match
-    // previous check, match penultimate check, repeated values check, pattern
-    // selection, shifting, at a rate of 16B per cycle)
-    comp_lat = Cycles(blkSize/2);
-
-    // Set decompression latency. The original claim of 2 cycles is likely
-    // too unrealistic
-    decomp_lat = Cycles(4);
-
-    // Return compressed line
-    return comp_data;
-}
-
 } // namespace Compressor
 
 Compressor::FPCD*
index 6c4eac1b113355fd9ed38f9b1e2246f19b1fc793..a6a27c3d493771ddb2af975c75e42f65ba3cf187 100644 (file)
@@ -139,10 +139,6 @@ class FPCD : public DictionaryCompressor<uint32_t>
 
     void addToDictionary(DictionaryEntry data) override;
 
-    std::unique_ptr<Base::CompressionData> compress(
-        const std::vector<Base::Chunk>& chunks,
-        Cycles& comp_lat, Cycles& decomp_lat) override;
-
   public:
     typedef FPCDParams Params;
     FPCD(const Params &p);
index 241f5deccccf69e55758b50c46287988abd54402..16d848cc7641769fcda9f6c9182dd73b6a8d8a1d 100644 (file)
@@ -61,7 +61,6 @@ Multi::Multi(const Params &p)
   : Base(p), compressors(p.compressors),
     numEncodingBits(p.encoding_in_tags ? 0 :
         std::log2(alignToPowerOfTwo(compressors.size()))),
-    extraDecompressionLatency(p.extra_decomp_lat),
     multiStats(stats, *this)
 {
     fatal_if(compressors.size() == 0, "There must be at least one compressor");
@@ -153,7 +152,7 @@ Multi::compress(const std::vector<Chunk>& chunks, Cycles& comp_lat,
     DPRINTF(CacheComp, "Best compressor: %d\n", best_index);
 
     // Set decompression latency of the best compressor
-    decomp_lat = results.top()->decompLat + extraDecompressionLatency;
+    decomp_lat = results.top()->decompLat + decompExtraLatency;
 
     // Update compressor ranking stats
     for (int rank = 0; rank < compressors.size(); rank++) {
@@ -163,7 +162,7 @@ Multi::compress(const std::vector<Chunk>& chunks, Cycles& comp_lat,
 
     // Set compression latency (compression latency of the slowest compressor
     // and 1 cycle to pack)
-    comp_lat = Cycles(max_comp_lat + 1);
+    comp_lat = Cycles(max_comp_lat + compExtraLatency);
 
     return multi_comp_data;
 }
index 064ea33ae5bc5ca4716626de049338d6891cd063..54fb4061ea6611226177bfa77458833aab97c9f6 100644 (file)
@@ -41,9 +41,7 @@
 namespace Compressor {
 
 Perfect::Perfect(const Params &p)
-  : Base(p), compressedSize(8 * blkSize / p.max_compression_ratio),
-    compressionLatency(p.compression_latency),
-    decompressionLatency(p.decompression_latency)
+  : Base(p), compressedSize(8 * blkSize / p.max_compression_ratio)
 {
 }
 
@@ -56,8 +54,12 @@ Perfect::compress(const std::vector<Chunk>& chunks,
 
     // Set relevant metadata
     comp_data->setSizeBits(compressedSize);
-    comp_lat = compressionLatency;
-    decomp_lat = decompressionLatency;
+
+    // Set latencies based on the degree of parallelization, and any extra
+    // latencies due to shifting or packaging
+    comp_lat = Cycles((chunks.size() / compChunksPerCycle) + compExtraLatency);
+    decomp_lat = Cycles((chunks.size() / decompChunksPerCycle) +
+        decompExtraLatency);
 
     return comp_data;
 }