Add 4 params to calculate compression and decompression latencies.
A pair of params informs how many chunks are parsed per cycle, and
the other pair informs how many extra cycles are needed after the
chunks are parsed to finish the (de)compression.
Change-Id: Ie67b0c298f06a08011f553789e3a9a1d89dd7c4f
Signed-off-by: Daniel R. Carvalho <odanrc@yahoo.com.br>
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/36497
Reviewed-by: Nikos Nikoleris <nikos.nikoleris@arm.com>
Maintainer: Nikos Nikoleris <nikos.nikoleris@arm.com>
Tested-by: kokoro <noreply+kokoro@google.com>
"Minimum percentage of the block size, a compressed block must "
"achieve to be stored in compressed format")
+ comp_chunks_per_cycle = Param.Unsigned(1,
+ "Number of chunks that can be compressed in parallel per cycle.")
+ comp_extra_latency = Param.Cycles(1, "Number of extra cycles required "
+ "to finish compression (e.g., due to shifting and packaging).")
+ decomp_chunks_per_cycle = Param.Unsigned(1,
+ "Number of chunks that can be decompressed in parallel per cycle.")
+ decomp_extra_latency = Param.Cycles(1, "Number of extra cycles required "
+ "to finish decompression (e.g., due to shifting and packaging).")
+
class BaseDictionaryCompressor(BaseCacheCompressor):
type = 'BaseDictionaryCompressor'
abstract = True
chunk_size_bits = 64
+ # Assume all chunks are processed in parallel, yielding 1-cycle latencies
+ comp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+ comp_extra_latency = 0
+ decomp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+ decomp_extra_latency = 0
+
class Base64Delta16(BaseDictionaryCompressor):
type = 'Base64Delta16'
cxx_class = 'Compressor::Base64Delta16'
chunk_size_bits = 64
+ # Base-delta compressors achieve 1-cycle latencies
+ comp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+ comp_extra_latency = 0
+ decomp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+ decomp_extra_latency = 0
+
class Base64Delta32(BaseDictionaryCompressor):
type = 'Base64Delta32'
cxx_class = 'Compressor::Base64Delta32'
chunk_size_bits = 64
+ # Base-delta compressors achieve 1-cycle latencies
+ comp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+ comp_extra_latency = 0
+ decomp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+ decomp_extra_latency = 0
+
class Base32Delta8(BaseDictionaryCompressor):
type = 'Base32Delta8'
cxx_class = 'Compressor::Base32Delta8'
chunk_size_bits = 32
+ # Base-delta compressors achieve 1-cycle latencies
+ comp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+ comp_extra_latency = 0
+ decomp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+ decomp_extra_latency = 0
+
class Base32Delta16(BaseDictionaryCompressor):
type = 'Base32Delta16'
cxx_class = 'Compressor::Base32Delta16'
chunk_size_bits = 32
+ # Base-delta compressors achieve 1-cycle latencies
+ comp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+ comp_extra_latency = 0
+ decomp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+ decomp_extra_latency = 0
+
class Base16Delta8(BaseDictionaryCompressor):
type = 'Base16Delta8'
cxx_class = 'Compressor::Base16Delta8'
chunk_size_bits = 16
+ # Base-delta compressors achieve 1-cycle latencies
+ comp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+ comp_extra_latency = 0
+ decomp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+ decomp_extra_latency = 0
+
class CPack(BaseDictionaryCompressor):
type = 'CPack'
cxx_class = 'Compressor::CPack'
cxx_header = "mem/cache/compressors/cpack.hh"
+ comp_chunks_per_cycle = 2
+ # Accounts for pattern matching, length generation, packaging and shifting
+ comp_extra_latency = 5
+ decomp_chunks_per_cycle = 2
+ decomp_extra_latency = 1
+
class FPCD(BaseDictionaryCompressor):
type = 'FPCD'
cxx_class = 'Compressor::FPCD'
cxx_header = "mem/cache/compressors/fpcd.hh"
+ # Accounts for checking all patterns, selecting patterns, and shifting
+ # The original claim of a decompression latency of 2 cycles would likely
+ # generate an unrealistically complex circuit
+ comp_chunks_per_cycle = 4
+ comp_extra_latency = 1
+ decomp_chunks_per_cycle = 4
+ decomp_extra_latency = 0
+
dictionary_size = 2
class MultiCompressor(BaseCacheCompressor):
encoding_in_tags = Param.Bool(False, "If set the bits to inform which "
"sub-compressor compressed some data are added to its corresponding "
"tag entry.")
- extra_decomp_lat = Param.Unsigned(0, "Extra latency to be added to the "
- "sub-compressor's decompression latency")
+
+ # Use the sub-compressors' latencies
+ comp_chunks_per_cycle = 0
+ decomp_chunks_per_cycle = 0
+
+ # Assume extra 1 cycle to select the results of the winning sub-compressor
+ comp_extra_latency = 1
+
+ # Multi-compressors may need a couple of extra cycles to select
+ # which sub-compressor should be used to decompress the data
+ decomp_extra_latency = 1
class PerfectCompressor(BaseCacheCompressor):
type = 'PerfectCompressor'
max_compression_ratio = Param.Int("Maximum compression ratio allowed")
- compression_latency = Param.Cycles(1,
- "Number of cycles to perform data compression")
- decompression_latency = Param.Cycles(1,
- "Number of cycles to perform data decompression")
+ # In a perfect world compression and decompression happen in 1 cycle
+ comp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+ comp_extra_latency = 0
+ decomp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+ decomp_extra_latency = 0
class RepeatedQwordsCompressor(BaseDictionaryCompressor):
type = 'RepeatedQwordsCompressor'
chunk_size_bits = 64
+ # Assume 1-cycle latencies
+ comp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+ comp_extra_latency = 0
+ decomp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+ decomp_extra_latency = 0
+
class ZeroCompressor(BaseDictionaryCompressor):
type = 'ZeroCompressor'
cxx_class = 'Compressor::Zero'
chunk_size_bits = 64
+ # Assume 1-cycle latencies
+ comp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+ comp_extra_latency = 0
+ decomp_chunks_per_cycle = 8 * Self.block_size / Self.chunk_size_bits
+ decomp_extra_latency = 0
+
class BDI(MultiCompressor):
- encoding_in_tags=True
compressors = [
ZeroCompressor(size_threshold_percentage=99),
RepeatedQwordsCompressor(size_threshold_percentage=99),
Base32Delta16(size_threshold_percentage=99),
Base16Delta8(size_threshold_percentage=99),
]
+
+ # By default assume that the encoding is stored in the tags, and is
+ # retrieved and decoded while (and ends before) the data is being read.
+ decomp_extra_latency = 0
+ encoding_in_tags=True
Base::Base(const Params &p)
: SimObject(p), blkSize(p.block_size), chunkSizeBits(p.chunk_size_bits),
sizeThreshold((blkSize * p.size_threshold_percentage) / 100),
+ compChunksPerCycle(p.comp_chunks_per_cycle),
+ compExtraLatency(p.comp_extra_latency),
+ decompChunksPerCycle(p.decomp_chunks_per_cycle),
+ decompExtraLatency(p.decomp_extra_latency),
stats(*this)
{
fatal_if(64 % chunkSizeBits,
"64 must be a multiple of the chunk granularity.");
+ // A (de)compressor cannot parse more chunks per cycle than the number
+ // of chunks a block contains, i.e. (CHAR_BIT * blkSize) / chunkSizeBits.
+ // NOTE(review): a value of 0 passes these checks (MultiCompressor uses 0
+ // to mean "defer to the sub-compressors' latencies") — confirm no
+ // divide-by-zero latency computation is reachable for such compressors.
+ fatal_if(((CHAR_BIT * blkSize) / chunkSizeBits) < compChunksPerCycle,
+ "Compressor processes more chunks per cycle than the number of "
+ "chunks in the input");
+ fatal_if(((CHAR_BIT * blkSize) / chunkSizeBits) < decompChunksPerCycle,
+ "Decompressor processes more chunks per cycle than the number of "
+ "chunks in the input");
+
fatal_if(blkSize < sizeThreshold, "Compressed data must fit in a block");
}
*/
const std::size_t sizeThreshold;
+ /**
+ * Degree of parallelization of the compression process. It is the
+ * number of chunks that can be processed in a cycle.
+ */
+ const Cycles compChunksPerCycle;
+
+ /**
+ * Extra latency added to compression due to packaging, shifting or
+ * other operations.
+ */
+ const Cycles compExtraLatency;
+
+ /**
+ * Degree of parallelization of the decompression process. It is the
+ * number of chunks that can be processed in a cycle.
+ */
+ const Cycles decompChunksPerCycle;
+
+ /**
+ * Extra latency added to decompression due to packaging, shifting or
+ * other operations.
+ */
+ const Cycles decompExtraLatency;
+
struct BaseStats : public Stats::Group
{
const Base& compressor;
Cycles& decomp_lat)
{
std::unique_ptr<Base::CompressionData> comp_data =
- DictionaryCompressor<BaseType>::compress(chunks);
+ DictionaryCompressor<BaseType>::compress(chunks, comp_lat, decomp_lat);
// If there are more bases than the maximum, the compressor failed.
// Otherwise, we have to take into account all bases that have not
8 * sizeof(BaseType) * diff);
}
- // Set compression latency (Assumes 1 cycle per entry and 1 cycle for
- // packing)
- comp_lat = Cycles(1 + (DictionaryCompressor<BaseType>::blkSize /
- sizeof(BaseType)));
-
- // Set decompression latency
- decomp_lat = Cycles(1);
-
// Return compressed line
return comp_data;
}
dictionary[numEntries++] = data;
}
-std::unique_ptr<Base::CompressionData>
-CPack::compress(const std::vector<Chunk>& chunks,
- Cycles& comp_lat, Cycles& decomp_lat)
-{
- std::unique_ptr<Base::CompressionData> comp_data =
- DictionaryCompressor<uint32_t>::compress(chunks);
-
- // Set compression latency (Accounts for pattern matching, length
- // generation, packaging and shifting)
- comp_lat = Cycles(blkSize/8+5);
-
- // Set decompression latency (1 qword per cycle)
- decomp_lat = Cycles(blkSize/8);
-
- // Return compressed line
- return comp_data;
-}
-
} // namespace Compressor
Compressor::CPack*
void addToDictionary(DictionaryEntry data) override;
- std::unique_ptr<Base::CompressionData> compress(
- const std::vector<Base::Chunk>& chunks,
- Cycles& comp_lat, Cycles& decomp_lat) override;
-
public:
/** Convenience typedef. */
typedef CPackParams Params;
std::unique_ptr<Base::CompressionData> compress(
const std::vector<Chunk>& chunks);
+ std::unique_ptr<Base::CompressionData> compress(
+ const std::vector<Chunk>& chunks,
+ Cycles& comp_lat, Cycles& decomp_lat) override;
+
using BaseDictionaryCompressor::compress;
- /**
- * Decompress data.
- *
- * @param comp_data Compressed cache line.
- * @param data The cache line to be decompressed.
- */
void decompress(const CompressionData* comp_data, uint64_t* data) override;
/**
return comp_data;
}
+/**
+ * Latency-aware compress(): derives the compression and decompression
+ * latencies from the degree of parallelization (chunks processed per
+ * cycle) plus a fixed number of extra cycles, then delegates the actual
+ * encoding to the latency-agnostic compress(chunks) overload.
+ *
+ * NOTE(review): chunks.size() / chunksPerCycle truncates, so a trailing
+ * partial group of chunks contributes no cycle — confirm floor (rather
+ * than ceil) is the intended rounding. Also assumes both chunks-per-cycle
+ * values are non-zero for dictionary-based compressors.
+ */
+template <class T>
+std::unique_ptr<Base::CompressionData>
+DictionaryCompressor<T>::compress(const std::vector<Chunk>& chunks,
+ Cycles& comp_lat, Cycles& decomp_lat)
+{
+ // Set latencies based on the degree of parallelization, and any extra
+ // latencies due to shifting or packaging
+ comp_lat = Cycles(compExtraLatency +
+ (chunks.size() / compChunksPerCycle));
+ decomp_lat = Cycles(decompExtraLatency +
+ (chunks.size() / decompChunksPerCycle));
+
+ return compress(chunks);
+}
+
template <class T>
T
DictionaryCompressor<T>::decompressValue(const Pattern* pattern)
}
}
-std::unique_ptr<Base::CompressionData>
-FPCD::compress(const std::vector<Chunk>& chunks,
- Cycles& comp_lat, Cycles& decomp_lat)
-{
- std::unique_ptr<Base::CompressionData> comp_data =
- DictionaryCompressor<uint32_t>::compress(chunks);
-
- // Set compression latency (Accounts for zero checks, ones check, match
- // previous check, match penultimate check, repeated values check, pattern
- // selection, shifting, at a rate of 16B per cycle)
- comp_lat = Cycles(blkSize/2);
-
- // Set decompression latency. The original claim of 2 cycles is likely
- // too unrealistic
- decomp_lat = Cycles(4);
-
- // Return compressed line
- return comp_data;
-}
-
} // namespace Compressor
Compressor::FPCD*
void addToDictionary(DictionaryEntry data) override;
- std::unique_ptr<Base::CompressionData> compress(
- const std::vector<Base::Chunk>& chunks,
- Cycles& comp_lat, Cycles& decomp_lat) override;
-
public:
typedef FPCDParams Params;
FPCD(const Params &p);
: Base(p), compressors(p.compressors),
numEncodingBits(p.encoding_in_tags ? 0 :
std::log2(alignToPowerOfTwo(compressors.size()))),
- extraDecompressionLatency(p.extra_decomp_lat),
multiStats(stats, *this)
{
fatal_if(compressors.size() == 0, "There must be at least one compressor");
DPRINTF(CacheComp, "Best compressor: %d\n", best_index);
// Set decompression latency of the best compressor
- decomp_lat = results.top()->decompLat + extraDecompressionLatency;
+ decomp_lat = results.top()->decompLat + decompExtraLatency;
// Update compressor ranking stats
for (int rank = 0; rank < compressors.size(); rank++) {
// Set compression latency (compression latency of the slowest compressor
// plus the extra latency needed to pack the results)
- comp_lat = Cycles(max_comp_lat + 1);
+ comp_lat = Cycles(max_comp_lat + compExtraLatency);
return multi_comp_data;
}
namespace Compressor {
+// The fixed compression/decompression latency members are replaced by
+// Base's parallelization params (chunks per cycle plus extra cycles).
+// compressedSize is kept in bits: 8 * block bytes divided by the maximum
+// compression ratio.
Perfect::Perfect(const Params &p)
- : Base(p), compressedSize(8 * blkSize / p.max_compression_ratio),
- compressionLatency(p.compression_latency),
- decompressionLatency(p.decompression_latency)
+ : Base(p), compressedSize(8 * blkSize / p.max_compression_ratio)
{
}
// Set relevant metadata
comp_data->setSizeBits(compressedSize);
- comp_lat = compressionLatency;
- decomp_lat = decompressionLatency;
+
+ // Set latencies based on the degree of parallelization, and any extra
+ // latencies due to shifting or packaging
+ comp_lat = Cycles((chunks.size() / compChunksPerCycle) + compExtraLatency);
+ decomp_lat = Cycles((chunks.size() / decompChunksPerCycle) +
+ decompExtraLatency);
return comp_data;
}