From: Jacob Lifshay Date: Wed, 25 Nov 2020 06:01:17 +0000 (-0800) Subject: add efficient variable-length instruction decoding algorithm X-Git-Tag: convert-csv-opcode-to-binary~1663 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=8cbcc10c8a2fd8255c0ca9fec547387d90edcd9a;p=libreriscv.git add efficient variable-length instruction decoding algorithm --- diff --git a/openpower/sv/16_bit_compressed.mdwn b/openpower/sv/16_bit_compressed.mdwn index 807662835..270a5ad11 100644 --- a/openpower/sv/16_bit_compressed.mdwn +++ b/openpower/sv/16_bit_compressed.mdwn @@ -633,3 +633,6 @@ the register allocation order in GCC (i.e., it takes the earliest available regi [[demo]] +# Efficient Decoding Algorithm + +[[decoding]] diff --git a/openpower/sv/16_bit_compressed/decoding.mdwn b/openpower/sv/16_bit_compressed/decoding.mdwn new file mode 100644 index 000000000..e78605bf0 --- /dev/null +++ b/openpower/sv/16_bit_compressed/decoding.mdwn @@ -0,0 +1,207 @@ +# Decoding variable-length instructions efficiently + +We can use a method similar to carry look-ahead to make decoding `N` instructions take time `O(log N)`: + +Using the encoding in: + +and +[[demo]] + +See following code [on Compiler Explorer](https://gcc.godbolt.org/z/8e3WEn): + +```C++ +#include +#include +#include + +enum Mode { + Standard, + Compressed, + StandardOnceThenCompressed, +}; + +struct SizeAndMode { + size_t size; // size in bytes; always even + Mode mode; +}; + +struct RangeSummary { + // SizeAndMode::mode fields are the new mode after executing + // the instructions summarized by this RangeSummary + SizeAndMode mode_was_standard; + SizeAndMode mode_was_compressed; + SizeAndMode mode_was_standard_once_then_compressed; +}; + +// predecodes a single uint16_t, treating it as if it were the first +// uint16_t in an instruction +RangeSummary predecode_one(uint16_t nth, uint16_t n_plus_1th) { + uint16_t po_field = nth >> 10; + uint16_t scm_field = nth & 0x1; + uint16_t scmt_field = nth >> 15; + uint16_t xo_field = n_plus_1th >> 1; + SizeAndMode mode_was_standard = { + .size = 4, // 32-bit + .mode = Standard, + }; + SizeAndMode mode_was_standard_once_then_compressed = { + .size = 4, // 32-bit + .mode = Compressed, + }; + switch(po_field) { + case 0: // 32/48-bits + // check for bit set by "Support Processor Attention" instruction + if((xo_field & 256) == 0) { + mode_was_standard.size = 6; // 48-bit + mode_was_standard_once_then_compressed.size = 6; + } else { + // SVP32 or "Support Processor Attention" + // leave sizes as 32-bit + } + break; + case 1: // PowerISA v3.1 prefix & SVP64 + mode_was_standard.size = 8; // 64-bit + mode_was_standard_once_then_compressed.size = 8; + break; + case 5: + mode_was_standard.size = 2; // 16-bit + mode_was_standard_once_then_compressed.size = 2; + if(scm_field != 0) // swap to compressed mode + mode_was_standard.mode = Compressed; + break; + default: // standard PowerISA instructions + // leave sizes as 32-bit + break; + } + SizeAndMode mode_was_compressed = { + .size = 2, // 16-bit + .mode = Compressed, + }; + if(scm_field) { // swap to standard mode + if(scmt_field) // swap to standard mode temporarily + mode_was_compressed.mode = StandardOnceThenCompressed; + else + mode_was_compressed.mode = Standard; + } else + mode_was_compressed.mode = Compressed; + return RangeSummary{ + .mode_was_standard = mode_was_standard, + .mode_was_compressed = mode_was_compressed, + .mode_was_standard_once_then_compressed = + mode_was_standard_once_then_compressed, + }; +} + +SizeAndMode propagate_through_range(SizeAndMode in, + RangeSummary range_summary) { + SizeAndMode picked; + switch(in.mode) { + case Standard: + picked = range_summary.mode_was_standard; + break; + case Compressed: + picked = range_summary.mode_was_compressed; + break; + case StandardOnceThenCompressed: + picked = range_summary.mode_was_standard_once_then_compressed; + break; + } + return SizeAndMode{ + .size = in.size + picked.size, + .mode = picked.mode, + }; +} + +RangeSummary merge(RangeSummary first, RangeSummary second) { + SizeAndMode s = propagate_through_range( + first.mode_was_standard, second); + SizeAndMode c = propagate_through_range( + first.mode_was_compressed, second); + SizeAndMode sc = propagate_through_range( + first.mode_was_standard_once_then_compressed, second); + return RangeSummary{ + .mode_was_standard = s, + .mode_was_compressed = c, + .mode_was_standard_once_then_compressed = sc, + }; +} + +enum Endian { + LittleEndian, + BigEndian, +}; + +const size_t INSTRUCTION_MEMORY_SIZE = 0x1000000; // just for demo purposes +uint8_t instruction_memory[INSTRUCTION_MEMORY_SIZE]; + +// get the uint16_t corresponding to the instruction indicated by the PC +uint16_t get_u16(Endian endian, size_t pc) { + assert((pc & 1) == 0); // pc must be aligned to 2 bytes + uint8_t lsb_byte; + uint8_t msb_byte; + switch(endian) { + case LittleEndian: + msb_byte = instruction_memory[(pc ^ 2) + 1]; + lsb_byte = instruction_memory[pc ^ 2]; + break; + case BigEndian: + msb_byte = instruction_memory[pc]; + lsb_byte = instruction_memory[pc + 1]; + break; + } + return (uint16_t)lsb_byte | ((uint16_t)msb_byte << 8); +} + +struct PredecodedInstruction { + Mode mode; // mode for this instruction + bool is_start; // true if this is the initial uint16_t in an instruction +}; + +// decode width in uint16_t units +// can replace with whatever value you like +const size_t DECODE_WIDTH = 8; + +void predecode_all(Endian endian, size_t pc, Mode starting_mode, + PredecodedInstruction outputs[DECODE_WIDTH]) { + RangeSummary range_summaries[DECODE_WIDTH]; + for(size_t i = 0; i < DECODE_WIDTH; i++) { + uint16_t nth = get_u16(endian, pc + i * 2); + uint16_t n_plus_1th = get_u16(endian, pc + (i + 1) * 2); + range_summaries[i] = predecode_one(nth, n_plus_1th); + } + + // use a parallel prefix-sum algorithm, + // using `merge()` as the sum's operation. + // use a simple recursive algorithm + for(size_t step = 1; step < DECODE_WIDTH; step *= 2) { + // outer loop loops ceil(log2(DECODE_WIDTH)) times + + RangeSummary temp[DECODE_WIDTH]; + for(size_t i = 0; i < DECODE_WIDTH; i++) { + if(i < step) + temp[i] = range_summaries[i]; + else + temp[i] = merge(range_summaries[i - step], + range_summaries[i]); + } + for(size_t i = 0; i < DECODE_WIDTH; i++) { + range_summaries[i] = temp[i]; + } + } + + for(size_t i = 0; i < DECODE_WIDTH; i++) { + SizeAndMode size_and_mode{ + .size = 0, + .mode = starting_mode, + }; + if(i > 0) { + size_and_mode = propagate_through_range(size_and_mode, + range_summaries[i - 1]); + } + outputs[i] = PredecodedInstruction{ + .mode = size_and_mode.mode, + .is_start = (size_and_mode.size == pc + i * 2), + }; + } +} +```