5 minor opcode allocation
8 | ------ |--| --------- |
14 | 101 |0 | ternarycr |
20 | dest | src1 | subop | op |
21 | ---- | ---- | ----- | -------- |
22 | RT | RA | .. | bmatflip |
23 | RT | RA | size | crc32 |
24 | RT | RA | size | crc32c |
28 | dest | src1 | src2 | subop | op |
29 | ---- | ---- | ---- | ----- | -------- |
30 | RT | RA | RB | or | bmatflip |
31 | RT | RA | RB | xor | bmatflip |
32 | RT | RA | RB | bdep | dep/ext |
33 | RT | RA | RB | bext | dep/ext |
34 | RT | RA | RB | | grev |
35 | RT | RA | RB | | gorc |
36 | RT | RA | RB | shuf | shuffle |
37 | RT | RA | RB | unshuf| shuffle |
38 | RT | RA | RB | width | xperm |
39 | RT | RA | RB | type | minmax |
50 | 0.5|6.10|11.15|16.20|21..25 | 26....30 |31| name |
51 | -- | -- | --- | --- | ----- | -------- |--| ------ |
52 | NN | RT | RA | RB | RC | mode 001 |Rc| ternary |
53 | NN | RT | RA | RB | im0-4 | im5-7 00 |Rc| ternaryi |
54 | NN | RS | RA | RB | deg | 00 011 |Rc| gfmul |
55 | NN | RS | RA | RB | deg | 01 011 |Rc| gfadd |
56 | NN | RT | RA | RB | deg | 10 011 |Rc| gfinv |
57 | NN | RS | RA | RB | deg | 11 011 |Rc| gf rsvd |
59 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31| name |
60 | -- | -- | --- | ----- | ---- | ----- |--| ------ |
61 | NN | RT | RA | imm | mask | 101 |1 | ternaryv |
63 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31| name |
64 | -- | -- | --- | --- |- |-----|----- | -----|--| -------|
65 | NN | BA | BB | BC |0 |imm | mask | 101 |0 | ternarycr |
69 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24....30 |31| name |
70 | -- | -- | --- | --- | ----- | -- | -------- |--| ---- |
71 | NN | RA | RB | | | 0 | 0000 110 |Rc| rsvd |
72 | NN | RA | RB | RC | itype | 1 | 0000 110 |Rc| xperm |
73 | NN | RA | RB | RC | itype | 0 | 0100 110 |Rc| minmax |
74 | NN | RA | RB | | | 1 | 0100 110 |Rc| rsvd |
75 | NN | RA | RB | sh | itype | SH | 1000 110 |Rc| bmopsi |
76 | NN | RA | RB | | | | 1100 110 |Rc| rsvd |
77 | NN | RA | RB | | | 0 | 0001 110 |Rc| rsvd |
78 | NN | RA | RB | | | 0 | 0101 110 |Rc| rsvd |
79 | NN | RA | RB | RC | 00 | 0 | 0010 110 |Rc| gorc |
80 | NN | RA | RB | sh | 00 | SH | 1010 110 |Rc| gorci |
81 | NN | RA | RB | RC | 00 | 0 | 0110 110 |Rc| gorcw |
82 | NN | RA | RB | sh | 00 | 0 | 1110 110 |Rc| gorcwi |
83 | NN | RA | RB | RC | 00 | 1 | 1110 110 |Rc| bmator |
84 | NN | RA | RB | RC | 01 | 0 | 0010 110 |Rc| grev |
85 | NN | RA | RB | sh | 01 | SH | 1010 110 |Rc| grevi |
86 | NN | RA | RB | RC | 01 | 0 | 0110 110 |Rc| grevw |
87 | NN | RA | RB | sh | 01 | 0 | 1110 110 |Rc| grevwi |
88 | NN | RA | RB | RC | 01 | 1 | 1110 110 |Rc| bmatxor |
89 | NN | RA | RB | RC | 10 | 0 | 0010 110 |Rc| shfl |
90 | NN | RA | RB | sh | 10 | SH | 1010 110 |Rc| shfli |
91 | NN | RA | RB | RC | 10 | 0 | 0110 110 |Rc| shflw |
92 | NN | RA | RB | RC | 10 | 0 | 1110 110 |Rc| bdep |
93 | NN | RA | RB | RC | 10 | 1 | 1110 110 |Rc| bext |
94 | NN | RA | RB | | 11 | | 1110 110 |Rc| rsvd |
95 | NN | RA | RB | | | | NN11 110 |Rc| rsvd |
99 similar to matrix permute in RV bitmanip, which has XOR and OR variants
103 b = VSR[VRB+32].dword[i].byte[k].bit[j]
104 VSR[VRT+32].dword[i].byte[j].bit[k] = b
108 vpdepd VRT,VRA,VRB, identical to RV bitmanip bdep
111 if VSR[VRB+32].dword[i].bit[63-m]=1 then do
112 result = VSR[VRA+32].dword[i].bit[63-k]
113 VSR[VRT+32].dword[i].bit[63-m] = result
119 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
122 for (int i = 0, j = 0; i < XLEN; i++)
125 r |= uint_xlen_t(1) << i;
135 other way round: identical to RV bext
138 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
141 for (int i = 0, j = 0; i < XLEN; i++)
144 r |= uint_xlen_t(1) << j;
153 signed and unsigned min/max for integer. this is sort-of partly synthesiseable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned. when the dest is also one of the sources and the mv fails due to the CR bittest failing this will only overwrite the dest where the src is greater (or less).
155 signed/unsigned min/max gives more flexibility.
159 Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8bit immediate, or in another register
161 | 0.5|6.10|11.15|16.20| 21..25| 26..30 |31|
162 | -- | -- | --- | --- | ----- | -------- |--|
163 | NN | RT | RA | RB | im0-4 | im5-7 00 |Rc|
166 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
167 RT[i] = (imm & (1<<idx)) != 0
169 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
171 a 4 operand variant which becomes more along the lines of an FPGA:
173 | 0.5|6.10|11.15|16.20|21.25| 26...30 |31|
174 | -- | -- | --- | --- | --- | -------- |--|
175 | NN | RT | RA | RB | RC | mode 001 |Rc|
178 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
179 RT[i] = (RC & (1<<idx)) != 0
181 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul,
184 also, another possible variant involving swizzle and vec4:
186 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
187 | -- | -- | --- | ----- | ---- | ----- |--|
188 | NN | RT | RA | imm | mask | 101 |1 |
191 idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
192 res = (imm & (1<<idx)) != 0
194 if mask[j]: RT[i+j*8] = res
196 another mode selection would be CRs not Ints.
198 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
199 | -- | -- | --- | --- |- |-----|----- | -----|--|
200 | NN | BA | BB | BC |0 |imm | mask | 101 |0 |
203 if not mask[i] continue
204 idx = crregs[BA][i] << 2 |
207 crregs[BA][i] = (imm & (1<<idx)) != 0
211 based on RV bitmanip singlebit set, instruction format similar to shift
212 [[isa/fixedshift]]. bmext is actually covered already (shift-with-mask).
213 however bitmask-invert is not, and set/clr are not covered, although they can use the same Shift ALU.
215 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
216 | -- | -- | --- | --- | --- | ------- |--|
217 | NN | RT | RA | RB | RC | mode 010 |Rc|
220 uint_xlen_t bmset(RA, RB, sh)
222 int shamt = RB & (XLEN - 1);
224 return RA | (mask << shamt);
227 uint_xlen_t bmclr(RA, RB, sh)
229 int shamt = RB & (XLEN - 1);
231 return RA & ~(mask << shamt);
234 uint_xlen_t bminv(RA, RB, sh)
236 int shamt = RB & (XLEN - 1);
238 return RA ^ (mask << shamt);
241 uint_xlen_t bmext(RA, RB, sh)
243 int shamt = RB & (XLEN - 1);
245 return mask & (RA >> shamt);
254 uint64_t grev64(uint64_t RA, uint64_t RB)
258 if (shamt & 1) x = ((x & 0x5555555555555555LL) << 1) |
259 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
260 if (shamt & 2) x = ((x & 0x3333333333333333LL) << 2) |
261 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
262 if (shamt & 4) x = ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
263 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
264 if (shamt & 8) x = ((x & 0x00FF00FF00FF00FFLL) << 8) |
265 ((x & 0xFF00FF00FF00FF00LL) >> 8);
266 if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
267 ((x & 0xFFFF0000FFFF0000LL) >> 16);
268 if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
269 ((x & 0xFFFFFFFF00000000LL) >> 32);
275 # shuffle / unshuffle
280 uint32_t shfl32(uint32_t RA, uint32_t RB)
284 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
285 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
286 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
287 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
290 uint32_t unshfl32(uint32_t RA, uint32_t RB)
294 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
295 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
296 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
297 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
301 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
303 uint64_t x = src & ~(maskL | maskR);
304 x |= ((src << N) & maskL) | ((src >> N) & maskR);
307 uint64_t shfl64(uint64_t RA, uint64_t RB)
311 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
312 0x00000000ffff0000LL, 16);
313 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
314 0x0000ff000000ff00LL, 8);
315 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
316 0x00f000f000f000f0LL, 4);
317 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
318 0x0c0c0c0c0c0c0c0cLL, 2);
319 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
320 0x2222222222222222LL, 1);
323 uint64_t unshfl64(uint64_t RA, uint64_t RB)
327 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
328 0x2222222222222222LL, 1);
329 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
330 0x0c0c0c0c0c0c0c0cLL, 2);
331 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
332 0x00f000f000f000f0LL, 4);
333 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
334 0x0000ff000000ff00LL, 8);
335 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
336 0x00000000ffff0000LL, 16);
346 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
349 uint_xlen_t sz = 1LL << sz_log2;
350 uint_xlen_t mask = (1LL << sz) - 1;
351 for (int i = 0; i < XLEN; i += sz) {
352 uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
354 r |= ((RA >> pos) & mask) << i;
358 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
359 { return xperm(RA, RB, 2); }
360 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
361 { return xperm(RA, RB, 3); }
362 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
363 { return xperm(RA, RB, 4); }
364 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
365 { return xperm(RA, RB, 5); }
373 uint32_t gorc32(uint32_t RA, uint32_t RB)
377 if (shamt & 1) x |= ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);
378 if (shamt & 2) x |= ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);
379 if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);
380 if (shamt & 8) x |= ((x & 0x00FF00FF) << 8) | ((x & 0xFF00FF00) >> 8);
381 if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) | ((x & 0xFFFF0000) >> 16);
384 uint64_t gorc64(uint64_t RA, uint64_t RB)
388 if (shamt & 1) x |= ((x & 0x5555555555555555LL) << 1) |
389 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
390 if (shamt & 2) x |= ((x & 0x3333333333333333LL) << 2) |
391 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
392 if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
393 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
394 if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL) << 8) |
395 ((x & 0xFF00FF00FF00FF00LL) >> 8);
396 if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL) << 16) |
397 ((x & 0xFFFF0000FFFF0000LL) >> 16);
398 if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL) << 32) |
399 ((x & 0xFFFFFFFF00000000LL) >> 32);
407 based on RV bitmanip, covered by ternary bitops
410 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
411 return (RA & RB) | (RC & ~RB);
418 see https://en.wikipedia.org/wiki/CLMUL_instruction_set
421 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
424 for (int i = 0; i < XLEN; i++)
429 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
432 for (int i = 1; i < XLEN; i++)
437 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
440 for (int i = 0; i < XLEN; i++)
442 x ^= RA >> (XLEN-i-1);
450 this requires 3 parameters and a "degree"
452 RT = GFMUL(RA, RB, gfdegree, modulo=RC)
454 realistically with the degree also needing to be an immediate it should be brought down to an overwrite version:
456 RS = GFMUL(RS, RA, gfdegree, modulo=RB)
458 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
459 | -- | -- | --- | --- | --- | ------- |--|
460 | NN | RS | RA | RB | deg | 00 011 |Rc|
462 where the SimpleV variant may override RS-as-src differently from RS-as-dest
467 from functools import reduce
469 # constants used in the multGF2 function
470 mask1 = mask2 = polyred = None
472 def setGF2(degree, irPoly):
473 """Define parameters of binary finite field GF(2^m)/g(x)
474 - degree: extension degree of binary field
475 - irPoly: coefficients of irreducible polynomial g(x)
478 """Convert an integer into a polynomial"""
479 return [(sInt >> i) & 1
480 for i in reversed(range(sInt.bit_length()))]
482 global mask1, mask2, polyred
483 mask1 = mask2 = 1 << degree
485 polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])
488 """Multiply two polynomials in GF(2^m)/g(x)"""
499 if __name__ == "__main__":
501 # Define binary field GF(2^3)/x^3 + x + 1
504 # Evaluate the product (x^2 + x + 1)(x^2 + 1)
505 print("{:02x}".format(multGF2(0b111, 0b101)))
507 # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
508 # (used in the Advanced Encryption Standard-AES)
509 setGF2(8, 0b100011011)
511 # Evaluate the product (x^7)(x^7 + x + 1)
512 print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
516 RS = GFADD(RS, RA|0, gfdegree, modulo=RB)
518 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
519 | -- | -- | --- | --- | --- | ------- |--|
520 | NN | RS | RA | RB | deg | 01 011 |Rc|
533 def gf_invert(a, mod=0x1B) :
548 a %= 256 # Emulating 8-bit overflow
549 g1 %= 256 # Emulating 8-bit overflow
551 j = gf_degree(a) - gf_degree(v)
558 * <https://stackoverflow.com/questions/21171733/calculating-constants-for-crc32-using-pclmulqdq>
559 * <https://en.wikipedia.org/wiki/Cyclic_redundancy_check>
562 uint_xlen_t crc32(uint_xlen_t x, int nbits)
564 for (int i = 0; i < nbits; i++)
565 x = (x >> 1) ^ (0xEDB88320 & ~((x&1)-1));
568 uint_xlen_t crc32c(uint_xlen_t x, int nbits)
570 for (int i = 0; i < nbits; i++)
571 x = (x >> 1) ^ (0x82F63B78 & ~((x&1)-1));
574 uint_xlen_t crc32_b(uint_xlen_t RA) { return crc32(RA, 8); }
575 uint_xlen_t crc32_h(uint_xlen_t RA) { return crc32(RA, 16); }
576 uint_xlen_t crc32_w(uint_xlen_t RA) { return crc32(RA, 32); }
577 uint_xlen_t crc32c_b(uint_xlen_t RA) { return crc32c(RA, 8); }
578 uint_xlen_t crc32c_h(uint_xlen_t RA) { return crc32c(RA, 16); }
579 uint_xlen_t crc32c_w(uint_xlen_t RA) { return crc32c(RA, 32); }
581 uint_xlen_t crc32_d (uint_xlen_t RA) { return crc32 (RA, 64); }
582 uint_xlen_t crc32c_d(uint_xlen_t RA) { return crc32c(RA, 64); }
589 uint64_t bmatflip(uint64_t RA)
597 uint64_t bmatxor(uint64_t RA, uint64_t RB)
600 uint64_t RBt = bmatflip(RB);
601 uint8_t u[8]; // rows of RA
602 uint8_t v[8]; // cols of RB
603 for (int i = 0; i < 8; i++) {
608 for (int i = 0; i < 64; i++) {
609 if (pcnt(u[i / 8] & v[i % 8]) & 1)
614 uint64_t bmator(uint64_t RA, uint64_t RB)
617 uint64_t RBt = bmatflip(RB);
618 uint8_t u[8]; // rows of RA
619 uint8_t v[8]; // cols of RB
620 for (int i = 0; i < 8; i++) {
625 for (int i = 0; i < 64; i++) {
626 if ((u[i / 8] & v[i % 8]) != 0)