7 | dest | src1 | subop | op |
8 | ---- | ---- | ----- | -------- |
9 | RT | RA | .. | bmatflip |
10 | RT | RA | size | crc32 |
11 | RT | RA | size | crc32c |
15 | dest | src1 | src2 | subop | op |
16 | ---- | ---- | ---- | ----- | -------- |
17 | RT | RA | RB | or | bmatflip |
18 | RT | RA | RB | xor | bmatflip |
19 | RT | RA | RB | bdep | dep/ext |
20 | RT | RA | RB | bext | dep/ext |
21 | RT | RA | RB | | grev |
22 | RT | RA | RB | | gorc |
23 | RT | RA | RB | shuf | shuffle |
24 | RT | RA | RB | unshuf| shuffle |
25 | RT | RA | RB | width | xperm |
26 | RT | RA | RB | type | clmul |
27 | RT | RA | RB | type | minmax |
38 | 0.5|6.10|11.15|16.20|21.25| 26....30 |31| name |
39 | -- | -- | --- | --- | --- | -------- |--| ------ |
40 | NN | RT | RA | RB | RC | mode 001 |Rc| ternary |
44 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24....30 |31| name |
45 | -- | -- | --- | --- | ----- | -- | -------- |--| ---- |
46 | NN | RA | RB | | | 0 | 0000 110 |Rc| rsvd |
47 | NN | RA | RB | RC | itype | 1 | 0000 110 |Rc| xperm |
48 | NN | RA | RB | RC | itype | 0 | 0100 110 |Rc| minmax |
49 | NN | RA | RB | | | 1 | 0100 110 |Rc| rsvd |
50 | NN | RA | RB | sh | itype | SH | 1000 110 |Rc| bmopsi |
51 | NN | RA | RB | | | | 1100 110 |Rc| rsvd |
52 | NN | RA | RB | | | 0 | 0001 110 |Rc| rsvd |
53 | NN | RA | RB | | | 0 | 0101 110 |Rc| rsvd |
54 | NN | RA | RB | RC | 00 | 0 | 0010 110 |Rc| gorc |
55 | NN | RA | RB | sh | 00 | SH | 1010 110 |Rc| gorci |
56 | NN | RA | RB | RC | 00 | 0 | 0110 110 |Rc| gorcw |
57 | NN | RA | RB | sh | 00 | 0 | 1110 110 |Rc| gorcwi |
58 | NN | RA | RB | RC | 00 | 1 | 1110 110 |Rc| bmator |
59 | NN | RA | RB | RC | 01 | 0 | 0010 110 |Rc| grev |
60 | NN | RA | RB | sh | 01 | SH | 1010 110 |Rc| grevi |
61 | NN | RA | RB | RC | 01 | 0 | 0110 110 |Rc| grevw |
62 | NN | RA | RB | sh | 01 | 0 | 1110 110 |Rc| grevwi |
63 | NN | RA | RB | RC | 01 | 1 | 1110 110 |Rc| bmatxor |
64 | NN | RA | RB | RC | 10 | 0 | 0010 110 |Rc| shfl |
65 | NN | RA | RB | sh | 10 | SH | 1010 110 |Rc| shfli |
66 | NN | RA | RB | RC | 10 | 0 | 0110 110 |Rc| shflw |
67 | NN | RA | RB | RC | 10 | 0 | 1110 110 |Rc| bdep |
68 | NN | RA | RB | RC | 10 | 1 | 1110 110 |Rc| bext |
69 | NN | RA | RB | | 11 | | 1110 110 |Rc| rsvd |
70 | NN | RA | RB | | | | NN11 110 |Rc| rsvd |
74 similar to matrix permute in RV bitmanip, which has XOR and OR variants
78 b = VSR[VRB+32].dword[i].byte[k].bit[j]
79 VSR[VRT+32].dword[i].byte[j].bit[k] = b
83 vpdepd VRT,VRA,VRB, identical to RV bitmanip bdep
86 if VSR[VRB+32].dword[i].bit[63-m]=1 then do
87 result = VSR[VRA+32].dword[i].bit[63-k]
88 VSR[VRT+32].dword[i].bit[63-m] = result
94 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
97 for (int i = 0, j = 0; i < XLEN; i++)
100 r |= uint_xlen_t(1) << i;
110 other way round: identical to RV bext
113 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
116 for (int i = 0, j = 0; i < XLEN; i++)
119 r |= uint_xlen_t(1) << j;
128 signed and unsigned min/max for integer. this is sort-of partly synthesiseable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned. when the dest is also one of the sources and the mv fails due to the CR bittest failing, this will only overwrite the dest where the src is greater (or less).
130 signed/unsigned min/max gives more flexibility.
134 Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8-bit immediate, or in another register
136 | 0.5|6.10|11.15|16.20| 21..25| 26..30 |31|
137 | -- | -- | --- | --- | ----- | -------- |--|
138 | NN | RT | RA | RB | im0-4 | im5-7 00 |Rc|
141 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
142 RT[i] = (imm & (1<<idx)) != 0
144 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
146 a 4 operand variant which becomes more along the lines of an FPGA:
148 | 0.5|6.10|11.15|16.20|21.25| 26...30 |31|
149 | -- | -- | --- | --- | --- | -------- |--|
150 | NN | RT | RA | RB | RC | mode 001 |Rc|
153 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
154 RT[i] = (RC & (1<<idx)) != 0
156 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul,
159 also, another possible variant involving swizzle and vec4:
161 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
162 | -- | -- | --- | ----- | ---- | ----- |--|
163 | NN | RT | RA | imm | mask | 101 |1 |
166 idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
167 res = (imm & (1<<idx)) != 0
169 if mask[j]: RT[i+j*8] = res
171 another mode selection would be CRs not Ints.
173 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
174 | -- | -- | --- | --- |- |-----|----- | -----|--|
175 | NN | BA | BB | BC |0 |imm | mask | 101 |0 |
178 if not mask[i] continue
179 idx = crregs[BA][i] << 2 |
182 crregs[BA][i] = (imm & (1<<idx)) != 0
186 based on RV bitmanip singlebit set, instruction format similar to shift
187 [[isa/fixedshift]]. bmext is actually covered already (shift-with-mask).
188 however bitmask-invert is not, and set/clr are not covered, although they can use the same Shift ALU.
190 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
191 | -- | -- | --- | --- | --- | ------- |--|
192 | NN | RT | RA | RB | RC | mode 010 |Rc|
195 uint_xlen_t bmset(RA, RB, sh)
197 int shamt = RB & (XLEN - 1);
199 return RA | (mask << shamt);
202 uint_xlen_t bmclr(RA, RB, sh)
204 int shamt = RB & (XLEN - 1);
206 return RA & ~(mask << shamt);
209 uint_xlen_t bminv(RA, RB, sh)
211 int shamt = RB & (XLEN - 1);
213 return RA ^ (mask << shamt);
216 uint_xlen_t bmext(RA, RB, sh)
218 int shamt = RB & (XLEN - 1);
220 return mask & (RA >> shamt);
229 uint64_t grev64(uint64_t RA, uint64_t RB)
233 if (shamt & 1) x = ((x & 0x5555555555555555LL) << 1) |
234 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
235 if (shamt & 2) x = ((x & 0x3333333333333333LL) << 2) |
236 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
237 if (shamt & 4) x = ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
238 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
239 if (shamt & 8) x = ((x & 0x00FF00FF00FF00FFLL) << 8) |
240 ((x & 0xFF00FF00FF00FF00LL) >> 8);
241 if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
242 ((x & 0xFFFF0000FFFF0000LL) >> 16);
243 if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
244 ((x & 0xFFFFFFFF00000000LL) >> 32);
250 # shuffle / unshuffle
255 uint32_t shfl32(uint32_t RA, uint32_t RB)
259 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
260 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
261 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
262 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
265 uint32_t unshfl32(uint32_t RA, uint32_t RB)
269 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
270 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
271 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
272 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
276 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
278 uint64_t x = src & ~(maskL | maskR);
279 x |= ((src << N) & maskL) | ((src >> N) & maskR);
282 uint64_t shfl64(uint64_t RA, uint64_t RB)
286 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
287 0x00000000ffff0000LL, 16);
288 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
289 0x0000ff000000ff00LL, 8);
290 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
291 0x00f000f000f000f0LL, 4);
292 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
293 0x0c0c0c0c0c0c0c0cLL, 2);
294 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
295 0x2222222222222222LL, 1);
298 uint64_t unshfl64(uint64_t RA, uint64_t RB)
302 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
303 0x2222222222222222LL, 1);
304 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
305 0x0c0c0c0c0c0c0c0cLL, 2);
306 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
307 0x00f000f000f000f0LL, 4);
308 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
309 0x0000ff000000ff00LL, 8);
310 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
311 0x00000000ffff0000LL, 16);
321 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
324 uint_xlen_t sz = 1LL << sz_log2;
325 uint_xlen_t mask = (1LL << sz) - 1;
326 for (int i = 0; i < XLEN; i += sz) {
327 uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
329 r |= ((RA >> pos) & mask) << i;
333 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
334 { return xperm(RA, RB, 2); } // sz_log2=2: 4-bit (nibble) elements
335 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
336 { return xperm(RA, RB, 3); } // sz_log2=3: 8-bit (byte) elements
337 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
338 { return xperm(RA, RB, 4); } // sz_log2=4: 16-bit (halfword) elements
339 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
340 { return xperm(RA, RB, 5); } // sz_log2=5: 32-bit (word) elements
348 uint32_t gorc32(uint32_t RA, uint32_t RB)
352 if (shamt & 1) x |= ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);
353 if (shamt & 2) x |= ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);
354 if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);
355 if (shamt & 8) x |= ((x & 0x00FF00FF) << 8) | ((x & 0xFF00FF00) >> 8);
356 if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) | ((x & 0xFFFF0000) >> 16);
359 uint64_t gorc64(uint64_t RA, uint64_t RB)
363 if (shamt & 1) x |= ((x & 0x5555555555555555LL) << 1) |
364 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
365 if (shamt & 2) x |= ((x & 0x3333333333333333LL) << 2) |
366 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
367 if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
368 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
369 if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL) << 8) |
370 ((x & 0xFF00FF00FF00FF00LL) >> 8);
371 if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL) << 16) |
372 ((x & 0xFFFF0000FFFF0000LL) >> 16);
373 if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL) << 32) |
374 ((x & 0xFFFFFFFF00000000LL) >> 32);
382 based on RV bitmanip, covered by ternary bitops
385 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
386 return (RA & RB) | (RC & ~RB);
393 see https://en.wikipedia.org/wiki/CLMUL_instruction_set
396 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
399 for (int i = 0; i < XLEN; i++)
404 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
407 for (int i = 1; i < XLEN; i++)
412 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
415 for (int i = 0; i < XLEN; i++)
417 x ^= RA >> (XLEN-i-1);
425 this requires 3 parameters and a "degree"
427 RT = GFMUL(RA, RB, gfdegree, modulo=RC)
429 realistically, with the degree also needing to be an immediate, it should be brought down to an overwrite version:
431 RS = GFMUL(RS, RA, gfdegree, modulo=RB)
433 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
434 | -- | -- | --- | --- | --- | ------- |--|
435 | NN | RS | RA | RB | deg | 00 011 |Rc|
437 where the SimpleV variant may override RS-as-src differently from RS-as-dest
442 from functools import reduce
444 # constants used in the multGF2 function
445 mask1 = mask2 = polyred = None
447 def setGF2(degree, irPoly):
448 """Define parameters of binary finite field GF(2^m)/g(x)
449 - degree: extension degree of binary field
450 - irPoly: coefficients of irreducible polynomial g(x)
453 """Convert an integer into a polynomial"""
454 return [(sInt >> i) & 1
455 for i in reversed(range(sInt.bit_length()))]
457 global mask1, mask2, polyred
458 mask1 = mask2 = 1 << degree
460 polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])
463 """Multiply two polynomials in GF(2^m)/g(x)"""
474 if __name__ == "__main__":
476 # Define binary field GF(2^3)/x^3 + x + 1
479 # Evaluate the product (x^2 + x + 1)(x^2 + 1)
480 print("{:02x}".format(multGF2(0b111, 0b101)))
482 # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
483 # (used in the Advanced Encryption Standard-AES)
484 setGF2(8, 0b100011011)
486 # Evaluate the product (x^7)(x^7 + x + 1)
487 print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
491 RS = GFADD(RS, RA|0, gfdegree, modulo=RB)
493 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
494 | -- | -- | --- | --- | --- | ------- |--|
495 | NN | RS | RA | RB | deg | 01 011 |Rc|
508 def gf_invert(a, mod=0x1B) :
523 a %= 256 # Emulating 8-bit overflow
524 g1 %= 256 # Emulating 8-bit overflow
526 j = gf_degree(a) - gf_degree(v)
533 * <https://stackoverflow.com/questions/21171733/calculating-constants-for-crc32-using-pclmulqdq>
534 * <https://en.wikipedia.org/wiki/Cyclic_redundancy_check>
537 uint_xlen_t crc32(uint_xlen_t x, int nbits)
539 for (int i = 0; i < nbits; i++)
540 x = (x >> 1) ^ (0xEDB88320 & ~((x&1)-1));
543 uint_xlen_t crc32c(uint_xlen_t x, int nbits)
545 for (int i = 0; i < nbits; i++)
546 x = (x >> 1) ^ (0x82F63B78 & ~((x&1)-1));
549 uint_xlen_t crc32_b(uint_xlen_t RA) { return crc32(RA, 8); } // byte: CRC over low 8 bits
550 uint_xlen_t crc32_h(uint_xlen_t RA) { return crc32(RA, 16); } // halfword: low 16 bits
551 uint_xlen_t crc32_w(uint_xlen_t RA) { return crc32(RA, 32); } // word: low 32 bits
552 uint_xlen_t crc32c_b(uint_xlen_t RA) { return crc32c(RA, 8); } // CRC32C (Castagnoli) byte variant
553 uint_xlen_t crc32c_h(uint_xlen_t RA) { return crc32c(RA, 16); } // CRC32C halfword variant
554 uint_xlen_t crc32c_w(uint_xlen_t RA) { return crc32c(RA, 32); } // CRC32C word variant
556 uint_xlen_t crc32_d (uint_xlen_t RA) { return crc32 (RA, 64); } // doubleword: all 64 bits (XLEN=64 only — TODO confirm)
557 uint_xlen_t crc32c_d(uint_xlen_t RA) { return crc32c(RA, 64); } // CRC32C doubleword variant
564 uint64_t bmatflip(uint64_t RA)
572 uint64_t bmatxor(uint64_t RA, uint64_t RB)
575 uint64_t RBt = bmatflip(RB);
576 uint8_t u[8]; // rows of RA
577 uint8_t v[8]; // cols of RB
578 for (int i = 0; i < 8; i++) {
583 for (int i = 0; i < 64; i++) {
584 if (pcnt(u[i / 8] & v[i % 8]) & 1)
589 uint64_t bmator(uint64_t RA, uint64_t RB)
592 uint64_t RBt = bmatflip(RB);
593 uint8_t u[8]; // rows of RA
594 uint8_t v[8]; // cols of RB
595 for (int i = 0; i < 8; i++) {
600 for (int i = 0; i < 64; i++) {
601 if ((u[i / 8] & v[i % 8]) != 0)