5 minor opcode allocation
8 | ------ |--| --------- |
14 | 101 |0 | ternarycr |
20 | dest | src1 | subop | op |
21 | ---- | ---- | ----- | -------- |
22 | RT | RA | .. | bmatflip |
26 | dest | src1 | src2 | subop | op |
27 | ---- | ---- | ---- | ----- | -------- |
28 | RT | RA | RB | or | bmatflip |
29 | RT | RA | RB | xor | bmatflip |
30 | RT | RA | RB | bdep | dep/ext |
31 | RT | RA | RB | bext | dep/ext |
32 | RT | RA | RB | | grev |
33 | RT | RA | RB | | gorc |
34 | RT | RA | RB | shuf | shuffle |
35 | RT | RA | RB | unshuf| shuffle |
36 | RT | RA | RB | width | xperm |
37 | RT | RA | RB | type | minmax |
48 | 0.5|6.10|11.15|16.20|21..25 | 26....30 |31| name |
49 | -- | -- | --- | --- | ----- | -------- |--| ------ |
50 | NN | RT | RA | RB | RC | mode 001 |Rc| ternary |
51 | NN | RT | RA | RB | im0-4 | im5-7 00 |Rc| ternaryi |
52 | NN | RS | RA | RB | deg | 00 011 |Rc| gfmul |
53 | NN | RS | RA | RB | deg | 01 011 |Rc| gfadd |
54 | NN | RT | RA | RB | deg | 10 011 |Rc| gfinv |
55 | NN | RS | RA | RB | deg | 11 011 |Rc| gf rsvd |
57 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31| name |
58 | -- | -- | --- | ----- | ---- | ----- |--| ------ |
59 | NN | RT | RA | imm | mask | 101 |1 | ternaryv |
61 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31| name |
62 | -- | -- | --- | --- |- |-----|----- | -----|--| -------|
63 | NN | BA | BB | BC |0 |imm | mask | 101 |0 | ternarycr |
67 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24....30 |31| name |
68 | -- | -- | --- | --- | ----- | -- | -------- |--| ---- |
69 | NN | RA | RB | | | 0 | 0000 110 |Rc| rsvd |
70 | NN | RA | RB | RC | itype | 1 | 0000 110 |Rc| xperm |
71 | NN | RA | RB | RC | itype | 0 | 0100 110 |Rc| minmax |
72 | NN | RA | RB | | | 1 | 0100 110 |Rc| rsvd |
73 | NN | RA | RB | sh | itype | SH | 1000 110 |Rc| bmopsi |
74 | NN | RA | RB | | | | 1100 110 |Rc| rsvd |
75 | NN | RA | RB | | | 0 | 0001 110 |Rc| rsvd |
76 | NN | RA | RB | | | 0 | 0101 110 |Rc| rsvd |
77 | NN | RA | RB | RC | 00 | 0 | 0010 110 |Rc| gorc |
78 | NN | RA | RB | sh | 00 | SH | 1010 110 |Rc| gorci |
79 | NN | RA | RB | RC | 00 | 0 | 0110 110 |Rc| gorcw |
80 | NN | RA | RB | sh | 00 | 0 | 1110 110 |Rc| gorcwi |
81 | NN | RA | RB | RC | 00 | 1 | 1110 110 |Rc| bmator |
82 | NN | RA | RB | RC | 01 | 0 | 0010 110 |Rc| grev |
83 | NN | RA | RB | sh | 01 | SH | 1010 110 |Rc| grevi |
84 | NN | RA | RB | RC | 01 | 0 | 0110 110 |Rc| grevw |
85 | NN | RA | RB | sh | 01 | 0 | 1110 110 |Rc| grevwi |
86 | NN | RA | RB | RC | 01 | 1 | 1110 110 |Rc| bmatxor |
87 | NN | RA | RB | RC | 10 | 0 | 0010 110 |Rc| shfl |
88 | NN | RA | RB | sh | 10 | SH | 1010 110 |Rc| shfli |
89 | NN | RA | RB | RC | 10 | 0 | 0110 110 |Rc| shflw |
90 | NN | RA | RB | RC | 10 | 0 | 1110 110 |Rc| bdep |
91 | NN | RA | RB | RC | 10 | 1 | 1110 110 |Rc| bext |
92 | NN | RA | RB | | 11 | | 1110 110 |Rc| rsvd |
93 | NN | RA | RB | | | | NN11 110 |Rc| rsvd |
97 similar to matrix permute in RV bitmanip, which has XOR and OR variants
101 b = VSR[VRB+32].dword[i].byte[k].bit[j]
102 VSR[VRT+32].dword[i].byte[j].bit[k] = b
106 vpdepd VRT,VRA,VRB, identical to RV bitmamip bdep
109 if VSR[VRB+32].dword[i].bit[63-m]=1 then do
110 result = VSR[VRA+32].dword[i].bit[63-k]
111 VSR[VRT+32].dword[i].bit[63-m] = result
117 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
120 for (int i = 0, j = 0; i < XLEN; i++)
123 r |= uint_xlen_t(1) << i;
133 other way round: identical to RV bext
136 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
139 for (int i = 0, j = 0; i < XLEN; i++)
142 r |= uint_xlen_t(1) << j;
151 signed and unsigned min/max for integer. this is sort-of partly synthesiseable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned. when the dest is also one of the srces and the mv fails due to the CR bittest failing this will only overwrite the dest where the src is greater (or less).
153 signed/unsigned min/max gives more flexibility.
157 Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8bit immediate, or in another register
159 | 0.5|6.10|11.15|16.20| 21..25| 26..30 |31|
160 | -- | -- | --- | --- | ----- | -------- |--|
161 | NN | RT | RA | RB | im0-4 | im5-7 00 |Rc|
164 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
165 RT[i] = (imm & (1<<idx)) != 0
167 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
169 a 4 operand variant which becomes more along the lines of an FPGA:
171 | 0.5|6.10|11.15|16.20|21.25| 26...30 |31|
172 | -- | -- | --- | --- | --- | -------- |--|
173 | NN | RT | RA | RB | RC | mode 001 |Rc|
176 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
177 RT[i] = (RC & (1<<idx)) != 0
179 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul,
182 also, another possible variant involving swizzle and vec4:
184 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
185 | -- | -- | --- | ----- | ---- | ----- |--|
186 | NN | RT | RA | imm | mask | 101 |1 |
189 idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
190 res = (imm & (1<<idx)) != 0
192 if mask[j]: RT[i+j*8] = res
194 another mode selection would be CRs not Ints.
196 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
197 | -- | -- | --- | --- |- |-----|----- | -----|--|
198 | NN | BA | BB | BC |0 |imm | mask | 101 |0 |
201 if not mask[i] continue
202 idx = crregs[BA][i] << 2 |
205 crregs[BA][i] = (imm & (1<<idx)) != 0
209 based on RV bitmanip singlebit set, instruction format similar to shift
210 [[isa/fixedshift]]. bmext is actually covered already (shift-with-mask).
211 however bitmask-invert is not, and set/clr are not covered, although they can ise the same Shift ALU.
213 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
214 | -- | -- | --- | --- | --- | ------- |--|
215 | NN | RT | RA | RB | RC | mode 010 |Rc|
218 uint_xlen_t bmset(RA, RB, sh)
220 int shamt = RB & (XLEN - 1);
222 return RA | (mask << shamt);
225 uint_xlen_t bmclr(RA, RB, sh)
227 int shamt = RB & (XLEN - 1);
229 return RA & ~(mask << shamt);
232 uint_xlen_t bminv(RA, RB, sh)
234 int shamt = RB & (XLEN - 1);
236 return RA ^ (mask << shamt);
239 uint_xlen_t bmext(RA, RB, sh)
241 int shamt = RB & (XLEN - 1);
243 return mask & (RA >> shamt);
252 uint64_t grev64(uint64_t RA, uint64_t RB)
256 if (shamt & 1) x = ((x & 0x5555555555555555LL) << 1) |
257 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
258 if (shamt & 2) x = ((x & 0x3333333333333333LL) << 2) |
259 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
260 if (shamt & 4) x = ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
261 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
262 if (shamt & 8) x = ((x & 0x00FF00FF00FF00FFLL) << 8) |
263 ((x & 0xFF00FF00FF00FF00LL) >> 8);
264 if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
265 ((x & 0xFFFF0000FFFF0000LL) >> 16);
266 if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
267 ((x & 0xFFFFFFFF00000000LL) >> 32);
273 # shuffle / unshuffle
278 uint32_t shfl32(uint32_t RA, uint32_t RB)
282 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
283 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
284 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
285 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
288 uint32_t unshfl32(uint32_t RA, uint32_t RB)
292 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
293 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
294 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
295 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
299 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
301 uint64_t x = src & ~(maskL | maskR);
302 x |= ((src << N) & maskL) | ((src >> N) & maskR);
305 uint64_t shfl64(uint64_t RA, uint64_t RB)
309 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
310 0x00000000ffff0000LL, 16);
311 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
312 0x0000ff000000ff00LL, 8);
313 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
314 0x00f000f000f000f0LL, 4);
315 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
316 0x0c0c0c0c0c0c0c0cLL, 2);
317 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
318 0x2222222222222222LL, 1);
321 uint64_t unshfl64(uint64_t RA, uint64_t RB)
325 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
326 0x2222222222222222LL, 1);
327 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
328 0x0c0c0c0c0c0c0c0cLL, 2);
329 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
330 0x00f000f000f000f0LL, 4);
331 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
332 0x0000ff000000ff00LL, 8);
333 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
334 0x00000000ffff0000LL, 16);
344 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
347 uint_xlen_t sz = 1LL << sz_log2;
348 uint_xlen_t mask = (1LL << sz) - 1;
349 for (int i = 0; i < XLEN; i += sz) {
350 uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
352 r |= ((RA >> pos) & mask) << i;
356 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
357 { return xperm(RA, RB, 2); }
358 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
359 { return xperm(RA, RB, 3); }
360 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
361 { return xperm(RA, RB, 4); }
362 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
363 { return xperm(RA, RB, 5); }
371 uint32_t gorc32(uint32_t RA, uint32_t RB)
375 if (shamt & 1) x |= ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);
376 if (shamt & 2) x |= ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);
377 if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);
378 if (shamt & 8) x |= ((x & 0x00FF00FF) << 8) | ((x & 0xFF00FF00) >> 8);
379 if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) | ((x & 0xFFFF0000) >> 16);
382 uint64_t gorc64(uint64_t RA, uint64_t RB)
386 if (shamt & 1) x |= ((x & 0x5555555555555555LL) << 1) |
387 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
388 if (shamt & 2) x |= ((x & 0x3333333333333333LL) << 2) |
389 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
390 if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
391 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
392 if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL) << 8) |
393 ((x & 0xFF00FF00FF00FF00LL) >> 8);
394 if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL) << 16) |
395 ((x & 0xFFFF0000FFFF0000LL) >> 16);
396 if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL) << 32) |
397 ((x & 0xFFFFFFFF00000000LL) >> 32);
405 based on RV bitmanip, covered by ternary bitops
408 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
409 return (RA & RB) | (RC & ~RB);
416 see https://en.wikipedia.org/wiki/CLMUL_instruction_set
419 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
422 for (int i = 0; i < XLEN; i++)
427 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
430 for (int i = 1; i < XLEN; i++)
435 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
438 for (int i = 0; i < XLEN; i++)
440 x ^= RA >> (XLEN-i-1);
448 this requires 3 parameters and a "degree"
450 RT = GFMUL(RA, RB, gfdegree, modulo=RC)
452 realistically with the degree also needing to be an immediate it should be brought down to an overwrite version:
454 RS = GFMUL(RS, RA, gfdegree, modulo=RB)
456 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
457 | -- | -- | --- | --- | --- | ------- |--|
458 | NN | RS | RA | RB | deg | 00 011 |Rc|
460 where the SimpleV variant may override RS-as-src differently from RS-as-dest
465 from functools import reduce
467 # constants used in the multGF2 function
468 mask1 = mask2 = polyred = None
470 def setGF2(degree, irPoly):
471 """Define parameters of binary finite field GF(2^m)/g(x)
472 - degree: extension degree of binary field
473 - irPoly: coefficients of irreducible polynomial g(x)
476 """Convert an integer into a polynomial"""
477 return [(sInt >> i) & 1
478 for i in reversed(range(sInt.bit_length()))]
480 global mask1, mask2, polyred
481 mask1 = mask2 = 1 << degree
483 polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])
486 """Multiply two polynomials in GF(2^m)/g(x)"""
497 if __name__ == "__main__":
499 # Define binary field GF(2^3)/x^3 + x + 1
502 # Evaluate the product (x^2 + x + 1)(x^2 + 1)
503 print("{:02x}".format(multGF2(0b111, 0b101)))
505 # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
506 # (used in the Advanced Encryption Standard-AES)
507 setGF2(8, 0b100011011)
509 # Evaluate the product (x^7)(x^7 + x + 1)
510 print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
514 RS = GFADD(RS, RA|0, gfdegree, modulo=RB)
516 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
517 | -- | -- | --- | --- | --- | ------- |--|
518 | NN | RS | RA | RB | deg | 01 011 |Rc|
531 def gf_invert(a, mod=0x1B) :
546 a %= 256 # Emulating 8-bit overflow
547 g1 %= 256 # Emulating 8-bit overflow
549 j = gf_degree(a) - gf_degree(v)
557 uint64_t bmatflip(uint64_t RA)
565 uint64_t bmatxor(uint64_t RA, uint64_t RB)
568 uint64_t RBt = bmatflip(RB);
569 uint8_t u[8]; // rows of RA
570 uint8_t v[8]; // cols of RB
571 for (int i = 0; i < 8; i++) {
576 for (int i = 0; i < 64; i++) {
577 if (pcnt(u[i / 8] & v[i % 8]) & 1)
582 uint64_t bmator(uint64_t RA, uint64_t RB)
585 uint64_t RBt = bmatflip(RB);
586 uint8_t u[8]; // rows of RA
587 uint8_t v[8]; // cols of RB
588 for (int i = 0; i < 8; i++) {
593 for (int i = 0; i < 64; i++) {
594 if ((u[i / 8] & v[i % 8]) != 0)