7 | dest | src1 | subop | op |
8 | ---- | ---- | ----- | -------- |
9 | RT | RA | .. | bmatflip |
10 | RT | RA | size | crc32 |
11 | RT | RA | size | crc32c |
15 | dest | src1 | src2 | subop | op |
16 | ---- | ---- | ---- | ----- | -------- |
17 | RT | RA | RB | or | bmatflip |
18 | RT | RA | RB | xor | bmatflip |
19 | RT | RA | RB | bdep | dep/ext |
20 | RT | RA | RB | bext | dep/ext |
21 | RT | RA | RB | | grev |
22 | RT | RA | RB | | gorc |
23 | RT | RA | RB | shuf | shuffle |
24 | RT   | RA   | RB   | unshuf | shuffle  |
25 | RT | RA | RB | width | xperm |
26 | RT | RA | RB | type | clmul |
27 | RT | RA | RB | type | minmax |
39 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24..30 |31| name |
40 | -- | -- | --- | --- | ----- | -- | ------- |--| ---- |
41 | NN | RA | RB | | | 0 | 0000110 |Rc| rsvd |
42 | NN | RA | RB | RC | itype | 1 | 0000110 |Rc| xperm |
43 | NN | RA | RB | RC | itype | 0 | 0100110 |Rc| minmax |
44 | NN | RA | RB | | | 1 | 0100110 |Rc| rsvd |
45 | NN | RA | RB | sh | itype | SH | 1000110 |Rc| bmopsi |
46 | NN | RA | RB | | | | 1100110 |Rc| rsvd |
47 | NN | RA | RB | RC | itype | 0 | 0001110 |Rc| clmul |
48 | NN | RA | RB | sh | itype | 0 | 0101110 |Rc| clmulw |
49 | NN | RA | RB | RC | 00 | 0 | 0010110 |Rc| gorc |
50 | NN | RA | RB | sh | 00 | SH | 1010110 |Rc| gorci |
51 | NN | RA | RB | RC | 00 | 0 | 0110110 |Rc| gorcw |
52 | NN | RA | RB | sh | 00 | 0 | 1110110 |Rc| gorcwi |
53 | NN | RA | RB | RC | 00 | 1 | 1110110 |Rc| bmator |
54 | NN | RA | RB | RC | 01 | 0 | 0010110 |Rc| grev |
55 | NN | RA | RB | sh | 01 | SH | 1010110 |Rc| grevi |
56 | NN | RA | RB | RC | 01 | 0 | 0110110 |Rc| grevw |
57 | NN | RA | RB | sh | 01 | 0 | 1110110 |Rc| grevwi |
58 | NN | RA | RB | RC | 01 | 1 | 1110110 |Rc| bmatxor |
59 | NN | RA | RB | RC | 10 | 0 | 0010110 |Rc| shfl |
60 | NN | RA | RB | sh | 10 | SH | 1010110 |Rc| shfli |
61 | NN | RA | RB | RC | 10 | 0 | 0110110 |Rc| shflw |
62 | NN | RA | RB | RC | 10 | 0 | 1110110 |Rc| bdep |
63 | NN | RA | RB | RC | 10 | 1 | 1110110 |Rc| bext |
64 | NN | RA | RB | | 11 | | 1110110 |Rc| rsvd |
65 | NN | RA | RB | | | | NN11110 |Rc| rsvd |
69 similar to matrix permute in RV bitmanip, which has XOR and OR variants
73 b = VSR[VRB+32].dword[i].byte[k].bit[j]
74 VSR[VRT+32].dword[i].byte[j].bit[k] = b
78 vpdepd VRT,VRA,VRB, identical to RV bitmanip bdep
81 if VSR[VRB+32].dword[i].bit[63-m]=1 then do
82 result = VSR[VRA+32].dword[i].bit[63-k]
83 VSR[VRT+32].dword[i].bit[63-m] = result
89 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
92 for (int i = 0, j = 0; i < XLEN; i++)
95 r |= uint_xlen_t(1) << i;
105 other way round: identical to RV bext
108 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
111 for (int i = 0, j = 0; i < XLEN; i++)
114 r |= uint_xlen_t(1) << j;
123 signed and unsigned min/max for integer. this is sort-of partly synthesiseable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned. when the dest is also one of the sources and the mv fails due to the CR bittest failing this will only overwrite the dest where the src is greater (or less).
125 signed/unsigned min/max gives more flexibility.
129 Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8-bit immediate, or a value in another register
131 | 0.5|6.10|11.15|16.20| 21..25| 26..30 |31|
132 | -- | -- | --- | --- | ----- | -------- |--|
133 | NN | RT | RA | RB | im0-4 | im5-7 00 |Rc|
136 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
137 RT[i] = (imm & (1<<idx)) != 0
139 bits 21..22 may be used to specify a mode, such as treating the whole integer as zero/nonzero and putting 1/0 in the result, rather than a bitwise test.
141 a 4 operand variant which becomes more along the lines of an FPGA:
143 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
144 | -- | -- | --- | --- | --- | ------- |--|
145 | NN | RT | RA | RB | RC | mode 1 |1 |
148 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
149 RT[i] = (RC & (1<<idx)) != 0
151 mode (2-bit) may be used to do inversion of ordering, similar to carryless mul.
153 also, another possible variant involving swizzle and vec4:
156 idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
157 RT[i] = (RA.w[i] & (1<<idx)) != 0
159 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
160 | -- | -- | --- | ----- | ---- | ----- |--|
161 | NN | RT | RA | xyzw | mask | mode 1 |1 |
164 idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
165 res = (RA.w[i] & (1<<idx)) != 0
167 if mask[j]: RT[i+j*8] = res
169 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
170 | -- | -- | --- | ----- | ---- | ----- |--|
171 | NN | RT | RA | imm | mask | mode 1 |1 |
174 idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
175 res = (imm & (1<<idx)) != 0
177 if mask[j]: RT[i+j*8] = res
179 another mode selection would be CRs not Ints.
181 | 0.5|6.8 | 9.11|12.14|15.17|18.20| 21..25| 26.29|30|31|
182 | -- | -- | --- | --- | --- |-----| ----- | ---- |--|--|
183 | NN | BT | BA | BB | BC |im5-7| im0-4 | mask |1 |0 |
186 if not mask[i] continue
187 idx = crregs[BA][i] << 2 |
190 crregs[BT][i] = (imm & (1<<idx)) != 0
194 based on RV bitmanip single-bit set, instruction format similar to shift
196 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
197 | -- | -- | --- | --- | --- | ------- |--|
198 | NN | RT | RA | RB | RC | mode 010 |Rc|
201 uint_xlen_t bmset(RA, RB, sh)
203 int shamt = RB & (XLEN - 1);
205 return RA | (mask << shamt);
208 uint_xlen_t bmclr(RA, RB, sh)
210 int shamt = RB & (XLEN - 1);
212 return RA & ~(mask << shamt);
215 uint_xlen_t bminv(RA, RB, sh)
217 int shamt = RB & (XLEN - 1);
219 return RA ^ (mask << shamt);
222 uint_xlen_t bmext(RA, RB, sh)
224 int shamt = RB & (XLEN - 1);
226 return mask & (RA >> shamt);
235 uint64_t grev64(uint64_t RA, uint64_t RB)
239 if (shamt & 1) x = ((x & 0x5555555555555555LL) << 1) |
240 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
241 if (shamt & 2) x = ((x & 0x3333333333333333LL) << 2) |
242 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
243 if (shamt & 4) x = ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
244 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
245 if (shamt & 8) x = ((x & 0x00FF00FF00FF00FFLL) << 8) |
246 ((x & 0xFF00FF00FF00FF00LL) >> 8);
247 if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
248 ((x & 0xFFFF0000FFFF0000LL) >> 16);
249 if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
250 ((x & 0xFFFFFFFF00000000LL) >> 32);
256 # shuffle / unshuffle
261 uint32_t shfl32(uint32_t RA, uint32_t RB)
265 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
266 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
267 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
268 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
271 uint32_t unshfl32(uint32_t RA, uint32_t RB)
275 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
276 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
277 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
278 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
282 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
284 uint64_t x = src & ~(maskL | maskR);
285 x |= ((src << N) & maskL) | ((src >> N) & maskR);
288 uint64_t shfl64(uint64_t RA, uint64_t RB)
292 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
293 0x00000000ffff0000LL, 16);
294 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
295 0x0000ff000000ff00LL, 8);
296 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
297 0x00f000f000f000f0LL, 4);
298 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
299 0x0c0c0c0c0c0c0c0cLL, 2);
300 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
301 0x2222222222222222LL, 1);
304 uint64_t unshfl64(uint64_t RA, uint64_t RB)
308 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
309 0x2222222222222222LL, 1);
310 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
311 0x0c0c0c0c0c0c0c0cLL, 2);
312 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
313 0x00f000f000f000f0LL, 4);
314 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
315 0x0000ff000000ff00LL, 8);
316 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
317 0x00000000ffff0000LL, 16);
327 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
330 uint_xlen_t sz = 1LL << sz_log2;
331 uint_xlen_t mask = (1LL << sz) - 1;
332 for (int i = 0; i < XLEN; i += sz) {
333 uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
335 r |= ((RA >> pos) & mask) << i;
339 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
340 { return xperm(RA, RB, 2); }
341 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
342 { return xperm(RA, RB, 3); }
343 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
344 { return xperm(RA, RB, 4); }
345 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
346 { return xperm(RA, RB, 5); }
354 uint32_t gorc32(uint32_t RA, uint32_t RB)
358 if (shamt & 1) x |= ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);
359 if (shamt & 2) x |= ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);
360 if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);
361 if (shamt & 8) x |= ((x & 0x00FF00FF) << 8) | ((x & 0xFF00FF00) >> 8);
362 if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) | ((x & 0xFFFF0000) >> 16);
365 uint64_t gorc64(uint64_t RA, uint64_t RB)
369 if (shamt & 1) x |= ((x & 0x5555555555555555LL) << 1) |
370 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
371 if (shamt & 2) x |= ((x & 0x3333333333333333LL) << 2) |
372 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
373 if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
374 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
375 if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL) << 8) |
376 ((x & 0xFF00FF00FF00FF00LL) >> 8);
377 if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL) << 16) |
378 ((x & 0xFFFF0000FFFF0000LL) >> 16);
379 if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL) << 32) |
380 ((x & 0xFFFFFFFF00000000LL) >> 32);
388 based on RV bitmanip, covered by ternary bitops
391 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
392 return (RA & RB) | (RC & ~RB);
401 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
404 for (int i = 0; i < XLEN; i++)
409 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
412 for (int i = 1; i < XLEN; i++)
417 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
420 for (int i = 0; i < XLEN; i++)
422 x ^= RA >> (XLEN-i-1);
430 uint_xlen_t crc32(uint_xlen_t x, int nbits)
432 for (int i = 0; i < nbits; i++)
433 x = (x >> 1) ^ (0xEDB88320 & ~((x&1)-1));
436 uint_xlen_t crc32c(uint_xlen_t x, int nbits)
438 for (int i = 0; i < nbits; i++)
439 x = (x >> 1) ^ (0x82F63B78 & ~((x&1)-1));
442 uint_xlen_t crc32_b(uint_xlen_t RA) { return crc32(RA, 8); }
443 uint_xlen_t crc32_h(uint_xlen_t RA) { return crc32(RA, 16); }
444 uint_xlen_t crc32_w(uint_xlen_t RA) { return crc32(RA, 32); }
445 uint_xlen_t crc32c_b(uint_xlen_t RA) { return crc32c(RA, 8); }
446 uint_xlen_t crc32c_h(uint_xlen_t RA) { return crc32c(RA, 16); }
447 uint_xlen_t crc32c_w(uint_xlen_t RA) { return crc32c(RA, 32); }
449 uint_xlen_t crc32_d (uint_xlen_t RA) { return crc32 (RA, 64); }
450 uint_xlen_t crc32c_d(uint_xlen_t RA) { return crc32c(RA, 64); }
457 uint64_t bmatflip(uint64_t RA)
465 uint64_t bmatxor(uint64_t RA, uint64_t RB)
468 uint64_t RBt = bmatflip(RB);
469 uint8_t u[8]; // rows of RA
470 uint8_t v[8]; // cols of RB
471 for (int i = 0; i < 8; i++) {
476 for (int i = 0; i < 64; i++) {
477 if (pcnt(u[i / 8] & v[i % 8]) & 1)
482 uint64_t bmator(uint64_t RA, uint64_t RB)
485 uint64_t RBt = bmatflip(RB);
486 uint8_t u[8]; // rows of RA
487 uint8_t v[8]; // cols of RB
488 for (int i = 0; i < 8; i++) {
493 for (int i = 0; i < 64; i++) {
494 if ((u[i / 8] & v[i % 8]) != 0)