fc88baf2e3ad6d0aa21d752fadf4e50b12ba3f09
[libreriscv.git] / openpower / sv / bitmanip.mdwn
1 [[!tag standards]]
2
3 # summary
4
5 1-op and variants
6
7 | dest | src1 | subop | op |
8 | ---- | ---- | ----- | -------- |
9 | RT | RA | .. | bmatflip |
10 | RT | RA | size | crc32 |
11 | RT | RA | size | crc32c |
12
13 2-op and variants
14
15 | dest | src1 | src2 | subop | op |
16 | ---- | ---- | ---- | ----- | -------- |
17 | RT | RA | RB | or | bmatflip |
18 | RT | RA | RB | xor | bmatflip |
19 | RT | RA | RB | bdep | dep/ext |
20 | RT | RA | RB | bext | dep/ext |
21 | RT | RA | RB | | grev |
22 | RT | RA | RB | | gorc |
23 | RT | RA | RB | shuf | shuffle |
24 | RT | RA | RB | unshuf| shuffle |
25 | RT | RA | RB | width | xperm |
26 | RT | RA | RB | type | clmul |
27 | RT | RA | RB | type | minmax |
28 | RT | RA | RB | | |
29 | RT | RA | RB | | |
30 | RT | RA | RB | | |
31
32 3 ops
33
34 * bitmask swt/extract
35 * ternary bitops
36
37 ops
38
39 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24..30 |31| name |
40 | -- | -- | --- | --- | ----- | -- | ------- |--| ---- |
41 | NN | RA | RB | | | 0 | 0000110 |Rc| rsvd |
42 | NN | RA | RB | RC | itype | 1 | 0000110 |Rc| xperm |
43 | NN | RA | RB | RC | itype | 0 | 0100110 |Rc| minmax |
44 | NN | RA | RB | | | 1 | 0100110 |Rc| rsvd |
45 | NN | RA | RB | sh | itype | SH | 1000110 |Rc| bmopsi |
46 | NN | RA | RB | | | | 1100110 |Rc| rsvd |
47 | NN | RA | RB | RC | itype | 0 | 0001110 |Rc| clmul |
48 | NN | RA | RB | sh | itype | 0 | 0101110 |Rc| clmulw |
49 | NN | RA | RB | RC | 00 | 0 | 0010110 |Rc| gorc |
50 | NN | RA | RB | sh | 00 | SH | 1010110 |Rc| gorci |
51 | NN | RA | RB | RC | 00 | 0 | 0110110 |Rc| gorcw |
52 | NN | RA | RB | sh | 00 | 0 | 1110110 |Rc| gorcwi |
53 | NN | RA | RB | RC | 00 | 1 | 1110110 |Rc| bmator |
54 | NN | RA | RB | RC | 01 | 0 | 0010110 |Rc| grev |
55 | NN | RA | RB | sh | 01 | SH | 1010110 |Rc| grevi |
56 | NN | RA | RB | RC | 01 | 0 | 0110110 |Rc| grevw |
57 | NN | RA | RB | sh | 01 | 0 | 1110110 |Rc| grevwi |
58 | NN | RA | RB | RC | 01 | 1 | 1110110 |Rc| bmatxor |
59 | NN | RA | RB | RC | 10 | 0 | 0010110 |Rc| shfl |
60 | NN | RA | RB | sh | 10 | SH | 1010110 |Rc| shfli |
61 | NN | RA | RB | RC | 10 | 0 | 0110110 |Rc| shflw |
62 | NN | RA | RB | RC | 10 | 0 | 1110110 |Rc| bdep |
63 | NN | RA | RB | RC | 10 | 1 | 1110110 |Rc| bext |
64 | NN | RA | RB | | 11 | | 1110110 |Rc| rsvd |
65 | NN | RA | RB | | | | NN11110 |Rc| rsvd |
66
67 # bit to byte permute
68
69 similar to matrix permute in RV bitmanip, which has XOR and OR variants
70
71 do j = 0 to 7
72 do k = 0 to 7
73 b = VSR[VRB+32].dword[i].byte[k].bit[j]
74 VSR[VRT+32].dword[i].byte[j].bit[k] = b
75
76 # vector bit deposit
77
78 vpdepd VRT,VRA,VRB, identical to RV bitmamip bdep
79
80 do while(m < 64)
81 if VSR[VRB+32].dword[i].bit[63-m]=1 then do
82 result = VSR[VRA+32].dword[i].bit[63-k]
83 VSR[VRT+32].dword[i].bit[63-m] = result
84 k = k + 1
85 m = m + 1
86
87 ```
88
89 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
90 {
91 uint_xlen_t r = 0;
92 for (int i = 0, j = 0; i < XLEN; i++)
93 if ((RB >> i) & 1) {
94 if ((RA >> j) & 1)
95 r |= uint_xlen_t(1) << i;
96 j++;
97 }
98 return r;
99 }
100
101 ```
102
103 # vector bit extract
104
105 other way round: identical to RV bext
106
107 ```
108 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
109 {
110 uint_xlen_t r = 0;
111 for (int i = 0, j = 0; i < XLEN; i++)
112 if ((RB >> i) & 1) {
113 if ((RA >> i) & 1)
114 r |= uint_xlen_t(1) << j;
115 j++;
116 }
117 return r;
118 }
119 ```
120
121 # int min/max
122
123 signed and unsigned min/max for integer. this is sort-of partly synthesiseable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned. when the dest is also one of the srces and the mv fails due to the CR bittest failing this will only overwrite the dest where the src is greater (or less).
124
125 signed/unsigned min/max gives more flexibility.
126
127 # ternary bitops
128
129 Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8bit immediate, or in another register
130
131 | 0.5|6.10|11.15|16.20| 21..25| 26..30 |31|
132 | -- | -- | --- | --- | ----- | -------- |--|
133 | NN | RT | RA | RB | im0-4 | im5-7 00 |Rc|
134
135 for i in range(64):
136 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
137 RT[i] = (imm & (1<<idx)) != 0
138
139 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
140
141 a 4 operand variant which becomes more along the lines of an FPGA:
142
143 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
144 | -- | -- | --- | --- | --- | ------- |--|
145 | NN | RT | RA | RB | RC | mode 1 |1 |
146
147 for i in range(64):
148 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
149 RT[i] = (RC & (1<<idx)) != 0
150
151 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul.
152
153 also, another possible variant involving swizzle and vec4:
154
155 for i in range(8):
156 idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
157 RT[i] = (RA.w[i] & (1<<idx)) != 0
158
159 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
160 | -- | -- | --- | ----- | ---- | ----- |--|
161 | NN | RT | RA | xyzw | mask | mode 1 |1 |
162
163 for i in range(8):
164 idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
165 res = (RA.w[i] & (1<<idx)) != 0
166 for j in range(4):
167 if mask[j]: RT[i+j*8] = res
168
169 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
170 | -- | -- | --- | ----- | ---- | ----- |--|
171 | NN | RT | RA | imm | mask | mode 1 |1 |
172
173 for i in range(8):
174 idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
175 res = (imm & (1<<idx)) != 0
176 for j in range(3):
177 if mask[j]: RT[i+j*8] = res
178
179 another mode selection would be CRs not Ints.
180
181 | 0.5|6.8 | 9.11|12.14|15.17|18.20| 21..25| 26.29|30|31|
182 | -- | -- | --- | --- | --- |-----| ----- | ---- |--|--|
183 | NN | BT | BA | BB | BC |im5-7| im0-4 | mask |1 |0 |
184
185 for i in range(4):
186 if not mask[i] continue
187 idx = crregs[BA][i] << 2 |
188 crregs[BB][i] << 1 |
189 crregs[BC][i]
190 crregs[BT][i] = (imm & (1<<idx)) != 0
191
192 # bitmask set
193
194 based on RV bitmanip singlebit set, instruction format similar to shift
195
196 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
197 | -- | -- | --- | --- | --- | ------- |--|
198 | NN | RT | RA | RB | RC | mode 010 |Rc|
199
200 ```
201 uint_xlen_t bmset(RA, RB, sh)
202 {
203 int shamt = RB & (XLEN - 1);
204 mask = (2<<sh)-1;
205 return RA | (mask << shamt);
206 }
207
208 uint_xlen_t bmclr(RA, RB, sh)
209 {
210 int shamt = RB & (XLEN - 1);
211 mask = (2<<sh)-1;
212 return RA & ~(mask << shamt);
213 }
214
215 uint_xlen_t bminv(RA, RB, sh)
216 {
217 int shamt = RB & (XLEN - 1);
218 mask = (2<<sh)-1;
219 return RA ^ (mask << shamt);
220 }
221
222 uint_xlen_t bmext(RA, RB, sh)
223 {
224 int shamt = RB & (XLEN - 1);
225 mask = (2<<sh)-1;
226 return mask & (RA >> shamt);
227 }
228 ```
229
230 # grev
231
232 based on RV bitmanip
233
234 ```
235 uint64_t grev64(uint64_t RA, uint64_t RB)
236 {
237 uint64_t x = RA;
238 int shamt = RB & 63;
239 if (shamt & 1) x = ((x & 0x5555555555555555LL) << 1) |
240 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
241 if (shamt & 2) x = ((x & 0x3333333333333333LL) << 2) |
242 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
243 if (shamt & 4) x = ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
244 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
245 if (shamt & 8) x = ((x & 0x00FF00FF00FF00FFLL) << 8) |
246 ((x & 0xFF00FF00FF00FF00LL) >> 8);
247 if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
248 ((x & 0xFFFF0000FFFF0000LL) >> 16);
249 if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
250 ((x & 0xFFFFFFFF00000000LL) >> 32);
251 return x;
252 }
253
254 ```
255
256 # shuffle / unshuffle
257
258 based on RV bitmanip
259
260 ```
261 uint32_t shfl32(uint32_t RA, uint32_t RB)
262 {
263 uint32_t x = RA;
264 int shamt = RB & 15;
265 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
266 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
267 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
268 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
269 return x;
270 }
271 uint32_t unshfl32(uint32_t RA, uint32_t RB)
272 {
273 uint32_t x = RA;
274 int shamt = RB & 15;
275 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
276 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
277 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
278 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
279 return x;
280 }
281
282 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
283 {
284 uint64_t x = src & ~(maskL | maskR);
285 x |= ((src << N) & maskL) | ((src >> N) & maskR);
286 return x;
287 }
288 uint64_t shfl64(uint64_t RA, uint64_t RB)
289 {
290 uint64_t x = RA;
291 int shamt = RB & 31;
292 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
293 0x00000000ffff0000LL, 16);
294 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
295 0x0000ff000000ff00LL, 8);
296 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
297 0x00f000f000f000f0LL, 4);
298 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
299 0x0c0c0c0c0c0c0c0cLL, 2);
300 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
301 0x2222222222222222LL, 1);
302 return x;
303 }
304 uint64_t unshfl64(uint64_t RA, uint64_t RB)
305 {
306 uint64_t x = RA;
307 int shamt = RB & 31;
308 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
309 0x2222222222222222LL, 1);
310 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
311 0x0c0c0c0c0c0c0c0cLL, 2);
312 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
313 0x00f000f000f000f0LL, 4);
314 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
315 0x0000ff000000ff00LL, 8);
316 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
317 0x00000000ffff0000LL, 16);
318 return x;
319 }
320 ```
321
322 # xperm
323
324 based on RV bitmanip
325
326 ```
327 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
328 {
329 uint_xlen_t r = 0;
330 uint_xlen_t sz = 1LL << sz_log2;
331 uint_xlen_t mask = (1LL << sz) - 1;
332 for (int i = 0; i < XLEN; i += sz) {
333 uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
334 if (pos < XLEN)
335 r |= ((RA >> pos) & mask) << i;
336 }
337 return r;
338 }
339 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
340 { return xperm(RA, RB, 2); }
341 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
342 { return xperm(RA, RB, 3); }
343 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
344 { return xperm(RA, RB, 4); }
345 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
346 { return xperm(RA, RB, 5); }
347 ```
348
349 # gorc
350
351 based on RV bitmanip
352
353 ```
354 uint32_t gorc32(uint32_t RA, uint32_t RB)
355 {
356 uint32_t x = RA;
357 int shamt = RB & 31;
358 if (shamt & 1) x |= ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);
359 if (shamt & 2) x |= ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);
360 if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);
361 if (shamt & 8) x |= ((x & 0x00FF00FF) << 8) | ((x & 0xFF00FF00) >> 8);
362 if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) | ((x & 0xFFFF0000) >> 16);
363 return x;
364 }
365 uint64_t gorc64(uint64_t RA, uint64_t RB)
366 {
367 uint64_t x = RA;
368 int shamt = RB & 63;
369 if (shamt & 1) x |= ((x & 0x5555555555555555LL) << 1) |
370 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
371 if (shamt & 2) x |= ((x & 0x3333333333333333LL) << 2) |
372 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
373 if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
374 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
375 if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL) << 8) |
376 ((x & 0xFF00FF00FF00FF00LL) >> 8);
377 if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL) << 16) |
378 ((x & 0xFFFF0000FFFF0000LL) >> 16);
379 if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL) << 32) |
380 ((x & 0xFFFFFFFF00000000LL) >> 32);
381 return x;
382 }
383
384 ```
385
386 # cmix
387
388 based on RV bitmanip, covered by ternary bitops
389
390 ```
391 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
392 return (RA & RB) | (RC & ~RB);
393 }
394 ```
395
396 # carryless mul
397
398 based on RV bitmanip
399
400 ```
401 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
402 {
403 uint_xlen_t x = 0;
404 for (int i = 0; i < XLEN; i++)
405 if ((RB >> i) & 1)
406 x ^= RA << i;
407 return x;
408 }
409 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
410 {
411 uint_xlen_t x = 0;
412 for (int i = 1; i < XLEN; i++)
413 if ((RB >> i) & 1)
414 x ^= RA >> (XLEN-i);
415 return x;
416 }
417 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
418 {
419 uint_xlen_t x = 0;
420 for (int i = 0; i < XLEN; i++)
421 if ((RB >> i) & 1)
422 x ^= RA >> (XLEN-i-1);
423 return x;
424 }
425 ```
426
427 # crc
428
429 ```
430 uint_xlen_t crc32(uint_xlen_t x, int nbits)
431 {
432 for (int i = 0; i < nbits; i++)
433 x = (x >> 1) ^ (0xEDB88320 & ~((x&1)-1));
434 return x;
435 }
436 uint_xlen_t crc32c(uint_xlen_t x, int nbits)
437 {
438 for (int i = 0; i < nbits; i++)
439 x = (x >> 1) ^ (0x82F63B78 & ~((x&1)-1));
440 return x;
441 }
442 uint_xlen_t crc32_b(uint_xlen_t RA) { return crc32(RA, 8); }
443 uint_xlen_t crc32_h(uint_xlen_t RA) { return crc32(RA, 16); }
444 uint_xlen_t crc32_w(uint_xlen_t RA) { return crc32(RA, 32); }
445 uint_xlen_t crc32c_b(uint_xlen_t RA) { return crc32c(RA, 8); }
446 uint_xlen_t crc32c_h(uint_xlen_t RA) { return crc32c(RA, 16); }
447 uint_xlen_t crc32c_w(uint_xlen_t RA) { return crc32c(RA, 32); }
448 #if XLEN > 32
449 uint_xlen_t crc32_d (uint_xlen_t RA) { return crc32 (RA, 64); }
450 uint_xlen_t crc32c_d(uint_xlen_t RA) { return crc32c(RA, 64); }
451 #endif
452 ```
453
454 # bitmatrix
455
456 ```
457 uint64_t bmatflip(uint64_t RA)
458 {
459 uint64_t x = RA;
460 x = shfl64(x, 31);
461 x = shfl64(x, 31);
462 x = shfl64(x, 31);
463 return x;
464 }
465 uint64_t bmatxor(uint64_t RA, uint64_t RB)
466 {
467 // transpose of RB
468 uint64_t RBt = bmatflip(RB);
469 uint8_t u[8]; // rows of RA
470 uint8_t v[8]; // cols of RB
471 for (int i = 0; i < 8; i++) {
472 u[i] = RA >> (i*8);
473 v[i] = RBt >> (i*8);
474 }
475 uint64_t x = 0;
476 for (int i = 0; i < 64; i++) {
477 if (pcnt(u[i / 8] & v[i % 8]) & 1)
478 x |= 1LL << i;
479 }
480 return x;
481 }
482 uint64_t bmator(uint64_t RA, uint64_t RB)
483 {
484 // transpose of RB
485 uint64_t RBt = bmatflip(RB);
486 uint8_t u[8]; // rows of RA
487 uint8_t v[8]; // cols of RB
488 for (int i = 0; i < 8; i++) {
489 u[i] = RA >> (i*8);
490 v[i] = RBt >> (i*8);
491 }
492 uint64_t x = 0;
493 for (int i = 0; i < 64; i++) {
494 if ((u[i / 8] & v[i % 8]) != 0)
495 x |= 1LL << i;
496 }
497 return x;
498 }
499
500 ```