[[!tag standards]]

# summary

1-op and variants

| dest | src1 | subop | op |
| ---- | ---- | ----- | -------- |
| RT | RA | .. | bmatflip |
| RT | RA | size | crc32 |
| RT | RA | size | crc32c |

2-op and variants

| dest | src1 | src2 | subop | op |
| ---- | ---- | ---- | ----- | -------- |
| RT | RA | RB | or | bmatflip |
| RT | RA | RB | xor | bmatflip |
| RT | RA | RB | bdep | dep/ext |
| RT | RA | RB | bext | dep/ext |
| RT | RA | RB | | grev |
| RT | RA | RB | | gorc |
| RT | RA | RB | shuf | shuffle |
| RT | RA | RB | unshuf| shuffle |
| RT | RA | RB | width | xperm |
| RT | RA | RB | type | clmul |
| RT | RA | RB | type | minmax |
| RT | RA | RB | | |
| RT | RA | RB | | |
| RT | RA | RB | | |

3-op and variants

* bitmask set/extract
* ternary bitops

ops

| 0.5|6.10|11.15|16.20| 21.22 | 23 | 24..30 |31| name |
| -- | -- | --- | --- | ----- | -- | ------- |--| ---- |
| NN | RA | RB | | | 0 | 0000110 |Rc| rsvd |
| NN | RA | RB | RC | itype | 1 | 0000110 |Rc| xperm |
| NN | RA | RB | RC | itype | 0 | 0100110 |Rc| minmax |
| NN | RA | RB | | | 1 | 0100110 |Rc| rsvd |
| NN | RA | RB | sh | itype | SH | 1000110 |Rc| bmopsi |
| NN | RA | RB | | | | 1100110 |Rc| rsvd |
| NN | RA | RB | RC | itype | 0 | 0001110 |Rc| clmul |
| NN | RA | RB | sh | itype | 0 | 0101110 |Rc| clmulw |
| NN | RA | RB | RC | 00 | 0 | 0010110 |Rc| gorc |
| NN | RA | RB | sh | 00 | SH | 1010110 |Rc| gorci |
| NN | RA | RB | RC | 00 | 0 | 0110110 |Rc| gorcw |
| NN | RA | RB | sh | 00 | 0 | 1110110 |Rc| gorcwi |
| NN | RA | RB | RC | 00 | 1 | 1110110 |Rc| bmator |
| NN | RA | RB | RC | 01 | 0 | 0010110 |Rc| grev |
| NN | RA | RB | sh | 01 | SH | 1010110 |Rc| grevi |
| NN | RA | RB | RC | 01 | 0 | 0110110 |Rc| grevw |
| NN | RA | RB | sh | 01 | 0 | 1110110 |Rc| grevwi |
| NN | RA | RB | RC | 01 | 1 | 1110110 |Rc| bmatxor |
| NN | RA | RB | RC | 10 | 0 | 0010110 |Rc| shfl |
| NN | RA | RB | sh | 10 | SH | 1010110 |Rc| shfli |
| NN | RA | RB | RC | 10 | 0 | 0110110 |Rc| shflw |
| NN | RA | RB | RC | 10 | 0 | 1110110 |Rc| bdep |
| NN | RA | RB | RC | 10 | 1 | 1110110 |Rc| bext |
| NN | RA | RB | | 11 | | 1110110 |Rc| rsvd |
| NN | RA | RB | | | | NN11110 |Rc| rsvd |

# bit to byte permute

similar to the bit-matrix permute in RV bitmanip, which has XOR and OR variants. for each doubleword i of the source register, the 8x8 bit matrix is transposed:

    do j = 0 to 7
        do k = 0 to 7
            b = VSR[VRB+32].dword[i].byte[k].bit[j]
            VSR[VRT+32].dword[i].byte[j].bit[k] = b

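a scalar C sketch of the same per-doubleword transpose (the function name here is illustrative only):

```
uint64_t bit_byte_permute(uint64_t rb)
{
    // 8x8 bit-matrix transpose: bit j of byte k moves to bit k of byte j
    uint64_t rt = 0;
    for (int j = 0; j < 8; j++)
        for (int k = 0; k < 8; k++) {
            uint64_t b = (rb >> (k*8 + j)) & 1;
            rt |= b << (j*8 + k);
        }
    return rt;
}
```
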
# vector bit deposit

vpdepd VRT,VRA,VRB, identical to RV bitmanip bdep. k and m start at 0 for each doubleword i:

    do while(m < 64)
        if VSR[VRB+32].dword[i].bit[63-m]=1 then do
            result = VSR[VRA+32].dword[i].bit[63-k]
            VSR[VRT+32].dword[i].bit[63-m] = result
            k = k + 1
        m = m + 1

```
uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t r = 0;
    for (int i = 0, j = 0; i < XLEN; i++)
        if ((RB >> i) & 1) {
            if ((RA >> j) & 1)
                r |= (uint_xlen_t)1 << i;
            j++;
        }
    return r;
}
```

# vector bit extract

the other way around: identical to RV bext

```
uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t r = 0;
    for (int i = 0, j = 0; i < XLEN; i++)
        if ((RB >> i) & 1) {
            if ((RA >> i) & 1)
                r |= (uint_xlen_t)1 << j;
            j++;
        }
    return r;
}
```

# int min/max

signed and unsigned min/max for integers. this is partly synthesiseable in [[sv/svp64]] with pred-result, as long as the dest reg is one of the sources, though not both signed and unsigned. when the dest is also one of the sources and the mv fails due to the CR bit-test failing, this will only overwrite the dest where the src is greater (or less).

dedicated signed/unsigned min/max gives more flexibility.

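a minimal sketch of the four variants the minmax entry could provide, assuming the 2-bit itype field selects between them (int_xlen_t is an assumed signed counterpart of uint_xlen_t):

```
/* illustrative sketch: four helpers standing in for the single minmax op
   whose itype field would select among them */
uint_xlen_t minu(uint_xlen_t RA, uint_xlen_t RB) { return RA < RB ? RA : RB; }
uint_xlen_t maxu(uint_xlen_t RA, uint_xlen_t RB) { return RA > RB ? RA : RB; }
int_xlen_t  mins(int_xlen_t  RA, int_xlen_t  RB) { return RA < RB ? RA : RB; }
int_xlen_t  maxs(int_xlen_t  RA, int_xlen_t  RB) { return RA > RB ? RA : RB; }
```
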
# ternary bitops

Similar to FPGA LUTs: for every bit, perform a lookup into a table using an 8-bit immediate, or an 8-bit value held in another register.

| 0.5|6.10|11.15|16.20| 21..25| 26..30 |31|
| -- | -- | --- | --- | ----- | -------- |--|
| NN | RT | RA | RB | im0-4 | im5-7 00 |Rc|

    for i in range(64):
        idx = RT[i] << 2 | RA[i] << 1 | RB[i]
        RT[i] = (imm & (1<<idx)) != 0

bits 21..22 may be used to specify a mode, such as treating the whole integer as zero/nonzero and putting 1/0 in the result, rather than testing bitwise.

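a scalar C sketch of the immediate form, assuming the 8-bit LUT is the concatenation of the im0-4 and im5-7 fields (the name ternlogi is illustrative only):

```
uint64_t ternlogi(uint64_t RT, uint64_t RA, uint64_t RB, uint8_t imm)
{
    // RT is both a LUT input and the destination: each result bit is looked
    // up in the 8-bit immediate, indexed by the three corresponding input bits
    uint64_t res = 0;
    for (int i = 0; i < 64; i++) {
        int idx = (((RT >> i) & 1) << 2) |
                  (((RA >> i) & 1) << 1) |
                   ((RB >> i) & 1);
        if ((imm >> idx) & 1)
            res |= 1ULL << i;
    }
    return res; // written back to RT
}
```
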
a 4-operand variant which becomes more along the lines of an FPGA:

| 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
| -- | -- | --- | --- | --- | ------- |--|
| NN | RT | RA | RB | RC | mode 1 |1 |

    for i in range(64):
        idx = RT[i] << 2 | RA[i] << 1 | RB[i]
        RT[i] = (RC & (1<<idx)) != 0

mode (2 bit) may be used to invert the ordering, similar to carryless mul.

also, another possible variant involving swizzle and vec4:

    for i in range(8):
        idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
        RT[i] = (RA.w[i] & (1<<idx)) != 0

| 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
| -- | -- | --- | ----- | ---- | ----- |--|
| NN | RT | RA | xyzw | mask | mode 1 |1 |

    for i in range(8):
        idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
        res = (RA.w[i] & (1<<idx)) != 0
        for j in range(4):
            if mask[j]: RT[i+j*8] = res

| 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
| -- | -- | --- | ----- | ---- | ----- |--|
| NN | RT | RA | imm | mask | mode 1 |1 |

    for i in range(8):
        idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
        res = (imm & (1<<idx)) != 0
        for j in range(4):
            if mask[j]: RT[i+j*8] = res

another mode selection would use CRs rather than integer registers:

| 0.5|6.8 | 9.11|12.14|15.17|18.20| 21..25| 26.29|30|31|
| -- | -- | --- | --- | --- |-----| ----- | ---- |--|--|
| NN | BT | BA | BB | BC |im5-7| im0-4 | mask |1 |0 |

    for i in range(4):
        if not mask[i]: continue
        idx = (crregs[BA][i] << 2 |
               crregs[BB][i] << 1 |
               crregs[BC][i])
        crregs[BT][i] = (imm & (1<<idx)) != 0

# bitmask set

based on RV bitmanip singlebit set, instruction format similar to shift
[[isa/fixedshift]]. bmext is actually covered already (shift-with-mask).
however bitmask-invert is not, and set/clr are not covered, although they can use the same Shift ALU.

| 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
| -- | -- | --- | --- | --- | ------- |--|
| NN | RT | RA | RB | RC | mode 010 |Rc|

```
uint_xlen_t bmset(uint_xlen_t RA, uint_xlen_t RB, int sh)
{
    int shamt = RB & (XLEN - 1);
    uint_xlen_t mask = (2 << sh) - 1;
    return RA | (mask << shamt);
}

uint_xlen_t bmclr(uint_xlen_t RA, uint_xlen_t RB, int sh)
{
    int shamt = RB & (XLEN - 1);
    uint_xlen_t mask = (2 << sh) - 1;
    return RA & ~(mask << shamt);
}

uint_xlen_t bminv(uint_xlen_t RA, uint_xlen_t RB, int sh)
{
    int shamt = RB & (XLEN - 1);
    uint_xlen_t mask = (2 << sh) - 1;
    return RA ^ (mask << shamt);
}

uint_xlen_t bmext(uint_xlen_t RA, uint_xlen_t RB, int sh)
{
    int shamt = RB & (XLEN - 1);
    uint_xlen_t mask = (2 << sh) - 1;
    return mask & (RA >> shamt);
}
```

# grev

based on RV bitmanip

```
uint64_t grev64(uint64_t RA, uint64_t RB)
{
    uint64_t x = RA;
    int shamt = RB & 63;
    if (shamt & 1) x = ((x & 0x5555555555555555LL) << 1) |
                       ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
    if (shamt & 2) x = ((x & 0x3333333333333333LL) << 2) |
                       ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
    if (shamt & 4) x = ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
                       ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
    if (shamt & 8) x = ((x & 0x00FF00FF00FF00FFLL) << 8) |
                       ((x & 0xFF00FF00FF00FF00LL) >> 8);
    if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
                        ((x & 0xFFFF0000FFFF0000LL) >> 16);
    if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
                        ((x & 0xFFFFFFFF00000000LL) >> 32);
    return x;
}
```
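
some useful special cases, as in RV bitmanip: a shamt of 63 reverses all 64 bits, 56 reverses the byte order (byte-swap), 32 swaps the two 32-bit halves, and 7 reverses the bits within each byte.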

# shuffle / unshuffle

based on RV bitmanip

```
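// 32-bit stage helper used by shfl32/unshfl32 below; assumed to mirror
// the shuffle64_stage function given further down
uint32_t shuffle32_stage(uint32_t src, uint32_t maskL, uint32_t maskR, int N)
{
    uint32_t x = src & ~(maskL | maskR);
    x |= ((src << N) & maskL) | ((src >> N) & maskR);
    return x;
}
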
uint32_t shfl32(uint32_t RA, uint32_t RB)
{
    uint32_t x = RA;
    int shamt = RB & 15;
    if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
    if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
    if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
    if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
    return x;
}
uint32_t unshfl32(uint32_t RA, uint32_t RB)
{
    uint32_t x = RA;
    int shamt = RB & 15;
    if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
    if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
    if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
    if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
    return x;
}

uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
{
    uint64_t x = src & ~(maskL | maskR);
    x |= ((src << N) & maskL) | ((src >> N) & maskR);
    return x;
}
uint64_t shfl64(uint64_t RA, uint64_t RB)
{
    uint64_t x = RA;
    int shamt = RB & 31;
    if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
                                           0x00000000ffff0000LL, 16);
    if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
                                          0x0000ff000000ff00LL, 8);
    if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
                                          0x00f000f000f000f0LL, 4);
    if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
                                          0x0c0c0c0c0c0c0c0cLL, 2);
    if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
                                          0x2222222222222222LL, 1);
    return x;
}
uint64_t unshfl64(uint64_t RA, uint64_t RB)
{
    uint64_t x = RA;
    int shamt = RB & 31;
    if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
                                          0x2222222222222222LL, 1);
    if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
                                          0x0c0c0c0c0c0c0c0cLL, 2);
    if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
                                          0x00f000f000f000f0LL, 4);
    if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
                                          0x0000ff000000ff00LL, 8);
    if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
                                           0x00000000ffff0000LL, 16);
    return x;
}
```
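
shfl with the maximum shamt (15 for 32-bit, 31 for 64-bit) fully interleaves ("zips") the two halves of the register; unshfl with the same shamt is the inverse ("unzip").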

# xperm

based on RV bitmanip

```
uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
{
    uint_xlen_t r = 0;
    uint_xlen_t sz = 1LL << sz_log2;
    uint_xlen_t mask = (1LL << sz) - 1;
    for (int i = 0; i < XLEN; i += sz) {
        uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
        if (pos < XLEN)
            r |= ((RA >> pos) & mask) << i;
    }
    return r;
}
uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
{  return xperm(RA, RB, 2); }
uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
{  return xperm(RA, RB, 3); }
uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
{  return xperm(RA, RB, 4); }
uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
{  return xperm(RA, RB, 5); }
```
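
xperm is a table lookup / permutation: each nibble (xperm_n), byte (xperm_b), halfword (xperm_h) or word (xperm_w) of RB is an index selecting the corresponding-sized element of RA, with out-of-range indices producing zero.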

# gorc

based on RV bitmanip

```
uint32_t gorc32(uint32_t RA, uint32_t RB)
{
    uint32_t x = RA;
    int shamt = RB & 31;
    if (shamt & 1) x |= ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);
    if (shamt & 2) x |= ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);
    if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);
    if (shamt & 8) x |= ((x & 0x00FF00FF) << 8) | ((x & 0xFF00FF00) >> 8);
    if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) | ((x & 0xFFFF0000) >> 16);
    return x;
}
uint64_t gorc64(uint64_t RA, uint64_t RB)
{
    uint64_t x = RA;
    int shamt = RB & 63;
    if (shamt & 1) x |= ((x & 0x5555555555555555LL) << 1) |
                        ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
    if (shamt & 2) x |= ((x & 0x3333333333333333LL) << 2) |
                        ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
    if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
                        ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
    if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL) << 8) |
                        ((x & 0xFF00FF00FF00FF00LL) >> 8);
    if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL) << 16) |
                         ((x & 0xFFFF0000FFFF0000LL) >> 16);
    if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL) << 32) |
                         ((x & 0xFFFFFFFF00000000LL) >> 32);
    return x;
}
```
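
gorc is a generalised OR-combine: with a shamt of 7 every byte becomes 0xFF if it was non-zero and 0x00 otherwise (the orc.b operation in RV bitmanip, useful for detecting zero bytes in strings).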

# cmix

based on RV bitmanip, covered by ternary bitops

```
uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
    return (RA & RB) | (RC & ~RB);
}
```

# carryless mul

based on RV bitmanip

```
uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t x = 0;
    for (int i = 0; i < XLEN; i++)
        if ((RB >> i) & 1)
            x ^= RA << i;
    return x;
}
uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t x = 0;
    for (int i = 1; i < XLEN; i++)
        if ((RB >> i) & 1)
            x ^= RA >> (XLEN-i);
    return x;
}
uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t x = 0;
    for (int i = 0; i < XLEN; i++)
        if ((RB >> i) & 1)
            x ^= RA >> (XLEN-i-1);
    return x;
}
```
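
clmul returns the low XLEN bits of the carry-less (XOR) product and clmulh the high bits; clmulr is the bit-reversed form, equivalent to rev(clmul(rev(RA), rev(RB))), which is the variant most directly useful for CRC computation.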

# crc

```
uint_xlen_t crc32(uint_xlen_t x, int nbits)
{
    for (int i = 0; i < nbits; i++)
        x = (x >> 1) ^ (0xEDB88320 & ~((x&1)-1));
    return x;
}
uint_xlen_t crc32c(uint_xlen_t x, int nbits)
{
    for (int i = 0; i < nbits; i++)
        x = (x >> 1) ^ (0x82F63B78 & ~((x&1)-1));
    return x;
}
uint_xlen_t crc32_b(uint_xlen_t RA) { return crc32(RA, 8); }
uint_xlen_t crc32_h(uint_xlen_t RA) { return crc32(RA, 16); }
uint_xlen_t crc32_w(uint_xlen_t RA) { return crc32(RA, 32); }
uint_xlen_t crc32c_b(uint_xlen_t RA) { return crc32c(RA, 8); }
uint_xlen_t crc32c_h(uint_xlen_t RA) { return crc32c(RA, 16); }
uint_xlen_t crc32c_w(uint_xlen_t RA) { return crc32c(RA, 32); }
#if XLEN > 32
uint_xlen_t crc32_d (uint_xlen_t RA) { return crc32 (RA, 64); }
uint_xlen_t crc32c_d(uint_xlen_t RA) { return crc32c(RA, 64); }
#endif
```
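
these perform nbits rounds of the bit-reflected CRC shift, using polynomial 0xEDB88320 (CRC-32) or 0x82F63B78 (CRC-32C); the _b/_h/_w/_d forms process 8, 16, 32 or 64 bits of the register per instruction.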

# bitmatrix

```
uint64_t bmatflip(uint64_t RA)
{
    uint64_t x = RA;
    x = shfl64(x, 31);
    x = shfl64(x, 31);
    x = shfl64(x, 31);
    return x;
}
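
// population-count helper assumed by bmatxor below (counts set bits);
// bmatxor is an 8x8 bit-matrix multiply over GF(2) (AND/XOR), bmator
// the same with OR in place of XOR
int pcnt(uint64_t x)
{
    int c = 0;
    for (; x != 0; x >>= 1)
        c += x & 1;
    return c;
}
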
uint64_t bmatxor(uint64_t RA, uint64_t RB)
{
    // transpose of RB
    uint64_t RBt = bmatflip(RB);
    uint8_t u[8]; // rows of RA
    uint8_t v[8]; // cols of RB
    for (int i = 0; i < 8; i++) {
        u[i] = RA >> (i*8);
        v[i] = RBt >> (i*8);
    }
    uint64_t x = 0;
    for (int i = 0; i < 64; i++) {
        if (pcnt(u[i / 8] & v[i % 8]) & 1)
            x |= 1LL << i;
    }
    return x;
}
uint64_t bmator(uint64_t RA, uint64_t RB)
{
    // transpose of RB
    uint64_t RBt = bmatflip(RB);
    uint8_t u[8]; // rows of RA
    uint8_t v[8]; // cols of RB
    for (int i = 0; i < 8; i++) {
        u[i] = RA >> (i*8);
        v[i] = RBt >> (i*8);
    }
    uint64_t x = 0;
    for (int i = 0; i < 64; i++) {
        if ((u[i / 8] & v[i % 8]) != 0)
            x |= 1LL << i;
    }
    return x;
}
```