(no commit message)
[libreriscv.git] / openpower / sv / bitmanip.mdwn
1 [[!tag standards]]
2
3 # summary
4
5 1-op and variants
6
7 | dest | src1 | subop | op |
8 | ---- | ---- | ----- | -------- |
9 | RT | RA | .. | bmatflip |
10 | RT | RA | size | crc32 |
11 | RT | RA | size | crc32c |
12
13 2-op and variants
14
15 | dest | src1 | src2 | subop | op |
16 | ---- | ---- | ---- | ----- | -------- |
17 | RT | RA | RB | or | bmatflip |
18 | RT | RA | RB | xor | bmatflip |
19 | RT | RA | RB | bdep | dep/ext |
20 | RT | RA | RB | bext | dep/ext |
21 | RT | RA | RB | | grev |
22 | RT | RA | RB | | gorc |
23 | RT | RA | RB | shuf | shuffle |
24 | RT | RA | RB | unshuf| shuffle |
25 | RT | RA | RB | width | xperm |
26 | RT | RA | RB | type | clmul |
27 | RT | RA | RB | type | minmax |
28 | RT | RA | RB | | |
29 | RT | RA | RB | | |
30 | RT | RA | RB | | |
31
32 3 ops
33
34 * bitmask swt/extract
35 * ternary bitops
36
37 ops
38
39 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24..30 |31| name |
40 | -- | -- | --- | --- | ----- | -- | ------- |--| ---- |
41 | NN | RA | RB | | | 0 | 0000110 |Rc| rsvd |
42 | NN | RA | RB | RC | itype | 1 | 0000110 |Rc| xperm |
43 | NN | RA | RB | RC | itype | 0 | 0100110 |Rc| minmax |
44 | NN | RA | RB | | | 1 | 0100110 |Rc| rsvd |
45 | NN | RA | RB | sh | itype | SH | 1000110 |Rc| bmopsi |
46 | NN | RA | RB | | | | 1100110 |Rc| rsvd |
47 | NN | RA | RB | RC | itype | 0 | 0001110 |Rc| clmul |
48 | NN | RA | RB | sh | itype | 0 | 0101110 |Rc| clmulw |
49 | NN | RA | RB | RC | 00 | 0 | 0010110 |Rc| gorc |
50 | NN | RA | RB | sh | 00 | SH | 1010110 |Rc| gorci |
51 | NN | RA | RB | RC | 00 | 0 | 0110110 |Rc| gorcw |
52 | NN | RA | RB | sh | 00 | 0 | 1110110 |Rc| gorcwi |
53 | NN | RA | RB | RC | 00 | 1 | 1110110 |Rc| bmator |
54 | NN | RA | RB | RC | 01 | 0 | 0010110 |Rc| grev |
55 | NN | RA | RB | sh | 01 | SH | 1010110 |Rc| grevi |
56 | NN | RA | RB | RC | 01 | 0 | 0110110 |Rc| grevw |
57 | NN | RA | RB | sh | 01 | 0 | 1110110 |Rc| grevwi |
58 | NN | RA | RB | RC | 01 | 1 | 1110110 |Rc| bmatxor |
59 | NN | RA | RB | RC | 10 | 0 | 0010110 |Rc| shfl |
60 | NN | RA | RB | sh | 10 | SH | 1010110 |Rc| shfli |
61 | NN | RA | RB | RC | 10 | 0 | 0110110 |Rc| shflw |
62 | NN | RA | RB | RC | 10 | 0 | 1110110 |Rc| bdep |
63 | NN | RA | RB | RC | 10 | 1 | 1110110 |Rc| bext |
64 | NN | RA | RB | | 11 | | 1110110 |Rc| rsvd |
65 | NN | RA | RB | | | | NN11110 |Rc| rsvd |
66
67 # bit to byte permute
68
69 similar to matrix permute in RV bitmanip, which has XOR and OR variants
70
71 do j = 0 to 7
72 do k = 0 to 7
73 b = VSR[VRB+32].dword[i].byte[k].bit[j]
74 VSR[VRT+32].dword[i].byte[j].bit[k] = b
75
76 # vector bit deposit
77
78 vpdepd VRT,VRA,VRB, identical to RV bitmamip bdep
79
80 do while(m < 64)
81 if VSR[VRB+32].dword[i].bit[63-m]=1 then do
82 result = VSR[VRA+32].dword[i].bit[63-k]
83 VSR[VRT+32].dword[i].bit[63-m] = result
84 k = k + 1
85 m = m + 1
86
87 ```
88
89 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
90 {
91 uint_xlen_t r = 0;
92 for (int i = 0, j = 0; i < XLEN; i++)
93 if ((RB >> i) & 1) {
94 if ((RA >> j) & 1)
95 r |= uint_xlen_t(1) << i;
96 j++;
97 }
98 return r;
99 }
100
101 ```
102
103 # vector bit extract
104
105 other way round: identical to RV bext
106
107 ```
108 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
109 {
110 uint_xlen_t r = 0;
111 for (int i = 0, j = 0; i < XLEN; i++)
112 if ((RB >> i) & 1) {
113 if ((RA >> i) & 1)
114 r |= uint_xlen_t(1) << j;
115 j++;
116 }
117 return r;
118 }
119 ```
120
121 # int min/max
122
123 signed and unsigned min/max for integer. this is sort-of partly synthesiseable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned. when the dest is also one of the srces and the mv fails due to the CR bittest failing this will only overwrite the dest where the src is greater (or less).
124
125 signed/unsigned min/max gives more flexibility.
126
127 # ternary bitops
128
129 Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8bit immediate, or in another register
130
131 | 0.5|6.10|11.15|16.20| 21..25| 26..30 |31|
132 | -- | -- | --- | --- | ----- | -------- |--|
133 | NN | RT | RA | RB | im0-4 | im5-7 00 |Rc|
134
135 for i in range(64):
136 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
137 RT[i] = (imm & (1<<idx)) != 0
138
139 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
140
141 a 4 operand variant which becomes more along the lines of an FPGA:
142
143 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
144 | -- | -- | --- | --- | --- | ------- |--|
145 | NN | RT | RA | RB | RC | mode 1 |1 |
146
147 for i in range(64):
148 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
149 RT[i] = (RC & (1<<idx)) != 0
150
151 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul.
152
153 also, another possible variant involving swizzle and vec4:
154
155 for i in range(8):
156 idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
157 RT[i] = (RA.w[i] & (1<<idx)) != 0
158
159 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
160 | -- | -- | --- | ----- | ---- | ----- |--|
161 | NN | RT | RA | xyzw | mask | mode 1 |1 |
162
163 for i in range(8):
164 idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
165 res = (RA.w[i] & (1<<idx)) != 0
166 for j in range(4):
167 if mask[j]: RT[i+j*8] = res
168
169 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
170 | -- | -- | --- | ----- | ---- | ----- |--|
171 | NN | RT | RA | imm | mask | mode 1 |1 |
172
173 for i in range(8):
174 idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
175 res = (imm & (1<<idx)) != 0
176 for j in range(3):
177 if mask[j]: RT[i+j*8] = res
178
179 another mode selection would be CRs not Ints.
180
181 | 0.5|6.8 | 9.11|12.14|15.17|18.20| 21..25| 26.29|30|31|
182 | -- | -- | --- | --- | --- |-----| ----- | ---- |--|--|
183 | NN | BT | BA | BB | BC |im5-7| im0-4 | mask |1 |0 |
184
185 for i in range(4):
186 if not mask[i] continue
187 idx = crregs[BA][i] << 2 |
188 crregs[BB][i] << 1 |
189 crregs[BC][i]
190 crregs[BT][i] = (imm & (1<<idx)) != 0
191
192 # bitmask set
193
194 based on RV bitmanip singlebit set, instruction format similar to shift
195 [[isa/fixedshift]]. bmext is actually covered already (shift-with-mask).
196 however bitmask-invert is not, and set/clr are not covered, although they can ise the same Shift ALU.
197
198 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
199 | -- | -- | --- | --- | --- | ------- |--|
200 | NN | RT | RA | RB | RC | mode 010 |Rc|
201
202 ```
203 uint_xlen_t bmset(RA, RB, sh)
204 {
205 int shamt = RB & (XLEN - 1);
206 mask = (2<<sh)-1;
207 return RA | (mask << shamt);
208 }
209
210 uint_xlen_t bmclr(RA, RB, sh)
211 {
212 int shamt = RB & (XLEN - 1);
213 mask = (2<<sh)-1;
214 return RA & ~(mask << shamt);
215 }
216
217 uint_xlen_t bminv(RA, RB, sh)
218 {
219 int shamt = RB & (XLEN - 1);
220 mask = (2<<sh)-1;
221 return RA ^ (mask << shamt);
222 }
223
224 uint_xlen_t bmext(RA, RB, sh)
225 {
226 int shamt = RB & (XLEN - 1);
227 mask = (2<<sh)-1;
228 return mask & (RA >> shamt);
229 }
230 ```
231
232 # grev
233
234 based on RV bitmanip
235
236 ```
237 uint64_t grev64(uint64_t RA, uint64_t RB)
238 {
239 uint64_t x = RA;
240 int shamt = RB & 63;
241 if (shamt & 1) x = ((x & 0x5555555555555555LL) << 1) |
242 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
243 if (shamt & 2) x = ((x & 0x3333333333333333LL) << 2) |
244 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
245 if (shamt & 4) x = ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
246 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
247 if (shamt & 8) x = ((x & 0x00FF00FF00FF00FFLL) << 8) |
248 ((x & 0xFF00FF00FF00FF00LL) >> 8);
249 if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
250 ((x & 0xFFFF0000FFFF0000LL) >> 16);
251 if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
252 ((x & 0xFFFFFFFF00000000LL) >> 32);
253 return x;
254 }
255
256 ```
257
258 # shuffle / unshuffle
259
260 based on RV bitmanip
261
262 ```
263 uint32_t shfl32(uint32_t RA, uint32_t RB)
264 {
265 uint32_t x = RA;
266 int shamt = RB & 15;
267 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
268 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
269 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
270 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
271 return x;
272 }
273 uint32_t unshfl32(uint32_t RA, uint32_t RB)
274 {
275 uint32_t x = RA;
276 int shamt = RB & 15;
277 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
278 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
279 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
280 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
281 return x;
282 }
283
284 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
285 {
286 uint64_t x = src & ~(maskL | maskR);
287 x |= ((src << N) & maskL) | ((src >> N) & maskR);
288 return x;
289 }
290 uint64_t shfl64(uint64_t RA, uint64_t RB)
291 {
292 uint64_t x = RA;
293 int shamt = RB & 31;
294 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
295 0x00000000ffff0000LL, 16);
296 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
297 0x0000ff000000ff00LL, 8);
298 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
299 0x00f000f000f000f0LL, 4);
300 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
301 0x0c0c0c0c0c0c0c0cLL, 2);
302 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
303 0x2222222222222222LL, 1);
304 return x;
305 }
306 uint64_t unshfl64(uint64_t RA, uint64_t RB)
307 {
308 uint64_t x = RA;
309 int shamt = RB & 31;
310 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
311 0x2222222222222222LL, 1);
312 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
313 0x0c0c0c0c0c0c0c0cLL, 2);
314 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
315 0x00f000f000f000f0LL, 4);
316 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
317 0x0000ff000000ff00LL, 8);
318 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
319 0x00000000ffff0000LL, 16);
320 return x;
321 }
322 ```
323
324 # xperm
325
326 based on RV bitmanip
327
328 ```
329 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
330 {
331 uint_xlen_t r = 0;
332 uint_xlen_t sz = 1LL << sz_log2;
333 uint_xlen_t mask = (1LL << sz) - 1;
334 for (int i = 0; i < XLEN; i += sz) {
335 uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
336 if (pos < XLEN)
337 r |= ((RA >> pos) & mask) << i;
338 }
339 return r;
340 }
341 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
342 { return xperm(RA, RB, 2); }
343 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
344 { return xperm(RA, RB, 3); }
345 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
346 { return xperm(RA, RB, 4); }
347 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
348 { return xperm(RA, RB, 5); }
349 ```
350
351 # gorc
352
353 based on RV bitmanip
354
355 ```
356 uint32_t gorc32(uint32_t RA, uint32_t RB)
357 {
358 uint32_t x = RA;
359 int shamt = RB & 31;
360 if (shamt & 1) x |= ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);
361 if (shamt & 2) x |= ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);
362 if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);
363 if (shamt & 8) x |= ((x & 0x00FF00FF) << 8) | ((x & 0xFF00FF00) >> 8);
364 if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) | ((x & 0xFFFF0000) >> 16);
365 return x;
366 }
367 uint64_t gorc64(uint64_t RA, uint64_t RB)
368 {
369 uint64_t x = RA;
370 int shamt = RB & 63;
371 if (shamt & 1) x |= ((x & 0x5555555555555555LL) << 1) |
372 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
373 if (shamt & 2) x |= ((x & 0x3333333333333333LL) << 2) |
374 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
375 if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
376 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
377 if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL) << 8) |
378 ((x & 0xFF00FF00FF00FF00LL) >> 8);
379 if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL) << 16) |
380 ((x & 0xFFFF0000FFFF0000LL) >> 16);
381 if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL) << 32) |
382 ((x & 0xFFFFFFFF00000000LL) >> 32);
383 return x;
384 }
385
386 ```
387
388 # cmix
389
390 based on RV bitmanip, covered by ternary bitops
391
392 ```
393 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
394 return (RA & RB) | (RC & ~RB);
395 }
396 ```
397
398 # carryless mul
399
400 based on RV bitmanip
401 see https://en.wikipedia.org/wiki/CLMUL_instruction_set
402
403 ```
404 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
405 {
406 uint_xlen_t x = 0;
407 for (int i = 0; i < XLEN; i++)
408 if ((RB >> i) & 1)
409 x ^= RA << i;
410 return x;
411 }
412 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
413 {
414 uint_xlen_t x = 0;
415 for (int i = 1; i < XLEN; i++)
416 if ((RB >> i) & 1)
417 x ^= RA >> (XLEN-i);
418 return x;
419 }
420 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
421 {
422 uint_xlen_t x = 0;
423 for (int i = 0; i < XLEN; i++)
424 if ((RB >> i) & 1)
425 x ^= RA >> (XLEN-i-1);
426 return x;
427 }
428 ```
429 # Galois Field Multiply
430
431 this requires 3 parameters and a "degree"
432
433 RT = GFMUL(RA, RB, gfdegree, modulo=RC)
434
435 realistically with the degree also needing to be an immediate it should be brought down to an overwrite version:
436
437 RS = GFMUL(RS, RA, gfdegree, modulo=RB)
438
439 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
440 | -- | -- | --- | --- | --- | ------- |--|
441 | NN | RS | RA | RB | deg | | |
442
443 where the SimpleV variant may override RS-as-src differently from RS-as-dest
444
445
446 ```
447 from functools import reduce
448
449 # constants used in the multGF2 function
450 mask1 = mask2 = polyred = None
451
452 def setGF2(degree, irPoly):
453 """Define parameters of binary finite field GF(2^m)/g(x)
454 - degree: extension degree of binary field
455 - irPoly: coefficients of irreducible polynomial g(x)
456 """
457 def i2P(sInt):
458 """Convert an integer into a polynomial"""
459 return [(sInt >> i) & 1
460 for i in reversed(range(sInt.bit_length()))]
461
462 global mask1, mask2, polyred
463 mask1 = mask2 = 1 << degree
464 mask2 -= 1
465 polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])
466
467 def multGF2(p1, p2):
468 """Multiply two polynomials in GF(2^m)/g(x)"""
469 p = 0
470 while p2:
471 if p2 & 1:
472 p ^= p1
473 p1 <<= 1
474 if p1 & mask1:
475 p1 ^= polyred
476 p2 >>= 1
477 return p & mask2
478
479 if __name__ == "__main__":
480
481 # Define binary field GF(2^3)/x^3 + x + 1
482 setGF2(3, 0b1011)
483
484 # Evaluate the product (x^2 + x + 1)(x^2 + 1)
485 print("{:02x}".format(multGF2(0b111, 0b101)))
486
487 # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
488 # (used in the Advanced Encryption Standard-AES)
489 setGF2(8, 0b100011011)
490
491 # Evaluate the product (x^7)(x^7 + x + 1)
492 print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
493 ```
494
495 # crc
496
497 ```
498 uint_xlen_t crc32(uint_xlen_t x, int nbits)
499 {
500 for (int i = 0; i < nbits; i++)
501 x = (x >> 1) ^ (0xEDB88320 & ~((x&1)-1));
502 return x;
503 }
504 uint_xlen_t crc32c(uint_xlen_t x, int nbits)
505 {
506 for (int i = 0; i < nbits; i++)
507 x = (x >> 1) ^ (0x82F63B78 & ~((x&1)-1));
508 return x;
509 }
510 uint_xlen_t crc32_b(uint_xlen_t RA) { return crc32(RA, 8); }
511 uint_xlen_t crc32_h(uint_xlen_t RA) { return crc32(RA, 16); }
512 uint_xlen_t crc32_w(uint_xlen_t RA) { return crc32(RA, 32); }
513 uint_xlen_t crc32c_b(uint_xlen_t RA) { return crc32c(RA, 8); }
514 uint_xlen_t crc32c_h(uint_xlen_t RA) { return crc32c(RA, 16); }
515 uint_xlen_t crc32c_w(uint_xlen_t RA) { return crc32c(RA, 32); }
516 #if XLEN > 32
517 uint_xlen_t crc32_d (uint_xlen_t RA) { return crc32 (RA, 64); }
518 uint_xlen_t crc32c_d(uint_xlen_t RA) { return crc32c(RA, 64); }
519 #endif
520 ```
521
522 # bitmatrix
523
524 ```
525 uint64_t bmatflip(uint64_t RA)
526 {
527 uint64_t x = RA;
528 x = shfl64(x, 31);
529 x = shfl64(x, 31);
530 x = shfl64(x, 31);
531 return x;
532 }
533 uint64_t bmatxor(uint64_t RA, uint64_t RB)
534 {
535 // transpose of RB
536 uint64_t RBt = bmatflip(RB);
537 uint8_t u[8]; // rows of RA
538 uint8_t v[8]; // cols of RB
539 for (int i = 0; i < 8; i++) {
540 u[i] = RA >> (i*8);
541 v[i] = RBt >> (i*8);
542 }
543 uint64_t x = 0;
544 for (int i = 0; i < 64; i++) {
545 if (pcnt(u[i / 8] & v[i % 8]) & 1)
546 x |= 1LL << i;
547 }
548 return x;
549 }
550 uint64_t bmator(uint64_t RA, uint64_t RB)
551 {
552 // transpose of RB
553 uint64_t RBt = bmatflip(RB);
554 uint8_t u[8]; // rows of RA
555 uint8_t v[8]; // cols of RB
556 for (int i = 0; i < 8; i++) {
557 u[i] = RA >> (i*8);
558 v[i] = RBt >> (i*8);
559 }
560 uint64_t x = 0;
561 for (int i = 0; i < 64; i++) {
562 if ((u[i / 8] & v[i % 8]) != 0)
563 x |= 1LL << i;
564 }
565 return x;
566 }
567
568 ```