(no commit message)
[libreriscv.git] / openpower / sv / bitmanip.mdwn
1 [[!tag standards]]
2
3 # summary
4
5 minor opcode allocation
6
7 | 28.30 |31| name |
8 | ------ |--| --------- |
9 | 00 |Rc| ternaryi |
10 | 001 |Rc| ternary |
11 | 010 |Rc| bitmask |
12 | 011 |Rc| gf* |
13 | 101 |1 | ternaryv |
14 | 101 |0 | ternarycr |
15 | 110 |1 | 1/2-op |
16 | 111 |Rc| reserved |
17
18 1-op and variants
19
20 | dest | src1 | subop | op |
21 | ---- | ---- | ----- | -------- |
22 | RT | RA | .. | bmatflip |
23 | RT | RA | size | crc32 |
24 | RT | RA | size | crc32c |
25
26 2-op and variants
27
28 | dest | src1 | src2 | subop | op |
29 | ---- | ---- | ---- | ----- | -------- |
30 | RT | RA | RB | or | bmatflip |
31 | RT | RA | RB | xor | bmatflip |
32 | RT | RA | RB | bdep | dep/ext |
33 | RT | RA | RB | bext | dep/ext |
34 | RT | RA | RB | | grev |
35 | RT | RA | RB | | gorc |
36 | RT | RA | RB | shuf | shuffle |
37 | RT | RA | RB | unshuf| shuffle |
38 | RT | RA | RB | width | xperm |
39 | RT | RA | RB | type | minmax |
40 | RT | RA | RB | | |
41 | RT | RA | RB | | |
42 | RT | RA | RB | | |
43
44 3 ops
45
46 * bitmask set/extract
47 * ternary bitops
48 * GF
49
50 | 0.5|6.10|11.15|16.20|21..25 | 26....30 |31| name |
51 | -- | -- | --- | --- | ----- | -------- |--| ------ |
52 | NN | RT | RA | RB | RC | mode 001 |Rc| ternary |
53 | NN | RT | RA | RB | im0-4 | im5-7 00 |Rc| ternaryi |
54 | NN | RS | RA | RB | deg | 00 011 |Rc| gfmul |
55 | NN | RS | RA | RB | deg | 01 011 |Rc| gfadd |
56 | NN | RT | RA | RB | deg | 10 011 |Rc| gfinv |
57 | NN | RS | RA | RB | deg | 11 011 |Rc| gf rsvd |
58
59 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31| name |
60 | -- | -- | --- | ----- | ---- | ----- |--| ------ |
61 | NN | RT | RA | imm | mask | 101 |1 | ternaryv |
62
63 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31| name |
64 | -- | -- | --- | --- |- |-----|----- | -----|--| -------|
65 | NN | BA | BB | BC |0 |imm | mask | 101 |0 | ternarycr |
66
67 ops
68
69 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24....30 |31| name |
70 | -- | -- | --- | --- | ----- | -- | -------- |--| ---- |
71 | NN | RA | RB | | | 0 | 0000 110 |Rc| rsvd |
72 | NN | RA | RB | RC | itype | 1 | 0000 110 |Rc| xperm |
73 | NN | RA | RB | RC | itype | 0 | 0100 110 |Rc| minmax |
74 | NN | RA | RB | | | 1 | 0100 110 |Rc| rsvd |
75 | NN | RA | RB | sh | itype | SH | 1000 110 |Rc| bmopsi |
76 | NN | RA | RB | | | | 1100 110 |Rc| rsvd |
77 | NN | RA | RB | | | 0 | 0001 110 |Rc| rsvd |
78 | NN | RA | RB | | | 0 | 0101 110 |Rc| rsvd |
79 | NN | RA | RB | RC | 00 | 0 | 0010 110 |Rc| gorc |
80 | NN | RA | RB | sh | 00 | SH | 1010 110 |Rc| gorci |
81 | NN | RA | RB | RC | 00 | 0 | 0110 110 |Rc| gorcw |
82 | NN | RA | RB | sh | 00 | 0 | 1110 110 |Rc| gorcwi |
83 | NN | RA | RB | RC | 00 | 1 | 1110 110 |Rc| bmator |
84 | NN | RA | RB | RC | 01 | 0 | 0010 110 |Rc| grev |
85 | NN | RA | RB | sh | 01 | SH | 1010 110 |Rc| grevi |
86 | NN | RA | RB | RC | 01 | 0 | 0110 110 |Rc| grevw |
87 | NN | RA | RB | sh | 01 | 0 | 1110 110 |Rc| grevwi |
88 | NN | RA | RB | RC | 01 | 1 | 1110 110 |Rc| bmatxor |
89 | NN | RA | RB | RC | 10 | 0 | 0010 110 |Rc| shfl |
90 | NN | RA | RB | sh | 10 | SH | 1010 110 |Rc| shfli |
91 | NN | RA | RB | RC | 10 | 0 | 0110 110 |Rc| shflw |
92 | NN | RA | RB | RC | 10 | 0 | 1110 110 |Rc| bdep |
93 | NN | RA | RB | RC | 10 | 1 | 1110 110 |Rc| bext |
94 | NN | RA | RB | | 11 | | 1110 110 |Rc| rsvd |
95 | NN | RA | RB | | | | NN11 110 |Rc| rsvd |
96
97 # bit to byte permute
98
99 similar to matrix permute in RV bitmanip, which has XOR and OR variants
100
101 do j = 0 to 7
102 do k = 0 to 7
103 b = VSR[VRB+32].dword[i].byte[k].bit[j]
104 VSR[VRT+32].dword[i].byte[j].bit[k] = b
105
106 # vector bit deposit
107
108 vpdepd VRT,VRA,VRB, identical to RV bitmanip bdep
109
110 do while(m < 64)
111 if VSR[VRB+32].dword[i].bit[63-m]=1 then do
112 result = VSR[VRA+32].dword[i].bit[63-k]
113 VSR[VRT+32].dword[i].bit[63-m] = result
114 k = k + 1
115 m = m + 1
116
117 ```
118
119 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
120 {
121 uint_xlen_t r = 0;
122 for (int i = 0, j = 0; i < XLEN; i++)
123 if ((RB >> i) & 1) {
124 if ((RA >> j) & 1)
125 r |= uint_xlen_t(1) << i;
126 j++;
127 }
128 return r;
129 }
130
131 ```
132
133 # vector bit extract
134
135 other way round: identical to RV bext
136
137 ```
138 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
139 {
140 uint_xlen_t r = 0;
141 for (int i = 0, j = 0; i < XLEN; i++)
142 if ((RB >> i) & 1) {
143 if ((RA >> i) & 1)
144 r |= uint_xlen_t(1) << j;
145 j++;
146 }
147 return r;
148 }
149 ```
150
151 # int min/max
152
153 signed and unsigned min/max for integer. this is sort-of partly synthesisable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned. when the dest is also one of the sources and the mv fails due to the CR bittest failing, this will only overwrite the dest where the src is greater (or less).
154
155 signed/unsigned min/max gives more flexibility.
156
157 # ternary bitops
158
159 Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8bit immediate, or in another register
160
161 | 0.5|6.10|11.15|16.20| 21..25| 26..30 |31|
162 | -- | -- | --- | --- | ----- | -------- |--|
163 | NN | RT | RA | RB | im0-4 | im5-7 00 |Rc|
164
165 for i in range(64):
166 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
167 RT[i] = (imm & (1<<idx)) != 0
168
169 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
170
171 a 4 operand variant which becomes more along the lines of an FPGA:
172
173 | 0.5|6.10|11.15|16.20|21.25| 26...30 |31|
174 | -- | -- | --- | --- | --- | -------- |--|
175 | NN | RT | RA | RB | RC | mode 001 |Rc|
176
177 for i in range(64):
178 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
179 RT[i] = (RC & (1<<idx)) != 0
180
181 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul,
182 3 modes.
183
184 also, another possible variant involving swizzle and vec4:
185
186 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
187 | -- | -- | --- | ----- | ---- | ----- |--|
188 | NN | RT | RA | imm | mask | 101 |1 |
189
190 for i in range(8):
191 idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
192 res = (imm & (1<<idx)) != 0
193 for j in range(3):
194 if mask[j]: RT[i+j*8] = res
195
196 another mode selection would be CRs not Ints.
197
198 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
199 | -- | -- | --- | --- |- |-----|----- | -----|--|
200 | NN | BA | BB | BC |0 |imm | mask | 101 |0 |
201
202 for i in range(4):
203 if not mask[i]: continue
204 idx = crregs[BA][i] << 2 |
205 crregs[BB][i] << 1 |
206 crregs[BC][i]
207 crregs[BA][i] = (imm & (1<<idx)) != 0
208
209 # bitmask set
210
211 based on RV bitmanip singlebit set, instruction format similar to shift
212 [[isa/fixedshift]]. bmext is actually covered already (shift-with-mask).
213 however bitmask-invert is not, and set/clr are not covered, although they can use the same Shift ALU.
214
215 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
216 | -- | -- | --- | --- | --- | ------- |--|
217 | NN | RT | RA | RB | RC | mode 010 |Rc|
218
219 ```
220 uint_xlen_t bmset(RA, RB, sh)
221 {
222 int shamt = RB & (XLEN - 1);
223 mask = (2<<sh)-1;
224 return RA | (mask << shamt);
225 }
226
227 uint_xlen_t bmclr(RA, RB, sh)
228 {
229 int shamt = RB & (XLEN - 1);
230 mask = (2<<sh)-1;
231 return RA & ~(mask << shamt);
232 }
233
234 uint_xlen_t bminv(RA, RB, sh)
235 {
236 int shamt = RB & (XLEN - 1);
237 mask = (2<<sh)-1;
238 return RA ^ (mask << shamt);
239 }
240
241 uint_xlen_t bmext(RA, RB, sh)
242 {
243 int shamt = RB & (XLEN - 1);
244 mask = (2<<sh)-1;
245 return mask & (RA >> shamt);
246 }
247 ```
248
249 # grev
250
251 based on RV bitmanip
252
253 ```
/*
 * Generalized bit reverse, 64-bit.
 *
 * Each of the low six bits of RB enables one butterfly stage: bit k set
 * means "swap every pair of adjacent 2^k-bit groups" in RA. Only
 * RB & 63 is used. Examples: 63 reverses all bits, 56 reverses the
 * byte order, 0 returns RA unchanged.
 */
uint64_t grev64(uint64_t RA, uint64_t RB)
{
    uint64_t x = RA;
    int shamt = RB & 63;

    if (shamt & 1)
        x = ((x & 0x5555555555555555ULL) << 1) |
            ((x & 0xAAAAAAAAAAAAAAAAULL) >> 1);
    if (shamt & 2)
        x = ((x & 0x3333333333333333ULL) << 2) |
            ((x & 0xCCCCCCCCCCCCCCCCULL) >> 2);
    if (shamt & 4)
        x = ((x & 0x0F0F0F0F0F0F0F0FULL) << 4) |
            ((x & 0xF0F0F0F0F0F0F0F0ULL) >> 4);
    if (shamt & 8)
        x = ((x & 0x00FF00FF00FF00FFULL) << 8) |
            ((x & 0xFF00FF00FF00FF00ULL) >> 8);
    if (shamt & 16)
        x = ((x & 0x0000FFFF0000FFFFULL) << 16) |
            ((x & 0xFFFF0000FFFF0000ULL) >> 16);
    if (shamt & 32)
        x = ((x & 0x00000000FFFFFFFFULL) << 32) |
            ((x & 0xFFFFFFFF00000000ULL) >> 32);
    return x;
}
272
273 ```
274
275 # shuffle / unshuffle
276
277 based on RV bitmanip
278
279 ```
/*
 * One shuffle stage: bits selected by maskL are replaced by the bits N
 * positions below them, bits selected by maskR by the bits N positions
 * above them; all other bits pass through. With the mask pairs used
 * below this swaps two adjacent N-bit groups inside each 4N-bit block.
 */
uint32_t shuffle32_stage(uint32_t src, uint32_t maskL, uint32_t maskR, int N)
{
    uint32_t x = src & ~(maskL | maskR);
    x |= ((src << N) & maskL) | ((src >> N) & maskR);
    return x;
}

/*
 * Generalized shuffle, 32-bit. The low four bits of RB select which
 * stages run, applied from the widest (8) down to the narrowest (1).
 * RB = 15 interleaves the upper and lower 16-bit halves bit by bit.
 */
uint32_t shfl32(uint32_t RA, uint32_t RB)
{
    uint32_t x = RA;
    int shamt = RB & 15;

    if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
    if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
    if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
    if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
    return x;
}

/*
 * Generalized unshuffle, 32-bit: the same stages as shfl32 applied in
 * the opposite order (narrowest first), which undoes shfl32 for the
 * same RB.
 */
uint32_t unshfl32(uint32_t RA, uint32_t RB)
{
    uint32_t x = RA;
    int shamt = RB & 15;

    if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
    if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
    if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
    if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
    return x;
}
300
/*
 * One shuffle stage: bits selected by maskL are replaced by the bits N
 * positions below them, bits selected by maskR by the bits N positions
 * above them; all other bits pass through.
 */
uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
{
    uint64_t x = src & ~(maskL | maskR);
    x |= ((src << N) & maskL) | ((src >> N) & maskR);
    return x;
}

/*
 * Generalized shuffle, 64-bit. The low five bits of RB select which
 * stages run, applied from the widest (16) down to the narrowest (1).
 * RB = 31 interleaves the upper and lower 32-bit halves bit by bit.
 */
uint64_t shfl64(uint64_t RA, uint64_t RB)
{
    uint64_t x = RA;
    int shamt = RB & 31;

    if (shamt & 16)
        x = shuffle64_stage(x, 0x0000ffff00000000ULL,
                               0x00000000ffff0000ULL, 16);
    if (shamt & 8)
        x = shuffle64_stage(x, 0x00ff000000ff0000ULL,
                               0x0000ff000000ff00ULL, 8);
    if (shamt & 4)
        x = shuffle64_stage(x, 0x0f000f000f000f00ULL,
                               0x00f000f000f000f0ULL, 4);
    if (shamt & 2)
        x = shuffle64_stage(x, 0x3030303030303030ULL,
                               0x0c0c0c0c0c0c0c0cULL, 2);
    if (shamt & 1)
        x = shuffle64_stage(x, 0x4444444444444444ULL,
                               0x2222222222222222ULL, 1);
    return x;
}

/*
 * Generalized unshuffle, 64-bit: the same stages as shfl64 applied in
 * the opposite order (narrowest first), which undoes shfl64 for the
 * same RB.
 */
uint64_t unshfl64(uint64_t RA, uint64_t RB)
{
    uint64_t x = RA;
    int shamt = RB & 31;

    if (shamt & 1)
        x = shuffle64_stage(x, 0x4444444444444444ULL,
                               0x2222222222222222ULL, 1);
    if (shamt & 2)
        x = shuffle64_stage(x, 0x3030303030303030ULL,
                               0x0c0c0c0c0c0c0c0cULL, 2);
    if (shamt & 4)
        x = shuffle64_stage(x, 0x0f000f000f000f00ULL,
                               0x00f000f000f000f0ULL, 4);
    if (shamt & 8)
        x = shuffle64_stage(x, 0x00ff000000ff0000ULL,
                               0x0000ff000000ff00ULL, 8);
    if (shamt & 16)
        x = shuffle64_stage(x, 0x0000ffff00000000ULL,
                               0x00000000ffff0000ULL, 16);
    return x;
}
339 ```
340
341 # xperm
342
343 based on RV bitmanip
344
345 ```
346 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
347 {
348 uint_xlen_t r = 0;
349 uint_xlen_t sz = 1LL << sz_log2;
350 uint_xlen_t mask = (1LL << sz) - 1;
351 for (int i = 0; i < XLEN; i += sz) {
352 uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
353 if (pos < XLEN)
354 r |= ((RA >> pos) & mask) << i;
355 }
356 return r;
357 }
358 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
359 { return xperm(RA, RB, 2); }
360 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
361 { return xperm(RA, RB, 3); }
362 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
363 { return xperm(RA, RB, 4); }
364 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
365 { return xperm(RA, RB, 5); }
366 ```
367
368 # gorc
369
370 based on RV bitmanip
371
372 ```
/*
 * Generalized OR-combine, 32-bit.
 *
 * Same butterfly network as grev, but each enabled stage ORs the
 * swapped value into x instead of replacing it. Only RB & 31 is used.
 * Example: RB = 7 turns every byte that has any bit set into 0xFF.
 */
uint32_t gorc32(uint32_t RA, uint32_t RB)
{
    uint32_t x = RA;
    int shamt = RB & 31;

    if (shamt & 1)
        x |= ((x & 0x55555555u) << 1) | ((x & 0xAAAAAAAAu) >> 1);
    if (shamt & 2)
        x |= ((x & 0x33333333u) << 2) | ((x & 0xCCCCCCCCu) >> 2);
    if (shamt & 4)
        x |= ((x & 0x0F0F0F0Fu) << 4) | ((x & 0xF0F0F0F0u) >> 4);
    if (shamt & 8)
        x |= ((x & 0x00FF00FFu) << 8) | ((x & 0xFF00FF00u) >> 8);
    if (shamt & 16)
        x |= ((x & 0x0000FFFFu) << 16) | ((x & 0xFFFF0000u) >> 16);
    return x;
}
/*
 * Generalized OR-combine, 64-bit.
 *
 * Same butterfly network as grev, but each enabled stage ORs the
 * swapped value into x instead of replacing it. Only RB & 63 is used.
 * Example: RB = 7 turns every byte that has any bit set into 0xFF.
 */
uint64_t gorc64(uint64_t RA, uint64_t RB)
{
    uint64_t x = RA;
    int shamt = RB & 63;

    if (shamt & 1)
        x |= ((x & 0x5555555555555555ULL) << 1) |
             ((x & 0xAAAAAAAAAAAAAAAAULL) >> 1);
    if (shamt & 2)
        x |= ((x & 0x3333333333333333ULL) << 2) |
             ((x & 0xCCCCCCCCCCCCCCCCULL) >> 2);
    if (shamt & 4)
        x |= ((x & 0x0F0F0F0F0F0F0F0FULL) << 4) |
             ((x & 0xF0F0F0F0F0F0F0F0ULL) >> 4);
    if (shamt & 8)
        x |= ((x & 0x00FF00FF00FF00FFULL) << 8) |
             ((x & 0xFF00FF00FF00FF00ULL) >> 8);
    if (shamt & 16)
        x |= ((x & 0x0000FFFF0000FFFFULL) << 16) |
             ((x & 0xFFFF0000FFFF0000ULL) >> 16);
    if (shamt & 32)
        x |= ((x & 0x00000000FFFFFFFFULL) << 32) |
             ((x & 0xFFFFFFFF00000000ULL) >> 32);
    return x;
}
402
403 ```
404
405 # cmix
406
407 based on RV bitmanip, covered by ternary bitops
408
409 ```
410 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
411 return (RA & RB) | (RC & ~RB);
412 }
413 ```
414
415 # carryless mul
416
417 based on RV bitmanip
418 see https://en.wikipedia.org/wiki/CLMUL_instruction_set
419
420 ```
421 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
422 {
423 uint_xlen_t x = 0;
424 for (int i = 0; i < XLEN; i++)
425 if ((RB >> i) & 1)
426 x ^= RA << i;
427 return x;
428 }
429 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
430 {
431 uint_xlen_t x = 0;
432 for (int i = 1; i < XLEN; i++)
433 if ((RB >> i) & 1)
434 x ^= RA >> (XLEN-i);
435 return x;
436 }
437 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
438 {
439 uint_xlen_t x = 0;
440 for (int i = 0; i < XLEN; i++)
441 if ((RB >> i) & 1)
442 x ^= RA >> (XLEN-i-1);
443 return x;
444 }
445 ```
446 # Galois Field
447
448 ## Multiply
449
450 this requires 3 parameters and a "degree"
451
452 RT = GFMUL(RA, RB, gfdegree, modulo=RC)
453
454 realistically with the degree also needing to be an immediate it should be brought down to an overwrite version:
455
456 RS = GFMUL(RS, RA, gfdegree, modulo=RB)
457
458 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
459 | -- | -- | --- | --- | --- | ------- |--|
460 | NN | RS | RA | RB | deg | 00 011 |Rc|
461
462 where the SimpleV variant may override RS-as-src differently from RS-as-dest
463
464
465
466 ```
from functools import reduce

# constants used in the multGF2 function; filled in by setGF2()
mask1 = mask2 = polyred = None


def setGF2(degree, irPoly):
    """Define parameters of binary finite field GF(2^m)/g(x)

    - degree: extension degree of binary field
    - irPoly: coefficients of irreducible polynomial g(x)

    Must be called before multGF2(); it sets the module-level
    mask1, mask2 and polyred.
    """
    def i2P(sInt):
        """Convert an integer into a polynomial (list of bits, MSB first)"""
        return [(sInt >> i) & 1
                for i in reversed(range(sInt.bit_length()))]

    global mask1, mask2, polyred
    mask1 = mask2 = 1 << degree   # mask1: the overflow bit (bit `degree`)
    mask2 -= 1                    # mask2: the low `degree` bits
    # g(x) with its leading coefficient dropped, packed back into an int
    polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])


def multGF2(p1, p2):
    """Multiply two polynomials in GF(2^m)/g(x)"""
    p = 0
    while p2:
        if p2 & 1:
            p ^= p1
        p1 <<= 1
        if p1 & mask1:
            # overflowed past x^(m-1): reduce by g(x)
            p1 ^= polyred
        p2 >>= 1
    # stale bits at and above bit m are discarded here
    return p & mask2


if __name__ == "__main__":

    # Define binary field GF(2^3)/x^3 + x + 1
    setGF2(3, 0b1011)

    # Evaluate the product (x^2 + x + 1)(x^2 + 1)
    print("{:02x}".format(multGF2(0b111, 0b101)))

    # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
    # (used in the Advanced Encryption Standard-AES)
    setGF2(8, 0b100011011)

    # Evaluate the product (x^7)(x^7 + x + 1)
    print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
513 ```
514 ## GF add
515
516 RS = GFADD(RS, RA|0, gfdegree, modulo=RB)
517
518 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
519 | -- | -- | --- | --- | --- | ------- |--|
520 | NN | RS | RA | RB | deg | 01 011 |Rc|
521
522 ## gf invert
523
524 ```
def gf_degree(a):
    """Return the index of the highest set bit of a (0 for a == 0 or 1)."""
    res = 0
    a >>= 1
    while a != 0:
        a >>= 1
        res += 1
    return res


def gf_invert(a, mod=0x1B):
    """Multiplicative inverse of a in GF(2^8).

    - a:   field element, expected in 0..255
    - mod: low 8 bits of the reduction polynomial; the x^8 term is
           implicit (the default 0x1B matches the 0b100011011
           polynomial used for AES elsewhere on this page)

    Returns 0 for a == 0, which has no inverse; without this guard the
    loop below never reaches a == 1 and spins forever.
    """
    if a == 0:
        return 0

    v = mod
    g1 = 1
    g2 = 0
    # v stands for a degree-8 polynomial whose top bit is not stored
    j = gf_degree(a) - 8

    while a != 1:
        if j < 0:
            a, v = v, a
            g1, g2 = g2, g1
            j = -j

        a ^= v << j
        g1 ^= g2 << j

        a %= 256   # Emulating 8-bit overflow
        g1 %= 256  # Emulating 8-bit overflow

        j = gf_degree(a) - gf_degree(v)

    return g1
554 ```
555
556 # crc
557
558 * <https://stackoverflow.com/questions/21171733/calculating-constants-for-crc32-using-pclmulqdq>
559 * <https://en.wikipedia.org/wiki/Cyclic_redundancy_check>
560
561 ```
562 uint_xlen_t crc32(uint_xlen_t x, int nbits)
563 {
564 for (int i = 0; i < nbits; i++)
565 x = (x >> 1) ^ (0xEDB88320 & ~((x&1)-1));
566 return x;
567 }
568 uint_xlen_t crc32c(uint_xlen_t x, int nbits)
569 {
570 for (int i = 0; i < nbits; i++)
571 x = (x >> 1) ^ (0x82F63B78 & ~((x&1)-1));
572 return x;
573 }
574 uint_xlen_t crc32_b(uint_xlen_t RA) { return crc32(RA, 8); }
575 uint_xlen_t crc32_h(uint_xlen_t RA) { return crc32(RA, 16); }
576 uint_xlen_t crc32_w(uint_xlen_t RA) { return crc32(RA, 32); }
577 uint_xlen_t crc32c_b(uint_xlen_t RA) { return crc32c(RA, 8); }
578 uint_xlen_t crc32c_h(uint_xlen_t RA) { return crc32c(RA, 16); }
579 uint_xlen_t crc32c_w(uint_xlen_t RA) { return crc32c(RA, 32); }
580 #if XLEN > 32
581 uint_xlen_t crc32_d (uint_xlen_t RA) { return crc32 (RA, 64); }
582 uint_xlen_t crc32c_d(uint_xlen_t RA) { return crc32c(RA, 64); }
583 #endif
584 ```
585
586 # bitmatrix
587
588 ```
/*
 * Flip (transpose) RA viewed as an 8x8 bit matrix with one row per
 * byte, implemented as three full 64-bit shuffles.
 */
uint64_t bmatflip(uint64_t RA)
{
    uint64_t x = RA;
    for (int pass = 0; pass < 3; pass++)
        x = shfl64(x, 31);
    return x;
}
/*
 * 8x8 bit-matrix multiply where the products are combined with XOR:
 * result bit (row r, column c) is the parity of the bitwise AND of
 * row r of RA and column c of RB. Rows are bytes; row r occupies bits
 * 8r..8r+7.
 */
uint64_t bmatxor(uint64_t RA, uint64_t RB)
{
    // transpose of RB
    uint64_t RBt = bmatflip(RB);
    uint8_t u[8]; // rows of RA
    uint8_t v[8]; // cols of RB
    for (int i = 0; i < 8; i++) {
        u[i] = (uint8_t)(RA >> (i * 8));
        v[i] = (uint8_t)(RBt >> (i * 8));
    }
    uint64_t x = 0;
    for (int i = 0; i < 64; i++) {
        if (pcnt(u[i / 8] & v[i % 8]) & 1)
            x |= (uint64_t)1 << i;  // unsigned: 1LL << 63 is undefined
    }
    return x;
}
/*
 * 8x8 bit-matrix multiply where the products are combined with OR:
 * result bit (row r, column c) is set when row r of RA and column c of
 * RB have at least one set bit in common. Rows are bytes; row r
 * occupies bits 8r..8r+7.
 */
uint64_t bmator(uint64_t RA, uint64_t RB)
{
    // transpose of RB
    uint64_t RBt = bmatflip(RB);
    uint8_t u[8]; // rows of RA
    uint8_t v[8]; // cols of RB
    for (int i = 0; i < 8; i++) {
        u[i] = (uint8_t)(RA >> (i * 8));
        v[i] = (uint8_t)(RBt >> (i * 8));
    }
    uint64_t x = 0;
    for (int i = 0; i < 64; i++) {
        if ((u[i / 8] & v[i % 8]) != 0)
            x |= (uint64_t)1 << i;  // unsigned: 1LL << 63 is undefined
    }
    return x;
}
631
632 ```