7fec1087fb0e44ecc0a8d1c8f0e97f9997763cd5
[libreriscv.git] / openpower / sv / bitmanip.mdwn
1 [[!tag standards]]
2
3 # summary
4
5 1-op and variants
6
7 | dest | src1 | subop | op |
8 | ---- | ---- | ----- | -------- |
9 | RT | RA | .. | bmatflip |
10 | RT | RA | size | crc32 |
11 | RT | RA | size | crc32c |
12
13 2-op and variants
14
15 | dest | src1 | src2 | subop | op |
16 | ---- | ---- | ---- | ----- | -------- |
17 | RT | RA | RB | or | bmatflip |
18 | RT | RA | RB | xor | bmatflip |
19 | RT | RA | RB | bdep | dep/ext |
20 | RT | RA | RB | bext | dep/ext |
21 | RT | RA | RB | | grev |
22 | RT | RA | RB | | gorc |
23 | RT | RA | RB | shuf | shuffle |
24 | RT | RA | RB | unshuf| shuffle |
25 | RT | RA | RB | width | xperm |
26 | RT | RA | RB | type | clmul |
27 | RT | RA | RB | type | minmax |
28 | RT | RA | RB | | |
29 | RT | RA | RB | | |
30 | RT | RA | RB | | |
31
32 3 ops
33
34 * bitmask swt/extract
35 * ternary bitops
36 * GF
37
38 | 0.5|6.10|11.15|16.20|21.25| 26....30 |31| name |
39 | -- | -- | --- | --- | --- | -------- |--| ------ |
40 | NN | RT | RA | RB | RC | mode 001 |Rc| ternary |
41
42 ops
43
44 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24....30 |31| name |
45 | -- | -- | --- | --- | ----- | -- | -------- |--| ---- |
46 | NN | RA | RB | | | 0 | 0000 110 |Rc| rsvd |
47 | NN | RA | RB | RC | itype | 1 | 0000 110 |Rc| xperm |
48 | NN | RA | RB | RC | itype | 0 | 0100 110 |Rc| minmax |
49 | NN | RA | RB | | | 1 | 0100 110 |Rc| rsvd |
50 | NN | RA | RB | sh | itype | SH | 1000 110 |Rc| bmopsi |
51 | NN | RA | RB | | | | 1100 110 |Rc| rsvd |
52 | NN | RA | RB | | | 0 | 0001 110 |Rc| rsvd |
53 | NN | RA | RB | | | 0 | 0101 110 |Rc| rsvd |
54 | NN | RA | RB | RC | 00 | 0 | 0010 110 |Rc| gorc |
55 | NN | RA | RB | sh | 00 | SH | 1010 110 |Rc| gorci |
56 | NN | RA | RB | RC | 00 | 0 | 0110 110 |Rc| gorcw |
57 | NN | RA | RB | sh | 00 | 0 | 1110 110 |Rc| gorcwi |
58 | NN | RA | RB | RC | 00 | 1 | 1110 110 |Rc| bmator |
59 | NN | RA | RB | RC | 01 | 0 | 0010 110 |Rc| grev |
60 | NN | RA | RB | sh | 01 | SH | 1010 110 |Rc| grevi |
61 | NN | RA | RB | RC | 01 | 0 | 0110 110 |Rc| grevw |
62 | NN | RA | RB | sh | 01 | 0 | 1110 110 |Rc| grevwi |
63 | NN | RA | RB | RC | 01 | 1 | 1110 110 |Rc| bmatxor |
64 | NN | RA | RB | RC | 10 | 0 | 0010 110 |Rc| shfl |
65 | NN | RA | RB | sh | 10 | SH | 1010 110 |Rc| shfli |
66 | NN | RA | RB | RC | 10 | 0 | 0110 110 |Rc| shflw |
67 | NN | RA | RB | RC | 10 | 0 | 1110 110 |Rc| bdep |
68 | NN | RA | RB | RC | 10 | 1 | 1110 110 |Rc| bext |
69 | NN | RA | RB | | 11 | | 1110 110 |Rc| rsvd |
70 | NN | RA | RB | | | | NN11 110 |Rc| rsvd |
71
72 # bit to byte permute
73
74 similar to matrix permute in RV bitmanip, which has XOR and OR variants
75
76 do j = 0 to 7
77 do k = 0 to 7
78 b = VSR[VRB+32].dword[i].byte[k].bit[j]
79 VSR[VRT+32].dword[i].byte[j].bit[k] = b
80
81 # vector bit deposit
82
83 vpdepd VRT,VRA,VRB, identical to RV bitmamip bdep
84
85 do while(m < 64)
86 if VSR[VRB+32].dword[i].bit[63-m]=1 then do
87 result = VSR[VRA+32].dword[i].bit[63-k]
88 VSR[VRT+32].dword[i].bit[63-m] = result
89 k = k + 1
90 m = m + 1
91
92 ```
93
94 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
95 {
96 uint_xlen_t r = 0;
97 for (int i = 0, j = 0; i < XLEN; i++)
98 if ((RB >> i) & 1) {
99 if ((RA >> j) & 1)
100 r |= uint_xlen_t(1) << i;
101 j++;
102 }
103 return r;
104 }
105
106 ```
107
108 # vector bit extract
109
110 other way round: identical to RV bext
111
112 ```
113 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
114 {
115 uint_xlen_t r = 0;
116 for (int i = 0, j = 0; i < XLEN; i++)
117 if ((RB >> i) & 1) {
118 if ((RA >> i) & 1)
119 r |= uint_xlen_t(1) << j;
120 j++;
121 }
122 return r;
123 }
124 ```
125
126 # int min/max
127
128 signed and unsigned min/max for integer. this is sort-of partly synthesiseable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned. when the dest is also one of the srces and the mv fails due to the CR bittest failing this will only overwrite the dest where the src is greater (or less).
129
130 signed/unsigned min/max gives more flexibility.
131
132 # ternary bitops
133
134 Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8bit immediate, or in another register
135
136 | 0.5|6.10|11.15|16.20| 21..25| 26..30 |31|
137 | -- | -- | --- | --- | ----- | -------- |--|
138 | NN | RT | RA | RB | im0-4 | im5-7 00 |Rc|
139
140 for i in range(64):
141 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
142 RT[i] = (imm & (1<<idx)) != 0
143
144 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
145
146 a 4 operand variant which becomes more along the lines of an FPGA:
147
148 | 0.5|6.10|11.15|16.20|21.25| 26...30 |31|
149 | -- | -- | --- | --- | --- | -------- |--|
150 | NN | RT | RA | RB | RC | mode 001 |Rc|
151
152 for i in range(64):
153 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
154 RT[i] = (RC & (1<<idx)) != 0
155
156 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul,
157 3 modes.
158
159 also, another possible variant involving swizzle and vec4:
160
161 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
162 | -- | -- | --- | ----- | ---- | ----- |--|
163 | NN | RT | RA | imm | mask | 101 |1 |
164
165 for i in range(8):
166 idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
167 res = (imm & (1<<idx)) != 0
168 for j in range(3):
169 if mask[j]: RT[i+j*8] = res
170
171 another mode selection would be CRs not Ints.
172
173 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
174 | -- | -- | --- | --- |- |-----|----- | -----|--|
175 | NN | BA | BB | BC |0 |imm | mask | 101 |0 |
176
177 for i in range(4):
178 if not mask[i] continue
179 idx = crregs[BA][i] << 2 |
180 crregs[BB][i] << 1 |
181 crregs[BC][i]
182 crregs[BA][i] = (imm & (1<<idx)) != 0
183
184 # bitmask set
185
186 based on RV bitmanip singlebit set, instruction format similar to shift
187 [[isa/fixedshift]]. bmext is actually covered already (shift-with-mask).
188 however bitmask-invert is not, and set/clr are not covered, although they can ise the same Shift ALU.
189
190 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
191 | -- | -- | --- | --- | --- | ------- |--|
192 | NN | RT | RA | RB | RC | mode 010 |Rc|
193
194 ```
195 uint_xlen_t bmset(RA, RB, sh)
196 {
197 int shamt = RB & (XLEN - 1);
198 mask = (2<<sh)-1;
199 return RA | (mask << shamt);
200 }
201
202 uint_xlen_t bmclr(RA, RB, sh)
203 {
204 int shamt = RB & (XLEN - 1);
205 mask = (2<<sh)-1;
206 return RA & ~(mask << shamt);
207 }
208
209 uint_xlen_t bminv(RA, RB, sh)
210 {
211 int shamt = RB & (XLEN - 1);
212 mask = (2<<sh)-1;
213 return RA ^ (mask << shamt);
214 }
215
216 uint_xlen_t bmext(RA, RB, sh)
217 {
218 int shamt = RB & (XLEN - 1);
219 mask = (2<<sh)-1;
220 return mask & (RA >> shamt);
221 }
222 ```
223
224 # grev
225
226 based on RV bitmanip
227
228 ```
229 uint64_t grev64(uint64_t RA, uint64_t RB)
230 {
231 uint64_t x = RA;
232 int shamt = RB & 63;
233 if (shamt & 1) x = ((x & 0x5555555555555555LL) << 1) |
234 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
235 if (shamt & 2) x = ((x & 0x3333333333333333LL) << 2) |
236 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
237 if (shamt & 4) x = ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
238 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
239 if (shamt & 8) x = ((x & 0x00FF00FF00FF00FFLL) << 8) |
240 ((x & 0xFF00FF00FF00FF00LL) >> 8);
241 if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
242 ((x & 0xFFFF0000FFFF0000LL) >> 16);
243 if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
244 ((x & 0xFFFFFFFF00000000LL) >> 32);
245 return x;
246 }
247
248 ```
249
250 # shuffle / unshuffle
251
252 based on RV bitmanip
253
254 ```
255 uint32_t shfl32(uint32_t RA, uint32_t RB)
256 {
257 uint32_t x = RA;
258 int shamt = RB & 15;
259 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
260 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
261 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
262 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
263 return x;
264 }
265 uint32_t unshfl32(uint32_t RA, uint32_t RB)
266 {
267 uint32_t x = RA;
268 int shamt = RB & 15;
269 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
270 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
271 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
272 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
273 return x;
274 }
275
276 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
277 {
278 uint64_t x = src & ~(maskL | maskR);
279 x |= ((src << N) & maskL) | ((src >> N) & maskR);
280 return x;
281 }
282 uint64_t shfl64(uint64_t RA, uint64_t RB)
283 {
284 uint64_t x = RA;
285 int shamt = RB & 31;
286 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
287 0x00000000ffff0000LL, 16);
288 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
289 0x0000ff000000ff00LL, 8);
290 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
291 0x00f000f000f000f0LL, 4);
292 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
293 0x0c0c0c0c0c0c0c0cLL, 2);
294 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
295 0x2222222222222222LL, 1);
296 return x;
297 }
298 uint64_t unshfl64(uint64_t RA, uint64_t RB)
299 {
300 uint64_t x = RA;
301 int shamt = RB & 31;
302 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
303 0x2222222222222222LL, 1);
304 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
305 0x0c0c0c0c0c0c0c0cLL, 2);
306 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
307 0x00f000f000f000f0LL, 4);
308 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
309 0x0000ff000000ff00LL, 8);
310 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
311 0x00000000ffff0000LL, 16);
312 return x;
313 }
314 ```
315
316 # xperm
317
318 based on RV bitmanip
319
320 ```
321 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
322 {
323 uint_xlen_t r = 0;
324 uint_xlen_t sz = 1LL << sz_log2;
325 uint_xlen_t mask = (1LL << sz) - 1;
326 for (int i = 0; i < XLEN; i += sz) {
327 uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
328 if (pos < XLEN)
329 r |= ((RA >> pos) & mask) << i;
330 }
331 return r;
332 }
333 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
334 { return xperm(RA, RB, 2); }
335 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
336 { return xperm(RA, RB, 3); }
337 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
338 { return xperm(RA, RB, 4); }
339 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
340 { return xperm(RA, RB, 5); }
341 ```
342
343 # gorc
344
345 based on RV bitmanip
346
347 ```
348 uint32_t gorc32(uint32_t RA, uint32_t RB)
349 {
350 uint32_t x = RA;
351 int shamt = RB & 31;
352 if (shamt & 1) x |= ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);
353 if (shamt & 2) x |= ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);
354 if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);
355 if (shamt & 8) x |= ((x & 0x00FF00FF) << 8) | ((x & 0xFF00FF00) >> 8);
356 if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) | ((x & 0xFFFF0000) >> 16);
357 return x;
358 }
359 uint64_t gorc64(uint64_t RA, uint64_t RB)
360 {
361 uint64_t x = RA;
362 int shamt = RB & 63;
363 if (shamt & 1) x |= ((x & 0x5555555555555555LL) << 1) |
364 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
365 if (shamt & 2) x |= ((x & 0x3333333333333333LL) << 2) |
366 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
367 if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
368 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
369 if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL) << 8) |
370 ((x & 0xFF00FF00FF00FF00LL) >> 8);
371 if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL) << 16) |
372 ((x & 0xFFFF0000FFFF0000LL) >> 16);
373 if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL) << 32) |
374 ((x & 0xFFFFFFFF00000000LL) >> 32);
375 return x;
376 }
377
378 ```
379
380 # cmix
381
382 based on RV bitmanip, covered by ternary bitops
383
384 ```
385 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
386 return (RA & RB) | (RC & ~RB);
387 }
388 ```
389
390 # carryless mul
391
392 based on RV bitmanip
393 see https://en.wikipedia.org/wiki/CLMUL_instruction_set
394
395 ```
396 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
397 {
398 uint_xlen_t x = 0;
399 for (int i = 0; i < XLEN; i++)
400 if ((RB >> i) & 1)
401 x ^= RA << i;
402 return x;
403 }
404 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
405 {
406 uint_xlen_t x = 0;
407 for (int i = 1; i < XLEN; i++)
408 if ((RB >> i) & 1)
409 x ^= RA >> (XLEN-i);
410 return x;
411 }
412 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
413 {
414 uint_xlen_t x = 0;
415 for (int i = 0; i < XLEN; i++)
416 if ((RB >> i) & 1)
417 x ^= RA >> (XLEN-i-1);
418 return x;
419 }
420 ```
421 # Galois Field
422
423 ## Multiply
424
425 this requires 3 parameters and a "degree"
426
427 RT = GFMUL(RA, RB, gfdegree, modulo=RC)
428
429 realistically with the degree also needing to be an immediate it should be brought down to an overwrite version:
430
431 RS = GFMUL(RS, RA, gfdegree, modulo=RB)
432
433 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
434 | -- | -- | --- | --- | --- | ------- |--|
435 | NN | RS | RA | RB | deg | 00 011 |Rc|
436
437 where the SimpleV variant may override RS-as-src differently from RS-as-dest
438
439
440
441 ```
442 from functools import reduce
443
444 # constants used in the multGF2 function
445 mask1 = mask2 = polyred = None
446
447 def setGF2(degree, irPoly):
448 """Define parameters of binary finite field GF(2^m)/g(x)
449 - degree: extension degree of binary field
450 - irPoly: coefficients of irreducible polynomial g(x)
451 """
452 def i2P(sInt):
453 """Convert an integer into a polynomial"""
454 return [(sInt >> i) & 1
455 for i in reversed(range(sInt.bit_length()))]
456
457 global mask1, mask2, polyred
458 mask1 = mask2 = 1 << degree
459 mask2 -= 1
460 polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])
461
462 def multGF2(p1, p2):
463 """Multiply two polynomials in GF(2^m)/g(x)"""
464 p = 0
465 while p2:
466 if p2 & 1:
467 p ^= p1
468 p1 <<= 1
469 if p1 & mask1:
470 p1 ^= polyred
471 p2 >>= 1
472 return p & mask2
473
474 if __name__ == "__main__":
475
476 # Define binary field GF(2^3)/x^3 + x + 1
477 setGF2(3, 0b1011)
478
479 # Evaluate the product (x^2 + x + 1)(x^2 + 1)
480 print("{:02x}".format(multGF2(0b111, 0b101)))
481
482 # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
483 # (used in the Advanced Encryption Standard-AES)
484 setGF2(8, 0b100011011)
485
486 # Evaluate the product (x^7)(x^7 + x + 1)
487 print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
488 ```
489 ## GF add
490
491 RS = GFADD(RS, RA|0, gfdegree, modulo=RB)
492
493 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
494 | -- | -- | --- | --- | --- | ------- |--|
495 | NN | RS | RA | RB | deg | 01 011 |Rc|
496
497 ## gf invert
498
499 ```
500 def gf_degree(a) :
501 res = 0
502 a >>= 1
503 while (a != 0) :
504 a >>= 1;
505 res += 1;
506 return res
507
508 def gf_invert(a, mod=0x1B) :
509 v = mod
510 g1 = 1
511 g2 = 0
512 j = gf_degree(a) - 8
513
514 while (a != 1) :
515 if (j < 0) :
516 a, v = v, a
517 g1, g2 = g2, g1
518 j = -j
519
520 a ^= v << j
521 g1 ^= g2 << j
522
523 a %= 256 # Emulating 8-bit overflow
524 g1 %= 256 # Emulating 8-bit overflow
525
526 j = gf_degree(a) - gf_degree(v)
527
528 return g1
529 ```
530
531 # crc
532
533 * <https://stackoverflow.com/questions/21171733/calculating-constants-for-crc32-using-pclmulqdq>
534 * <https://en.wikipedia.org/wiki/Cyclic_redundancy_check>
535
536 ```
537 uint_xlen_t crc32(uint_xlen_t x, int nbits)
538 {
539 for (int i = 0; i < nbits; i++)
540 x = (x >> 1) ^ (0xEDB88320 & ~((x&1)-1));
541 return x;
542 }
543 uint_xlen_t crc32c(uint_xlen_t x, int nbits)
544 {
545 for (int i = 0; i < nbits; i++)
546 x = (x >> 1) ^ (0x82F63B78 & ~((x&1)-1));
547 return x;
548 }
549 uint_xlen_t crc32_b(uint_xlen_t RA) { return crc32(RA, 8); }
550 uint_xlen_t crc32_h(uint_xlen_t RA) { return crc32(RA, 16); }
551 uint_xlen_t crc32_w(uint_xlen_t RA) { return crc32(RA, 32); }
552 uint_xlen_t crc32c_b(uint_xlen_t RA) { return crc32c(RA, 8); }
553 uint_xlen_t crc32c_h(uint_xlen_t RA) { return crc32c(RA, 16); }
554 uint_xlen_t crc32c_w(uint_xlen_t RA) { return crc32c(RA, 32); }
555 #if XLEN > 32
556 uint_xlen_t crc32_d (uint_xlen_t RA) { return crc32 (RA, 64); }
557 uint_xlen_t crc32c_d(uint_xlen_t RA) { return crc32c(RA, 64); }
558 #endif
559 ```
560
561 # bitmatrix
562
563 ```
564 uint64_t bmatflip(uint64_t RA)
565 {
566 uint64_t x = RA;
567 x = shfl64(x, 31);
568 x = shfl64(x, 31);
569 x = shfl64(x, 31);
570 return x;
571 }
572 uint64_t bmatxor(uint64_t RA, uint64_t RB)
573 {
574 // transpose of RB
575 uint64_t RBt = bmatflip(RB);
576 uint8_t u[8]; // rows of RA
577 uint8_t v[8]; // cols of RB
578 for (int i = 0; i < 8; i++) {
579 u[i] = RA >> (i*8);
580 v[i] = RBt >> (i*8);
581 }
582 uint64_t x = 0;
583 for (int i = 0; i < 64; i++) {
584 if (pcnt(u[i / 8] & v[i % 8]) & 1)
585 x |= 1LL << i;
586 }
587 return x;
588 }
589 uint64_t bmator(uint64_t RA, uint64_t RB)
590 {
591 // transpose of RB
592 uint64_t RBt = bmatflip(RB);
593 uint8_t u[8]; // rows of RA
594 uint8_t v[8]; // cols of RB
595 for (int i = 0; i < 8; i++) {
596 u[i] = RA >> (i*8);
597 v[i] = RBt >> (i*8);
598 }
599 uint64_t x = 0;
600 for (int i = 0; i < 64; i++) {
601 if ((u[i / 8] & v[i % 8]) != 0)
602 x |= 1LL << i;
603 }
604 return x;
605 }
606
607 ```