(no commit message)
[libreriscv.git] / openpower / sv / bitmanip.mdwn
1 [[!tag standards]]
2
3 # bitmanipulation
4
5 **DRAFT STATUS**
6
7 this extension amalgamates bitmanipulation primitives from many sources, including RISC-V bitmanip, Packed SIMD, AVX-512 and OpenPOWER VSX. Vectorisation and SIMD are removed: these are straight scalar (element) operations. Vectorisation Context is provided by [[openpower/sv]].
8
9 ternaryv is experimental and is the only operation that may be considered a "Packed SIMD". It is added as a variant of the already well-justified ternary operation (done in AVX512 as an immediate only) "because it looks fun". As it is based on the LUT4 concept it will allow accelerated emulation of FPGAs. Other vendors of ISAs are buying FPGA companies to achieve a similar objective.
10
11 general-purpose Galois Field operations are added so as to avoid huge opcode proliferation across many areas of Computer Science. however for convenience and also to avoid setup costs, some of the more common operations (clmul, crc32) are also added. The expectation is that these operations would all be covered by the same pipeline.
12
13 # summary
14
15 minor opcode allocation
16
17 | 28.30 |31| name |
18 | ------ |--| --------- |
19 | 00 |Rc| ternaryi |
20 | 001 |Rc| ternary |
21 | 010 |Rc| bitmask |
22 | 011 |Rc| gf* |
23 | 101 |1 | ternaryv |
24 | 101 |0 | ternarycr |
25 | 110 |Rc| 1/2-op |
26 | 111 |Rc| bitmaski |
27
28 1-op and variants
29
30 | dest | src1 | subop | op |
31 | ---- | ---- | ----- | -------- |
32 | RT | RA | .. | bmatflip |
33
34 2-op and variants
35
36 | dest | src1 | src2 | subop | op |
37 | ---- | ---- | ---- | ----- | -------- |
38 | RT | RA | RB | or | bmatflip |
39 | RT | RA | RB | xor | bmatflip |
40 | RT | RA | RB | bdep | dep/ext |
41 | RT | RA | RB | bext | dep/ext |
42 | RT | RA | RB | | grev |
43 | RT | RA | RB | | clmul* |
44 | RT | RA | RB | | gorc |
45 | RT | RA | RB | shuf | shuffle |
46 | RT | RA | RB | unshuf| shuffle |
47 | RT | RA | RB | width | xperm |
48 | RT | RA | RB | type | minmax |
49 | RT | RA | RB | | |
50 | RT | RA | RB | | |
51 | RT | RA | RB | | |
52
53 3 ops
54
55 * bitmask set/extract
56 * ternary bitops
57 * GF
58
59 | 0.5|6.10|11.15|16.20|21..25 | 26....30 |31| name |
60 | -- | -- | --- | --- | ----- | -------- |--| ------ |
61 | NN | RT | RA | RB | RC | mode 001 |Rc| ternary |
62 | NN | RT | RA | RB | im0-4 | im5-7 00 |Rc| ternaryi |
63 | NN | RS | RA | RB | deg | 00 011 |Rc| gfmul |
64 | NN | RS | RA | RB | deg | 01 011 |Rc| gfadd |
65 | NN | RT | RA | RB | deg | 10 011 |Rc| gfinv |
66 | NN | RS | RA | RB | deg | 11 011 |Rc| gfmod |
67
68 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31| name |
69 | -- | -- | --- | ----- | ---- | ----- |--| ------ |
70 | NN | RT | RA | imm | mask | 101 |1 | ternaryv |
71
72 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31| name |
73 | -- | -- | --- | --- |- |-----|----- | -----|--| -------|
74 | NN | BA | BB | BC |0 |imm | mask | 101 |0 | ternarycr |
75
76 ops
77
78 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24....30 |31| name |
79 | -- | -- | --- | --- | ----- | -- | -------- |--| ---- |
80 | NN | RA | RB | | | 0 | 0000 110 |Rc| rsvd |
81 | NN | RA | RB | RC | itype | 1 | 0000 110 |Rc| xperm |
82 | NN | RA | RB | RC | itype | 0 | 0100 110 |Rc| minmax |
83 | NN | RA | RB | | | 1 | 0100 110 |Rc| rsvd |
84 | NN | RA | RB | sh | itype | SH | 1000 110 |Rc| bmopsi |
85 | NN | RA | RB | | | | 1100 110 |Rc| rsvd |
86 | NN | RA | RB | | | 0 | 0001 110 |Rc| rsvd |
87 | NN | RA | RB | | | 0 | 0101 110 |Rc| rsvd |
88 | NN | RA | RB | RC | 00 | 0 | 0010 110 |Rc| gorc |
89 | NN | RA | RB | sh | 00 | SH | 1010 110 |Rc| gorci |
90 | NN | RA | RB | RC | 00 | 0 | 0110 110 |Rc| gorcw |
91 | NN | RA | RB | sh | 00 | 0 | 1110 110 |Rc| gorcwi |
92 | NN | RA | RB | RC | 00 | 1 | 1110 110 |Rc| bmator |
93 | NN | RA | RB | RC | 01 | 0 | 0010 110 |Rc| grev |
94 | NN | RA | RB | RC | 01 | 1 | 0010 110 |Rc| clmul |
95 | NN | RA | RB | sh | 01 | SH | 1010 110 |Rc| grevi |
96 | NN | RA | RB | RC | 01 | 0 | 0110 110 |Rc| grevw |
97 | NN | RA | RB | sh | 01 | 0 | 1110 110 |Rc| grevwi |
98 | NN | RA | RB | RC | 01 | 1 | 1110 110 |Rc| bmatxor |
99 | NN | RA | RB | RC | 10 | 0 | 0010 110 |Rc| shfl |
100 | NN | RA | RB | sh | 10 | SH | 1010 110 |Rc| shfli |
101 | NN | RA | RB | RC | 10 | 0 | 0110 110 |Rc| shflw |
102 | NN | RA | RB | RC | 10 | 0 | 1110 110 |Rc| bdep |
103 | NN | RA | RB | RC | 10 | 1 | 1110 110 |Rc| bext |
104 | NN | RA | RB | RC | 11 | 0 | 1110 110 |Rc| clmulr |
105 | NN | RA | RB | RC | 11 | 1 | 1110 110 |Rc| clmulh |
106 | NN | RA | RB | | | | NN11 110 |Rc| rsvd |
107
108 # bit to byte permute
109
110 similar to matrix permute in RV bitmanip, which has XOR and OR variants
111
112 do j = 0 to 7
113 do k = 0 to 7
114 b = VSR[VRB+32].dword[i].byte[k].bit[j]
115 VSR[VRT+32].dword[i].byte[j].bit[k] = b
116
117 # vector bit deposit
118
119 vpdepd VRT,VRA,VRB, identical to RV bitmanip bdep
120
121 do while(m < 64)
122 if VSR[VRB+32].dword[i].bit[63-m]=1 then do
123 result = VSR[VRA+32].dword[i].bit[63-k]
124 VSR[VRT+32].dword[i].bit[63-m] = result
125 k = k + 1
126 m = m + 1
127
128 ```
129
130 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
131 {
132 uint_xlen_t r = 0;
133 for (int i = 0, j = 0; i < XLEN; i++)
134 if ((RB >> i) & 1) {
135 if ((RA >> j) & 1)
136 r |= uint_xlen_t(1) << i;
137 j++;
138 }
139 return r;
140 }
141
142 ```
143
144 # vector bit extract
145
146 other way round: identical to RV bext
147
148 ```
149 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
150 {
151 uint_xlen_t r = 0;
152 for (int i = 0, j = 0; i < XLEN; i++)
153 if ((RB >> i) & 1) {
154 if ((RA >> i) & 1)
155 r |= uint_xlen_t(1) << j;
156 j++;
157 }
158 return r;
159 }
160 ```
161
162 # int min/max
163
164 signed and unsigned min/max for integer. this is sort-of partly synthesiseable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned. when the dest is also one of the sources and the mv fails due to the CR bittest failing this will only overwrite the dest where the src is greater (or less).
165
166 signed/unsigned min/max gives more flexibility.
167
168 ```
169 uint_xlen_t min(uint_xlen_t rs1, uint_xlen_t rs2)
170 { return (int_xlen_t)rs1 < (int_xlen_t)rs2 ? rs1 : rs2;
171 }
172 uint_xlen_t max(uint_xlen_t rs1, uint_xlen_t rs2)
173 { return (int_xlen_t)rs1 > (int_xlen_t)rs2 ? rs1 : rs2;
174 }
175 uint_xlen_t minu(uint_xlen_t rs1, uint_xlen_t rs2)
176 { return rs1 < rs2 ? rs1 : rs2;
177 }
178 uint_xlen_t maxu(uint_xlen_t rs1, uint_xlen_t rs2)
179 { return rs1 > rs2 ? rs1 : rs2;
180 }
181 ```
182
183
184 # ternary bitops
185
186 Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8bit immediate, or in another register
187
188 | 0.5|6.10|11.15|16.20| 21..25| 26..30 |31|
189 | -- | -- | --- | --- | ----- | -------- |--|
190 | NN | RT | RA | RB | im0-4 | im5-7 00 |Rc|
191
192 for i in range(64):
193 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
194 RT[i] = (imm & (1<<idx)) != 0
195
196 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
197
198 a 4 operand variant which becomes more along the lines of an FPGA:
199
200 | 0.5|6.10|11.15|16.20|21.25| 26...30 |31|
201 | -- | -- | --- | --- | --- | -------- |--|
202 | NN | RT | RA | RB | RC | mode 001 |Rc|
203
204 for i in range(64):
205 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
206 RT[i] = (RC & (1<<idx)) != 0
207
208 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul,
209 3 modes.
210
211 also, another possible variant involving swizzle and vec4:
212
213 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
214 | -- | -- | --- | ----- | ---- | ----- |--|
215 | NN | RT | RA | imm | mask | 101 |1 |
216
217 for i in range(8):
218 idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
219 res = (imm & (1<<idx)) != 0
220 for j in range(3):
221 if mask[j]: RT[i+j*8] = res
222
223 another mode selection would be CRs not Ints.
224
225 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
226 | -- | -- | --- | --- |- |-----|----- | -----|--|
227 | NN | BA | BB | BC |0 |imm | mask | 101 |0 |
228
229 for i in range(4):
230 if not mask[i] continue
231 idx = crregs[BA][i] << 2 |
232 crregs[BB][i] << 1 |
233 crregs[BC][i]
234 crregs[BA][i] = (imm & (1<<idx)) != 0
235
236 # bitmask set
237
238 based on RV bitmanip singlebit set, instruction format similar to shift
239 [[isa/fixedshift]]. bmext is actually covered already (shift-with-mask rldicl but only immediate version).
240 however bitmask-invert is not, and set/clr are not covered, although they can use the same Shift ALU.
241
242 bmext (RB) version is not the same as rldicl because bmext is a right shift by RC, where rldicl is a left rotate. for the immediate version this does not matter, so a bmexti is not required.
243 for bmrev, however, there is no direct equivalent, and consequently a bmrevi is required.
244
245 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31| name |
246 | -- | -- | --- | --- | --- | ------- |--| ----- |
247 | NN | RT | RA | RB | RC | mode 010 |Rc| bm* |
248 | NN | RT | RA | RB | RC | 0 1 111 |Rc| bmrev |
249 | NN | | | | | 1 1 111 |Rc| rsvd |
250
251 ```
/* Bitmask set: OR a contiguous field of ones, shifted up by RB, into RA.
 * NOTE(review): spec pseudocode — parameters and `mask` are untyped, and
 * mask = (2<<sh)-1 produces a field of sh+1 bits; confirm the intended
 * width encoding before implementing. */
uint_xlen_t bmset(RA, RB, sh)
{
    int shamt = RB & (XLEN - 1);
    mask = (2<<sh)-1;
    return RA | (mask << shamt);
}

/* Bitmask clear: AND out the same (sh+1)-bit field at position shamt. */
uint_xlen_t bmclr(RA, RB, sh)
{
    int shamt = RB & (XLEN - 1);
    mask = (2<<sh)-1;
    return RA & ~(mask << shamt);
}

/* Bitmask invert: XOR-toggle the field. */
uint_xlen_t bminv(RA, RB, sh)
{
    int shamt = RB & (XLEN - 1);
    mask = (2<<sh)-1;
    return RA ^ (mask << shamt);
}

/* Bitmask extract: right-shift RA by shamt and keep the low field
 * (a shift-with-mask, per the rldicl discussion in the prose above). */
uint_xlen_t bmext(RA, RB, sh)
{
    int shamt = RB & (XLEN - 1);
    mask = (2<<sh)-1;
    return mask & (RA >> shamt);
}
279 ```
280
281 bitmask extract with reverse
282
283 ```
284 msb = rb[5:0];
285 rev[0:msb] = ra[msb:0];
286 rt = ZE(rev[msb:0]);
287 ```
288
289 | 0.5|6.10|11.15|16.20|21.26| 27..30 |31| name |
290 | -- | -- | --- | --- | --- | ------- |--| ------ |
291 | NN | RT | RA | RB | sh | 0 111 |Rc| bmrevi |
292
293
294
295 # grev
296
297 based on RV bitmanip
298
299 ```
/* Generalised bit reverse on 64 bits: each bit of shamt enables one
 * butterfly stage that swaps adjacent groups of 2^stage bits.
 * grev64(x, 63) is a full bit reversal; grev64(x, 56) a byte swap. */
uint64_t grev64(uint64_t RA, uint64_t RB)
{
    /* even-position group masks for stages of width 1,2,4,8,16,32;
     * the odd-position mask of each stage is the complement */
    static const uint64_t even[6] = {
        0x5555555555555555ULL, 0x3333333333333333ULL,
        0x0F0F0F0F0F0F0F0FULL, 0x00FF00FF00FF00FFULL,
        0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL
    };
    uint64_t v = RA;
    int shamt = RB & 63;
    for (int stage = 0; stage < 6; stage++) {
        int dist = 1 << stage;
        if (shamt & dist)
            v = ((v & even[stage]) << dist) | ((v & ~even[stage]) >> dist);
    }
    return v;
}
318
319 ```
320
321 # shuffle / unshuffle
322
323 based on RV bitmanip
324
325 ```
/* Bit zip (shuffle) on a 32-bit value: interleaves progressively smaller
 * bit groups, outermost (8-bit) stage first.  shuffle32_stage is the
 * standard RV bitmanip helper (not shown in this file). */
uint32_t shfl32(uint32_t RA, uint32_t RB)
{
    uint32_t x = RA;
    int shamt = RB & 15;  /* one control bit per stage */
    if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
    if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
    if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
    if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
    return x;
}
/* Inverse of shfl32 (unzip): the same stages applied in the opposite
 * order, innermost (1-bit) stage first. */
uint32_t unshfl32(uint32_t RA, uint32_t RB)
{
    uint32_t x = RA;
    int shamt = RB & 15;  /* one control bit per stage */
    if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
    if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
    if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
    if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
    return x;
}
346
/* One butterfly stage of the 64-bit shuffle: bits under maskL move left
 * by N, bits under maskR move right by N, all other bits stay put. */
uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
{
    uint64_t kept       = src & ~(maskL | maskR);
    uint64_t moved_left = (src << N) & maskL;
    uint64_t moved_right = (src >> N) & maskR;
    return kept | moved_left | moved_right;
}
/* 64-bit zip: apply shuffle64_stage from the widest (16-bit) group down
 * to single bits, each stage gated by one of the 5 control bits. */
uint64_t shfl64(uint64_t RA, uint64_t RB)
{
    uint64_t x = RA;
    int shamt = RB & 31;
    if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
                                        0x00000000ffff0000LL, 16);
    if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
                                       0x0000ff000000ff00LL, 8);
    if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
                                       0x00f000f000f000f0LL, 4);
    if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
                                       0x0c0c0c0c0c0c0c0cLL, 2);
    if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
                                       0x2222222222222222LL, 1);
    return x;
}
/* Inverse of shfl64 (unzip): identical stages applied in the opposite
 * order, from single bits up to the 16-bit group. */
uint64_t unshfl64(uint64_t RA, uint64_t RB)
{
    uint64_t x = RA;
    int shamt = RB & 31;
    if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
                                       0x2222222222222222LL, 1);
    if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
                                       0x0c0c0c0c0c0c0c0cLL, 2);
    if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
                                       0x00f000f000f000f0LL, 4);
    if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
                                       0x0000ff000000ff00LL, 8);
    if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
                                        0x00000000ffff0000LL, 16);
    return x;
}
385 ```
386
387 # xperm
388
389 based on RV bitmanip
390
391 ```
/* Generalised crossbar permutation: RB holds a packed vector of indices,
 * each sz = 2^sz_log2 bits wide; output chunk i of the result is the RA
 * chunk selected by index chunk i of RB.  Indices whose bit position
 * falls outside the register produce a zero chunk. */
uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
{
    uint_xlen_t r = 0;
    uint_xlen_t sz = 1LL << sz_log2;           /* chunk width in bits */
    uint_xlen_t mask = (1LL << sz) - 1;        /* chunk extraction mask */
    for (int i = 0; i < XLEN; i += sz) {
        uint_xlen_t pos = ((RB >> i) & mask) << sz_log2; /* source bit offset */
        if (pos < XLEN)
            r |= ((RA >> pos) & mask) << i;
    }
    return r;
}
/* Width-specific forms: nibble, byte, halfword, word. */
uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
{ return xperm(RA, RB, 2); }
uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
{ return xperm(RA, RB, 3); }
uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
{ return xperm(RA, RB, 4); }
uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
{ return xperm(RA, RB, 5); }
412 ```
413
414 # gorc
415
416 based on RV bitmanip
417
418 ```
/* Generalised OR-combine on 32 bits: each stage selected by shamt ORs
 * every bit with its partner 2^stage positions away, so gorc32(x, 31)
 * broadcasts any set bit across the whole word. */
uint32_t gorc32(uint32_t RA, uint32_t RB)
{
    /* even-position group masks; odd-position mask is the complement */
    static const uint32_t even[5] = {
        0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF
    };
    uint32_t v = RA;
    int shamt = RB & 31;
    for (int stage = 0; stage < 5; stage++) {
        int dist = 1 << stage;
        if (shamt & dist)
            v |= ((v & even[stage]) << dist) | ((v & ~even[stage]) >> dist);
    }
    return v;
}
/* Generalised OR-combine on 64 bits: same staged structure as gorc32
 * with one extra 32-bit stage.  gorc64(x, 63) broadcasts any set bit
 * across the whole doubleword. */
uint64_t gorc64(uint64_t RA, uint64_t RB)
{
    /* even-position group masks; odd-position mask is the complement */
    static const uint64_t even[6] = {
        0x5555555555555555ULL, 0x3333333333333333ULL,
        0x0F0F0F0F0F0F0F0FULL, 0x00FF00FF00FF00FFULL,
        0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL
    };
    uint64_t v = RA;
    int shamt = RB & 63;
    for (int stage = 0; stage < 6; stage++) {
        int dist = 1 << stage;
        if (shamt & dist)
            v |= ((v & even[stage]) << dist) | ((v & ~even[stage]) >> dist);
    }
    return v;
}
448
449 ```
450
451 # cmix
452
453 based on RV bitmanip, covered by ternary bitops
454
455 ```
456 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
457 return (RA & RB) | (RC & ~RB);
458 }
459 ```
460
461 # carryless mul
462
463 based on RV bitmanip
464 see https://en.wikipedia.org/wiki/CLMUL_instruction_set
465
466 ```
/* Carry-less multiply (low half): XOR-accumulate shifted copies of RA
 * for each set bit of RB — polynomial multiplication over GF(2). */
uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t x = 0;
    for (int i = 0; i < XLEN; i++)
        if ((RB >> i) & 1)
            x ^= RA << i;
    return x;
}
/* High half of the 2*XLEN-bit carry-less product.  i starts at 1
 * because bit 0 of RB contributes nothing above bit XLEN-1. */
uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t x = 0;
    for (int i = 1; i < XLEN; i++)
        if ((RB >> i) & 1)
            x ^= RA >> (XLEN-i);
    return x;
}
/* Reversed carry-less multiply: clmulh shifted down by one more bit,
 * equal to the bit reversal of clmul on bit-reversed operands. */
uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
{
    uint_xlen_t x = 0;
    for (int i = 0; i < XLEN; i++)
        if ((RB >> i) & 1)
            x ^= RA >> (XLEN-i-1);
    return x;
}
491 ```
492 # Galois Field
493
494 ## Multiply
495
496 this requires 3 parameters and a "degree"
497
498 RT = GFMUL(RA, RB, gfdegree, modulo=RC)
499
500 realistically with the degree also needing to be an immediate it should be brought down to an overwrite version:
501
502 RS = GFMUL(RS, RA, gfdegree, modulo=RB)
503
504 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
505 | -- | -- | --- | --- | --- | ------- |--|
506 | NN | RS | RA | RB | deg | 00 011 |Rc|
507
508 where the SimpleV variant may override RS-as-src differently from RS-as-dest
509
510
511
512 ```
513 from functools import reduce
514
515 # constants used in the multGF2 function
516 mask1 = mask2 = polyred = None
517
def setGF2(degree, irPoly):
    """Configure the binary finite field GF(2^m)/g(x).

    degree: extension degree m of the field
    irPoly: integer whose bits are the coefficients of the
            irreducible polynomial g(x), most significant first
    """
    global mask1, mask2, polyred
    mask1 = 1 << degree           # the bit that triggers reduction
    mask2 = mask1 - 1             # mask of the m low-order result bits
    # g(x) with its leading term stripped: what x^deg(g) reduces to
    polyred = irPoly ^ (1 << (irPoly.bit_length() - 1))

def multGF2(p1, p2):
    """Multiply two polynomials in GF(2^m)/g(x) by shift-and-add over GF(2)."""
    product = 0
    a, b = p1, p2
    while b:
        if b & 1:
            product ^= a
        a <<= 1
        if a & mask1:             # reduce a's x^m term using g(x)
            a ^= polyred
        b >>= 1
    return product & mask2
544
545 if __name__ == "__main__":
546
547 # Define binary field GF(2^3)/x^3 + x + 1
548 setGF2(3, 0b1011)
549
550 # Evaluate the product (x^2 + x + 1)(x^2 + 1)
551 print("{:02x}".format(multGF2(0b111, 0b101)))
552
553 # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
554 # (used in the Advanced Encryption Standard-AES)
555 setGF2(8, 0b100011011)
556
557 # Evaluate the product (x^7)(x^7 + x + 1)
558 print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
559 ```
560 ## GF add
561
562 RS = GFADD(RS, RA|0, gfdegree, modulo=RB)
563
564 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
565 | -- | -- | --- | --- | --- | ------- |--|
566 | NN | RS | RA | RB | deg | 01 011 |Rc|
567
568 ## gf invert
569
570 ```
def gf_degree(a) :
    """Degree of the GF(2) polynomial a (index of its most significant
    set bit); returns 0 for a in {0, 1}."""
    deg = 0
    a >>= 1
    while a:
        a >>= 1
        deg += 1
    return deg
578
def gf_invert(a, mod=0x1B) :
    """Compute the multiplicative inverse of a in GF(2^8).

    Binary extended-Euclidean algorithm.  mod is the reduction
    polynomial with its x^8 term stripped (default 0x1B = the AES
    polynomial x^8 + x^4 + x^3 + x + 1 without the x^8 term).
    NOTE(review): hard-wired to 8-bit fields (the `- 8` and the
    `% 256` overflow emulation) — confirm before generalising.
    """
    v = mod
    g1 = 1     # running Bezout coefficient for a (becomes the inverse)
    g2 = 0     # running Bezout coefficient for v
    j = gf_degree(a) - 8

    while (a != 1) :
        if (j < 0) :
            # keep the higher-degree polynomial in a
            a, v = v, a
            g1, g2 = g2, g1
            j = -j

        # cancel the leading term of a
        a ^= v << j
        g1 ^= g2 << j

        a %= 256 # Emulating 8-bit overflow
        g1 %= 256 # Emulating 8-bit overflow

        j = gf_degree(a) - gf_degree(v)

    return g1
600 ```
601
602 ## GF mod
603
604 modulo arithmetic to ensure a polynomial is in range seems sensible to have.
605
606 # bitmatrix
607
608 ```
/* Transpose RA viewed as an 8x8 bit matrix.  Three applications of the
 * full 64-bit zip (shfl64 with shamt=31, i.e. all stages enabled)
 * perform the transpose. */
uint64_t bmatflip(uint64_t RA)
{
    uint64_t x = RA;
    x = shfl64(x, 31);
    x = shfl64(x, 31);
    x = shfl64(x, 31);
    return x;
}
/* 8x8 binary matrix multiply over GF(2): result bit (i/8, i%8) is the
 * parity (pcnt = population count) of row i/8 of RA ANDed with column
 * i%8 of RB.  RB is transposed first so its columns become
 * byte-addressable rows. */
uint64_t bmatxor(uint64_t RA, uint64_t RB)
{
    // transpose of RB
    uint64_t RBt = bmatflip(RB);
    uint8_t u[8]; // rows of RA
    uint8_t v[8]; // cols of RB
    for (int i = 0; i < 8; i++) {
        u[i] = RA >> (i*8);
        v[i] = RBt >> (i*8);
    }
    uint64_t x = 0;
    for (int i = 0; i < 64; i++) {
        if (pcnt(u[i / 8] & v[i % 8]) & 1)
            x |= 1LL << i;
    }
    return x;
}
/* 8x8 binary matrix multiply in the OR-AND (boolean) semiring: result
 * bit (i/8, i%8) is 1 iff row i/8 of RA shares any set bit with column
 * i%8 of RB.  Same structure as bmatxor but OR-reducing instead of
 * parity-reducing. */
uint64_t bmator(uint64_t RA, uint64_t RB)
{
    // transpose of RB
    uint64_t RBt = bmatflip(RB);
    uint8_t u[8]; // rows of RA
    uint8_t v[8]; // cols of RB
    for (int i = 0; i < 8; i++) {
        u[i] = RA >> (i*8);
        v[i] = RBt >> (i*8);
    }
    uint64_t x = 0;
    for (int i = 0; i < 64; i++) {
        if ((u[i / 8] & v[i % 8]) != 0)
            x |= 1LL << i;
    }
    return x;
}
651
652 ```