(no commit message)
[libreriscv.git] / openpower / sv / bitmanip.mdwn
1 [[!tag standards]]
2
3 # bitmanipulation
4
5 **DRAFT STATUS**
6
7 this extension amalgamates bitmanipulation primitives from many sources, including RISC-V bitmanip, Packed SIMD, AVX-512 and OpenPOWER VSX. Vectorisation and SIMD are removed: these are straight scalar (element) operations making them suitable for embedded applications.
8 Vectorisation Context is provided by [[openpower/sv]].
9
10 Scalar variants of bitmanip operations found in VSX are added so that VSX may be retired as "legacy" in the far future (10 to 20 years), and because VSX is hundreds of opcodes, requires 128 bit pathways, and is wholly unsuited to low power or embedded scenarios.
11
12 ternaryv is experimental and is the only operation that may be considered a "Packed SIMD". It is added as a variant of the already well-justified ternary operation (done in AVX512 as an immediate only) "because it looks fun". As it is based on the LUT4 concept it will allow accelerated emulation of FPGAs. Other vendors of ISAs are buying FPGA companies to achieve a similar objective.
13
14 general-purpose Galois Field operations are added so as to avoid huge opcode proliferation across many areas of Computer Science. however for convenience and also to avoid setup costs, some of the more common operations (clmul, crc32) are also added. The expectation is that these operations would all be covered by the same pipeline.
15
16 note that there are brownfield spaces below that could incorporate some of the set-before-first and other scalar operations listed in [[sv/vector_ops]],
17 the [[sv/av_opcodes]] and [[sv/cr_int_predication]]
18
19 # summary
20
21 minor opcode allocation
22
23 | 28.30 |31| name |
24 | ------ |--| --------- |
25 | 00 |Rc| ternaryi |
26 | 001 |Rc| ternary |
27 | 010 |Rc| bitmask |
28 | 011 |Rc| gf* |
29 | 101 |1 | ternaryv |
30 | 101 |0 | ternarycr |
31 | 110 |Rc| 1/2-op |
32 | 111 |Rc| 3-op |
33
34 1-op and variants
35
36 | dest | src1 | subop | op |
37 | ---- | ---- | ----- | -------- |
38 | RT | RA | .. | bmatflip |
39
40 2-op and variants
41
42 | dest | src1 | src2 | subop | op |
43 | ---- | ---- | ---- | ----- | -------- |
44 | RT | RA | RB | or | bmatflip |
45 | RT | RA | RB | xor | bmatflip |
46 | RT | RA | RB | bdep | dep/ext |
47 | RT | RA | RB | bext | dep/ext |
48 | RT | RA | RB | | grev |
49 | RT | RA | RB | | clmul* |
50 | RT | RA | RB | | gorc |
51 | RT | RA | RB | shuf | shuffle |
52 | RT | RA | RB | unshuf| shuffle |
53 | RT | RA | RB | width | xperm |
54 | RT | RA | RB | type | minmax |
55 | RT | RA | RB | | |
56 | RT | RA | RB | | |
57 | RT | RA | RB | | |
58
59 3 ops
60
61 * bitmask set/extract
62 * ternary bitops
63 * GF
64
65 | 0.5|6.10|11.15|16.20|21..25 | 26....30 |31| name |
66 | -- | -- | --- | --- | ----- | -------- |--| ------ |
67 | NN | RT | RA | RB | RC | mode 001 |Rc| ternary |
68 | NN | RT | RA | RB | im0-4 | im5-7 00 |Rc| ternaryi |
69 | NN | RS | RA | RB | RC | 00 011 |Rc| gfmul |
70 | NN | RS | RA | RB | RC | 01 011 |Rc| gfadd |
71 | NN | RT | RA | RB | deg | 10 011 |Rc| gfinv |
72 | NN | RS | RA | RB | deg | 11 011 |Rc| gfmuli |
73 | NN | RS | RA | RB | deg | 11 111 |Rc| gfaddi |
74
75 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31| name |
76 | -- | -- | --- | ----- | ---- | ----- |--| ------ |
77 | NN | RT | RA | imm | mask | 101 |1 | ternaryv |
78
79 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31| name |
80 | -- | -- | --- | --- |- |-----|----- | -----|--| -------|
81 | NN | BA | BB | BC |0 |imm | mask | 101 |0 | ternarycr |
82
83 ops
84
85 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24....30 |31| name |
86 | -- | -- | --- | --- | ----- | -- | -------- |--| ---- |
87 | NN | RA | RB | | | 0 | 0000 110 |Rc| rsvd |
88 | NN | RA | RB | RC | itype | 1 | 0000 110 |Rc| xperm |
89 | NN | RA | RB | RC | itype | 0 | 0100 110 |Rc| minmax |
90 | NN | RA | RB | | | 1 | 0100 110 |Rc| rsvd |
91 | NN | RA | RB | sh | itype | SH | 1000 110 |Rc| bmopsi |
92 | NN | RA | RB | | | | 1100 110 |Rc| rsvd |
93 | NN | RA | RB | | | | 1100 110 |Rc| rsvd |
94 | NN | RA | RB | | | | 1100 110 |Rc| rsvd |
95 | NN | RA | RB | | | | 1100 110 |Rc| rsvd |
96 | NN | RA | RB | | | 0 | 0001 110 |Rc| rsvd |
97 | NN | RA | RB | | | 0 | 0101 110 |Rc| rsvd |
98 | NN | RA | RB | RC | 00 | 0 | 0010 110 |Rc| gorc |
99 | NN | RA | RB | sh | 00 | SH | 1010 110 |Rc| gorci |
100 | NN | RA | RB | RC | 00 | 0 | 0110 110 |Rc| gorcw |
101 | NN | RA | RB | sh | 00 | 0 | 1110 110 |Rc| gorcwi |
102 | NN | RA | RB | RC | 00 | 1 | 1110 110 |Rc| bmator |
103 | NN | RA | RB | RC | 01 | 0 | 0010 110 |Rc| grev |
104 | NN | RA | RB | RC | 01 | 1 | 0010 110 |Rc| clmul |
105 | NN | RA | RB | sh | 01 | SH | 1010 110 |Rc| grevi |
106 | NN | RA | RB | RC | 01 | 0 | 0110 110 |Rc| grevw |
107 | NN | RA | RB | sh | 01 | 0 | 1110 110 |Rc| grevwi |
108 | NN | RA | RB | RC | 01 | 1 | 1110 110 |Rc| bmatxor |
109 | NN | RA | RB | RC | 10 | 0 | 0010 110 |Rc| shfl |
110 | NN | RA | RB | sh | 10 | SH | 1010 110 |Rc| shfli |
111 | NN | RA | RB | RC | 10 | 0 | 0110 110 |Rc| shflw |
112 | NN | RA | RB | RC | 10 | 0 | 1110 110 |Rc| bdep |
113 | NN | RA | RB | RC | 10 | 1 | 1110 110 |Rc| bext |
114 | NN | RA | RB | RC | 11 | 0 | 1110 110 |Rc| clmulr |
115 | NN | RA | RB | RC | 11 | 1 | 1110 110 |Rc| clmulh |
116 | NN | RA | RB | | | | NN11 110 |Rc| rsvd |
117
118 # count leading/trailing zeros with mask
119
120 in v3.1 p105
121
122 ```
count ← 0
do i = 0 to 63
    if ((RB)i = 1) then do
        if ((RS)i = 1) then break
        count ← count + 1
    end
RA ← EXTZ64(count)
127 ```
128
129 # bit to byte permute
130
131 similar to matrix permute in RV bitmanip, which has XOR and OR variants
132
133 do j = 0 to 7
134 do k = 0 to 7
135 b = VSR[VRB+32].dword[i].byte[k].bit[j]
136 VSR[VRT+32].dword[i].byte[j].bit[k] = b
137
138 # bit deposit
139
140 vpdepd VRT,VRA,VRB, identical to RV bitmanip bdep, found already in v3.1 p106
141
m ← 0
k ← 0
do while (m < 64)
    if VSR[VRB+32].dword[i].bit[63-m] = 1 then do
        result = VSR[VRA+32].dword[i].bit[63-k]
        VSR[VRT+32].dword[i].bit[63-m] = result
        k ← k + 1
    end
    m ← m + 1
148
149 ```
150
151 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
152 {
153 uint_xlen_t r = 0;
154 for (int i = 0, j = 0; i < XLEN; i++)
155 if ((RB >> i) & 1) {
156 if ((RA >> j) & 1)
157 r |= uint_xlen_t(1) << i;
158 j++;
159 }
160 return r;
161 }
162
163 ```
164
165 # bit extract
166
167 other way round: identical to RV bext, found in v3.1 p196
168
169 ```
170 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
171 {
172 uint_xlen_t r = 0;
173 for (int i = 0, j = 0; i < XLEN; i++)
174 if ((RB >> i) & 1) {
175 if ((RA >> i) & 1)
176 r |= uint_xlen_t(1) << j;
177 j++;
178 }
179 return r;
180 }
181 ```
182
183 # centrifuge
184
185 found in v3.1 p106
186
187 ```
ptr0 ← 0
ptr1 ← 0
do i = 0 to 63
    if ((RB)i = 0) then do
        result[ptr0] ← (RS)i
        ptr0 ← ptr0 + 1
    end
    if ((RB)63-i = 1) then do
        result[63-ptr1] ← (RS)63-i
        ptr1 ← ptr1 + 1
    end
RA ← result
200 ```
201
202 # int min/max
203
204 signed and unsigned min/max for integer. this is sort-of partly synthesiseable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned. when the dest is also one of the sources and the mv fails due to the CR bittest failing, this will only overwrite the dest where the src is greater (or less).
205
206 signed/unsigned min/max gives more flexibility.
207
208 ```
209 uint_xlen_t min(uint_xlen_t rs1, uint_xlen_t rs2)
210 { return (int_xlen_t)rs1 < (int_xlen_t)rs2 ? rs1 : rs2;
211 }
212 uint_xlen_t max(uint_xlen_t rs1, uint_xlen_t rs2)
213 { return (int_xlen_t)rs1 > (int_xlen_t)rs2 ? rs1 : rs2;
214 }
215 uint_xlen_t minu(uint_xlen_t rs1, uint_xlen_t rs2)
216 { return rs1 < rs2 ? rs1 : rs2;
217 }
218 uint_xlen_t maxu(uint_xlen_t rs1, uint_xlen_t rs2)
219 { return rs1 > rs2 ? rs1 : rs2;
220 }
221 ```
222
223
224 # ternary bitops
225
226 Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8bit immediate, or in another register
227
228 | 0.5|6.10|11.15|16.20| 21..25| 26..30 |31|
229 | -- | -- | --- | --- | ----- | -------- |--|
230 | NN | RT | RA | RB | im0-4 | im5-7 00 |Rc|
231
232 for i in range(64):
233 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
234 RT[i] = (imm & (1<<idx)) != 0
235
236 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
237
238 a 4 operand variant which becomes more along the lines of an FPGA:
239
240 | 0.5|6.10|11.15|16.20|21.25| 26...30 |31|
241 | -- | -- | --- | --- | --- | -------- |--|
242 | NN | RT | RA | RB | RC | mode 001 |Rc|
243
244 for i in range(64):
245 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
246 RT[i] = (RC & (1<<idx)) != 0
247
248 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul,
249 3 modes.
250
251 also, another possible variant involving swizzle and vec4:
252
253 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
254 | -- | -- | --- | ----- | ---- | ----- |--|
255 | NN | RT | RA | imm | mask | 101 |1 |
256
257 for i in range(8):
258 idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
259 res = (imm & (1<<idx)) != 0
260 for j in range(3):
261 if mask[j]: RT[i+j*8] = res
262
263 another mode selection would be CRs not Ints.
264
265 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
266 | -- | -- | --- | --- |- |-----|----- | -----|--|
267 | NN | BA | BB | BC |0 |imm | mask | 101 |0 |
268
269 for i in range(4):
270 if not mask[i]: continue
271 idx = crregs[BA][i] << 2 |
272 crregs[BB][i] << 1 |
273 crregs[BC][i]
274 crregs[BA][i] = (imm & (1<<idx)) != 0
275
276 # bitmask set
277
278 based on RV bitmanip singlebit set, instruction format similar to shift
279 [[isa/fixedshift]]. bmext is actually covered already (shift-with-mask rldicl but only immediate version).
280 however bitmask-invert is not, and set/clr are not covered, although they can use the same Shift ALU.
281
282 bmext (RB) version is not the same as rldicl because bmext is a right shift by RC, where rldicl is a left rotate. for the immediate version this does not matter, so a bmexti is not required.
283 bmrev however there is no direct equivalent and consequently a bmrevi is required.
284
285 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31| name |
286 | -- | -- | --- | --- | --- | ------- |--| ----- |
287 | NN | RT | RA | RB | RC | mode 010 |Rc| bm* |
288 | NN | RT | RA | RB | RC | 0 1 111 |Rc| bmrev |
289
290
291 ```
292 uint_xlen_t bmset(RA, RB, sh)
293 {
294 int shamt = RB & (XLEN - 1);
295 mask = (2<<sh)-1;
296 return RA | (mask << shamt);
297 }
298
299 uint_xlen_t bmclr(RA, RB, sh)
300 {
301 int shamt = RB & (XLEN - 1);
302 mask = (2<<sh)-1;
303 return RA & ~(mask << shamt);
304 }
305
306 uint_xlen_t bminv(RA, RB, sh)
307 {
308 int shamt = RB & (XLEN - 1);
309 mask = (2<<sh)-1;
310 return RA ^ (mask << shamt);
311 }
312
313 uint_xlen_t bmext(RA, RB, sh)
314 {
315 int shamt = RB & (XLEN - 1);
316 mask = (2<<sh)-1;
317 return mask & (RA >> shamt);
318 }
319 ```
320
321 bitmask extract with reverse. can be done by bitinverting all of RA and getting bits of RA from the opposite end.
322
323 ```
324 msb = rb[5:0];
325 rev[0:msb] = ra[msb:0];
326 rt = ZE(rev[msb:0]);
327
328 uint_xlen_t bmextrev(RA, RB, sh)
329 {
330 int shamt = (RB & (XLEN - 1));
331 shamt = (XLEN-1)-shamt; # shift other end
332 bra = bitreverse(RA) # swap LSB-MSB
333 mask = (2<<sh)-1;
334 return mask & (bra >> shamt);
335 }
336 ```
337
338 | 0.5|6.10|11.15|16.20|21.26| 27..30 |31| name |
339 | -- | -- | --- | --- | --- | ------- |--| ------ |
340 | NN | RT | RA | RB | sh | 0 111 |Rc| bmrevi |
341
342
343
344 # grev
345
346 based on RV bitmanip
347
348 ```
349 uint64_t grev64(uint64_t RA, uint64_t RB)
350 {
351 uint64_t x = RA;
352 int shamt = RB & 63;
353 if (shamt & 1) x = ((x & 0x5555555555555555LL) << 1) |
354 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
355 if (shamt & 2) x = ((x & 0x3333333333333333LL) << 2) |
356 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
357 if (shamt & 4) x = ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
358 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
359 if (shamt & 8) x = ((x & 0x00FF00FF00FF00FFLL) << 8) |
360 ((x & 0xFF00FF00FF00FF00LL) >> 8);
361 if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
362 ((x & 0xFFFF0000FFFF0000LL) >> 16);
363 if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
364 ((x & 0xFFFFFFFF00000000LL) >> 32);
365 return x;
366 }
367
368 ```
369
370 # shuffle / unshuffle
371
372 based on RV bitmanip
373
374 ```
375 uint32_t shfl32(uint32_t RA, uint32_t RB)
376 {
377 uint32_t x = RA;
378 int shamt = RB & 15;
379 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
380 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
381 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
382 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
383 return x;
384 }
385 uint32_t unshfl32(uint32_t RA, uint32_t RB)
386 {
387 uint32_t x = RA;
388 int shamt = RB & 15;
389 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
390 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
391 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
392 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
393 return x;
394 }
395
396 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
397 {
398 uint64_t x = src & ~(maskL | maskR);
399 x |= ((src << N) & maskL) | ((src >> N) & maskR);
400 return x;
401 }
402 uint64_t shfl64(uint64_t RA, uint64_t RB)
403 {
404 uint64_t x = RA;
405 int shamt = RB & 31;
406 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
407 0x00000000ffff0000LL, 16);
408 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
409 0x0000ff000000ff00LL, 8);
410 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
411 0x00f000f000f000f0LL, 4);
412 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
413 0x0c0c0c0c0c0c0c0cLL, 2);
414 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
415 0x2222222222222222LL, 1);
416 return x;
417 }
418 uint64_t unshfl64(uint64_t RA, uint64_t RB)
419 {
420 uint64_t x = RA;
421 int shamt = RB & 31;
422 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
423 0x2222222222222222LL, 1);
424 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
425 0x0c0c0c0c0c0c0c0cLL, 2);
426 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
427 0x00f000f000f000f0LL, 4);
428 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
429 0x0000ff000000ff00LL, 8);
430 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
431 0x00000000ffff0000LL, 16);
432 return x;
433 }
434 ```
435
436 # xperm
437
438 based on RV bitmanip
439
440 ```
441 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
442 {
443 uint_xlen_t r = 0;
444 uint_xlen_t sz = 1LL << sz_log2;
445 uint_xlen_t mask = (1LL << sz) - 1;
446 for (int i = 0; i < XLEN; i += sz) {
447 uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
448 if (pos < XLEN)
449 r |= ((RA >> pos) & mask) << i;
450 }
451 return r;
452 }
453 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
454 { return xperm(RA, RB, 2); }
455 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
456 { return xperm(RA, RB, 3); }
457 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
458 { return xperm(RA, RB, 4); }
459 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
460 { return xperm(RA, RB, 5); }
461 ```
462
463 # gorc
464
465 based on RV bitmanip
466
467 ```
468 uint32_t gorc32(uint32_t RA, uint32_t RB)
469 {
470 uint32_t x = RA;
471 int shamt = RB & 31;
472 if (shamt & 1) x |= ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);
473 if (shamt & 2) x |= ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);
474 if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);
475 if (shamt & 8) x |= ((x & 0x00FF00FF) << 8) | ((x & 0xFF00FF00) >> 8);
476 if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) | ((x & 0xFFFF0000) >> 16);
477 return x;
478 }
479 uint64_t gorc64(uint64_t RA, uint64_t RB)
480 {
481 uint64_t x = RA;
482 int shamt = RB & 63;
483 if (shamt & 1) x |= ((x & 0x5555555555555555LL) << 1) |
484 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
485 if (shamt & 2) x |= ((x & 0x3333333333333333LL) << 2) |
486 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
487 if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
488 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
489 if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL) << 8) |
490 ((x & 0xFF00FF00FF00FF00LL) >> 8);
491 if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL) << 16) |
492 ((x & 0xFFFF0000FFFF0000LL) >> 16);
493 if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL) << 32) |
494 ((x & 0xFFFFFFFF00000000LL) >> 32);
495 return x;
496 }
497
498 ```
499
500 # cmix
501
502 based on RV bitmanip, covered by ternary bitops
503
504 ```
505 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
506 return (RA & RB) | (RC & ~RB);
507 }
508 ```
509
510 # carryless mul
511
512 based on RV bitmanip
513 see https://en.wikipedia.org/wiki/CLMUL_instruction_set
514
515 ```
516 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
517 {
518 uint_xlen_t x = 0;
519 for (int i = 0; i < XLEN; i++)
520 if ((RB >> i) & 1)
521 x ^= RA << i;
522 return x;
523 }
524 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
525 {
526 uint_xlen_t x = 0;
527 for (int i = 1; i < XLEN; i++)
528 if ((RB >> i) & 1)
529 x ^= RA >> (XLEN-i);
530 return x;
531 }
532 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
533 {
534 uint_xlen_t x = 0;
535 for (int i = 0; i < XLEN; i++)
536 if ((RB >> i) & 1)
537 x ^= RA >> (XLEN-i-1);
538 return x;
539 }
540 ```
541 # Galois Field
542
543 see <https://courses.csail.mit.edu/6.857/2016/files/ffield.py>
544
545 ## Multiply
546
547 this requires 3 parameters and a "degree"
548
549 RT = GFMUL(RA, RB, gfdegree, modulo=RC)
550
551 realistically with the degree also needing to be an immediate it should be brought down to an overwrite version:
552
553 RS = GFMUL(RS, RA, gfdegree, modulo=RB)
554 RS = GFMUL(RS, RA, gfdegree=RC, modulo=RB)
555
556 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
557 | -- | -- | --- | --- | --- | ------- |--|
558 | NN | RS | RA | RB | deg | 00 011 |Rc|
559 | NN | RS | RA | RB | RC | 11 011 |Rc|
560
561 where the SimpleV variant may override RS-as-src differently from RS-as-dest
562
563
564
565 ```
566 from functools import reduce
567
568 # constants used in the multGF2 function
569 mask1 = mask2 = polyred = None
570
571 def setGF2(degree, irPoly):
572 """Define parameters of binary finite field GF(2^m)/g(x)
573 - degree: extension degree of binary field
574 - irPoly: coefficients of irreducible polynomial g(x)
575 """
576 def i2P(sInt):
577 """Convert an integer into a polynomial"""
578 return [(sInt >> i) & 1
579 for i in reversed(range(sInt.bit_length()))]
580
581 global mask1, mask2, polyred
582 mask1 = mask2 = 1 << degree
583 mask2 -= 1
584 polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])
585
586 def multGF2(p1, p2):
587 """Multiply two polynomials in GF(2^m)/g(x)"""
588 p = 0
589 while p2:
590 if p2 & 1:
591 p ^= p1
592 p1 <<= 1
593 if p1 & mask1:
594 p1 ^= polyred
595 p2 >>= 1
596 return p & mask2
597
598 if __name__ == "__main__":
599
600 # Define binary field GF(2^3)/x^3 + x + 1
601 setGF2(3, 0b1011)
602
603 # Evaluate the product (x^2 + x + 1)(x^2 + 1)
604 print("{:02x}".format(multGF2(0b111, 0b101)))
605
606 # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
607 # (used in the Advanced Encryption Standard-AES)
608 setGF2(8, 0b100011011)
609
610 # Evaluate the product (x^7)(x^7 + x + 1)
611 print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
612 ```
613 ## GF add
614
615 RS = GFADDI(RS, RA|0, gfdegree, modulo=RB)
616 RS = GFADD(RS, RA|0, gfdegree=RC, modulo=RB)
617
618 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31| name |
619 | -- | -- | --- | --- | --- | ------- |--| ----- |
620 | NN | RS | RA | RB | deg | 0 1 011 |Rc| gfaddi |
621 | NN | RS | RA | RB | RC | 1 1 111 |Rc| gfadd |
622
623 GFMOD is a pseudo-op where RA=0
624
625 ## gf invert
626
627 ```
628 def gf_degree(a) :
629 res = 0
630 a >>= 1
631 while (a != 0) :
632 a >>= 1;
633 res += 1;
634 return res
635
636 def gf_invert(a, mod=0x1B) :
637 v = mod
638 g1 = 1
639 g2 = 0
640 j = gf_degree(a) - 8
641
642 while (a != 1) :
643 if (j < 0) :
644 a, v = v, a
645 g1, g2 = g2, g1
646 j = -j
647
648 a ^= v << j
649 g1 ^= g2 << j
650
651 a %= 256 # Emulating 8-bit overflow
652 g1 %= 256 # Emulating 8-bit overflow
653
654 j = gf_degree(a) - gf_degree(v)
655
656 return g1
657 ```
658
659 # bitmatrix
660
661 ```
662 uint64_t bmatflip(uint64_t RA)
663 {
664 uint64_t x = RA;
665 x = shfl64(x, 31);
666 x = shfl64(x, 31);
667 x = shfl64(x, 31);
668 return x;
669 }
670 uint64_t bmatxor(uint64_t RA, uint64_t RB)
671 {
672 // transpose of RB
673 uint64_t RBt = bmatflip(RB);
674 uint8_t u[8]; // rows of RA
675 uint8_t v[8]; // cols of RB
676 for (int i = 0; i < 8; i++) {
677 u[i] = RA >> (i*8);
678 v[i] = RBt >> (i*8);
679 }
680 uint64_t x = 0;
681 for (int i = 0; i < 64; i++) {
682 if (pcnt(u[i / 8] & v[i % 8]) & 1)
683 x |= 1LL << i;
684 }
685 return x;
686 }
687 uint64_t bmator(uint64_t RA, uint64_t RB)
688 {
689 // transpose of RB
690 uint64_t RBt = bmatflip(RB);
691 uint8_t u[8]; // rows of RA
692 uint8_t v[8]; // cols of RB
693 for (int i = 0; i < 8; i++) {
694 u[i] = RA >> (i*8);
695 v[i] = RBt >> (i*8);
696 }
697 uint64_t x = 0;
698 for (int i = 0; i < 64; i++) {
699 if ((u[i / 8] & v[i % 8]) != 0)
700 x |= 1LL << i;
701 }
702 return x;
703 }
704
705 ```