(no commit message)
[libreriscv.git] / openpower / sv / bitmanip.mdwn
1 [[!tag standards]]
2
3 # Implementation Log
4
5 * ternlogi <https://bugs.libre-soc.org/show_bug.cgi?id=745>
6 * grev <https://bugs.libre-soc.org/show_bug.cgi?id=755>
7 * remove Rc=1 from ternlog due to conflicts in encoding as well
8 as saving space <https://bugs.libre-soc.org/show_bug.cgi?id=753#c5>
9 * GF2^M <https://bugs.libre-soc.org/show_bug.cgi?id=782>
10
11 # bitmanipulation
12
13 **DRAFT STATUS**
14
15 pseudocode: <https://libre-soc.org/openpower/isa/bitmanip/>
16
17 this extension amalgamates bitmanipulation primitives from many sources, including RISC-V bitmanip, Packed SIMD, AVX-512 and OpenPOWER VSX. Vectorisation and SIMD are removed: these are straight scalar (element) operations making them suitable for embedded applications.
18 Vectorisation Context is provided by [[openpower/sv]].
19
20 When combined with SV, scalar variants of bitmanip operations found in VSX are added so that VSX may be retired as "legacy" in the far future (10 to 20 years). Also, VSX is hundreds of opcodes, requires 128 bit pathways, and is wholly unsuited to low power or embedded scenarios.
21
22 ternlogv is experimental and is the only operation that may be considered a "Packed SIMD". It is added as a variant of the already well-justified ternlog operation (done in AVX512 as an immediate only) "because it looks fun". As it is based on the LUT4 concept it will allow accelerated emulation of FPGAs. Other vendors of ISAs are buying FPGA companies to achieve similar objectives.
23
24 general-purpose Galois Field 2^M operations are added so as to avoid huge custom opcode proliferation across many areas of Computer Science. however for convenience and also to avoid setup costs, some of the more common operations (clmul, crc32) are also added. The expectation is that these operations would all be covered by the same pipeline.
25
26 note that there are brownfield spaces below that could incorporate some of the set-before-first and other scalar operations listed in [[sv/vector_ops]], and
27 the [[sv/av_opcodes]] as well as [[sv/setvl]]
28
29 Useful resource:
30
31 * <https://en.wikiversity.org/wiki/Reed%E2%80%93Solomon_codes_for_coders>
32 * <https://maths-people.anu.edu.au/~brent/pd/rpb232tr.pdf>
33
34 # summary
35
36 minor opcode allocation
37
38 | 28.30 |31| name |
39 | ------ |--| --------- |
40 | -00 |0 | ternlogi |
41 | -00 |1 | grevlog |
42 | -01 | | grevlogi |
43 | 010 |Rc| bitmask |
44 | 011 |0 | gfbmadd* |
45 | 011 |1 | clmadd* |
46 | 110 |Rc| 1/2-op |
47 | 111 |1 | ternlogv |
48 | 111 |0 | ternlogcr |
49
50 1-op and variants
51
52 | dest | src1 | subop | op |
53 | ---- | ---- | ----- | -------- |
54 | RT | RA | .. | bmatflip |
55
56 2-op and variants
57
58 | dest | src1 | src2 | subop | op |
59 | ---- | ---- | ---- | ----- | -------- |
60 | RT | RA | RB | or | bmatflip |
61 | RT | RA | RB | xor | bmatflip |
62 | RT | RA | RB | | grev |
63 | RT | RA | RB | | clmul* |
64 | RT | RA | RB | | gorc |
65 | RT | RA | RB | shuf | shuffle |
66 | RT | RA | RB | unshuf| shuffle |
67 | RT | RA | RB | width | xperm |
68 | RT | RA | RB | type | minmax |
69 | RT | RA | RB | | av abs avgadd |
70 | RT | RA | RB | type | vmask ops |
71 | RT | RA | RB | | |
72
73 3 ops
74
75 * grevlog
76 * ternlog bitops
77 * GF mul-add
78 * bitmask-reverse
79
80 TODO: convert all instructions to use RT and not RS
81
82 | 0.5|6.10|11.15|16.20 |21..25 | 26....30 |31| name |
83 | -- | -- | --- | --- | ----- | -------- |--| ------ |
84 | NN | RT | RA | RB | im0-4 | im5-7 00 |0 | ternlogi |
85 | NN | RT | RA | RB | im0-4 | im5-7 00 |1 | grevlog |
86 | NN | RT | RA | s0-4 | im0-4 | im5-7 01 |s5| grevlogi |
87 | NN | RS | RA | RB | RC | 00 011 |0 | gfbmadd |
88 | NN | RS | RA | RB | RC | 10 011 |0 | gfbmaddsub |
89 | NN | RS | RA | RB | RC | 00 011 |1 | clmadd |
90 | NN | RS | RA | RB | RC | 10 011 |1 | clmaddsub |
91 | NN | RT | RA | RB | sh0-4 | sh5 1 011 |Rc| bmrevi |
92
93 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31| name |
94 | -- | -- | --- | ----- | ---- | ----- |--| ------ |
95 | NN | RT | RA | imm | mask | 111 |1 | ternlogv |
96
97 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31| name |
98 | -- | -- | --- | --- |- |-----|----- | -----|--| -------|
99 | NN | BA | BB | BC |0 |imm | mask | 111 |0 | ternlogcr |
100
101 ops (note that av avg and abs as well as vec scalar mask
102 are included here)
103
104 TODO: convert from RA, RB, and RC to correct field names of RT, RA, and RB, and
105 double check that instructions didn't need 3 inputs.
106
107 | 0.5|6.10|11.15|16.20| 21 | 22.23 | 24....30 |31| name |
108 | -- | -- | --- | --- | -- | ----- | -------- |--| ---- |
109 | NN | RA | RB | | 0 | | 0000 110 |Rc| rsvd |
110 | NN | RA | RB | RC | 1 | itype | 0000 110 |Rc| xperm |
111 | NN | RA | RB | RC | 0 | itype | 0100 110 |Rc| minmax |
112 | NN | RA | RB | RC | 1 | 00 | 0100 110 |Rc| av avgadd |
113 | NN | RA | RB | RC | 1 | 01 | 0100 110 |Rc| av abs |
114 | NN | RA | RB | | 1 | 10 | 0100 110 |Rc| rsvd |
115 | NN | RA | RB | | 1 | 11 | 0100 110 |Rc| rsvd |
116 | NN | RA | RB | sh | SH | itype | 1000 110 |Rc| bmopsi |
117 | NN | RA | RB | | | | 1100 110 |Rc| rsvd |
118 | NN | RT | RA | RB | 1 | 00 | 0001 110 |Rc| cldiv |
119 | NN | RT | RA | RB | 1 | 01 | 0001 110 |Rc| clmod |
120 | NN | RT | RA | RB | 1 | 10 | 0001 110 |Rc| |
121 | NN | RT | RB | RB | 1 | 11 | 0001 110 |Rc| clinv |
122 | NN | RA | RB | RC | 0 | 00 | 0001 110 |Rc| vec sbfm |
123 | NN | RA | RB | RC | 0 | 01 | 0001 110 |Rc| vec sofm |
124 | NN | RA | RB | RC | 0 | 10 | 0001 110 |Rc| vec sifm |
125 | NN | RA | RB | RC | 0 | 11 | 0001 110 |Rc| vec cprop |
126 | NN | RA | RB | | 0 | | 0101 110 |Rc| rsvd |
127 | NN | RA | RB | RC | 0 | 00 | 0010 110 |Rc| gorc |
128 | NN | RA | RB | sh | SH | 00 | 1010 110 |Rc| gorci |
129 | NN | RA | RB | RC | 0 | 00 | 0110 110 |Rc| gorcw |
130 | NN | RA | RB | sh | 0 | 00 | 1110 110 |Rc| gorcwi |
131 | NN | RA | RB | RC | 1 | 00 | 1110 110 |Rc| bmator |
132 | NN | RA | RB | RC | 0 | 01 | 0010 110 |Rc| grev |
133 | NN | RA | RB | RC | 1 | 01 | 0010 110 |Rc| clmul |
134 | NN | RA | RB | sh | SH | 01 | 1010 110 |Rc| grevi |
135 | NN | RA | RB | RC | 0 | 01 | 0110 110 |Rc| grevw |
136 | NN | RA | RB | sh | 0 | 01 | 1110 110 |Rc| grevwi |
137 | NN | RA | RB | RC | 1 | 01 | 1110 110 |Rc| bmatxor |
138 | NN | RA | RB | RC | 0 | 10 | 0010 110 |Rc| shfl |
139 | NN | RA | RB | sh | SH | 10 | 1010 110 |Rc| shfli |
140 | NN | RA | RB | RC | 0 | 10 | 0110 110 |Rc| shflw |
141 | NN | RA | RB | RC | | 10 | 1110 110 |Rc| rsvd |
142 | NN | RA | RB | RC | 0 | 11 | 1110 110 |Rc| clmulr |
143 | NN | RA | RB | RC | 1 | 11 | 1110 110 |Rc| clmulh |
144 | NN | | | | | | --11 110 |Rc| setvl |
145
146 # bit to byte permute
147
148 similar to matrix permute in RV bitmanip, which has XOR and OR variants
149
150 do j = 0 to 7
151 do k = 0 to 7
152 b = VSR[VRB+32].dword[i].byte[k].bit[j]
153 VSR[VRT+32].dword[i].byte[j].bit[k] = b
154
155 # int min/max
156
157 signed and unsigned min/max for integer. this is sort-of partly synthesiseable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned. when the dest is also one of the sources and the mv fails due to the CR bittest failing this will only overwrite the dest where the src is greater (or less).
158
159 signed/unsigned min/max gives more flexibility.
160
161 ```
162 uint_xlen_t min(uint_xlen_t rs1, uint_xlen_t rs2)
163 { return (int_xlen_t)rs1 < (int_xlen_t)rs2 ? rs1 : rs2;
164 }
165 uint_xlen_t max(uint_xlen_t rs1, uint_xlen_t rs2)
166 { return (int_xlen_t)rs1 > (int_xlen_t)rs2 ? rs1 : rs2;
167 }
168 uint_xlen_t minu(uint_xlen_t rs1, uint_xlen_t rs2)
169 { return rs1 < rs2 ? rs1 : rs2;
170 }
171 uint_xlen_t maxu(uint_xlen_t rs1, uint_xlen_t rs2)
172 { return rs1 > rs2 ? rs1 : rs2;
173 }
174 ```
175
176
177 # ternlog bitops
178
179 Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8bit immediate, or in another register.
180
181 Like the x86 AVX512F [vpternlogd/vpternlogq](https://www.felixcloutier.com/x86/vpternlogd:vpternlogq) instructions.
182
183 ## ternlogi
184
185 | 0.5|6.10|11.15|16.20| 21..25| 26..30 |31|
186 | -- | -- | --- | --- | ----- | -------- |--|
187 | NN | RT | RA | RB | im0-4 | im5-7 00 |0 |
188
189 lut3(imm, a, b, c):
190 idx = c << 2 | b << 1 | a
191 return imm[idx] # idx by LSB0 order
192
193 for i in range(64):
194 RT[i] = lut3(imm, RB[i], RA[i], RT[i])
195
196 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
197
198 ## ternlog
199
200 a 4 operand variant which becomes more along the lines of an FPGA:
201
202 | 0.5|6.10|11.15|16.20|21.25| 26...30 |31|
203 | -- | -- | --- | --- | --- | -------- |--|
204 | NN | RT | RA | RB | RC | mode 100 |1 |
205
206 for i in range(64):
207 idx = RT[i] << 2 | RA[i] << 1 | RB[i]
208 RT[i] = (RC & (1<<idx)) != 0
209
210 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul,
211 3 modes.
212
213 ## ternlogv
214
215 also, another possible variant involving swizzle and vec4:
216
217 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
218 | -- | -- | --- | ----- | ---- | ----- |--|
219 | NN | RT | RA | imm | mask | 111 |1 |
220
221 for i in range(8):
222 idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
223 res = (imm & (1<<idx)) != 0
224 for j in range(3):
225 if mask[j]: RT[i+j*8] = res
226
227 ## ternlogcr
228
229 another mode selection would be CRs not Ints.
230
231 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
232 | -- | -- | --- | --- |- |-----|----- | -----|--|
233 | NN | BA | BB | BC |0 |imm | mask | 111 |0 |
234
235 for i in range(4):
236 if not mask[i] continue
237 idx = crregs[BA][i] << 2 |
238 crregs[BB][i] << 1 |
239 crregs[BC][i]
240 crregs[BA][i] = (imm & (1<<idx)) != 0
241
242 ## cmix
243
244 based on RV bitmanip, covered by ternlog bitops
245
246 ```
247 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
248 return (RA & RB) | (RC & ~RB);
249 }
250 ```
251
252
253 # bitmask set
254
255 based on RV bitmanip singlebit set, instruction format similar to shift
256 [[isa/fixedshift]]. bmext is actually covered already (shift-with-mask rldicl but only immediate version).
257 however bitmask-invert is not, and set/clr are not covered, although they can use the same Shift ALU.
258
259 bmext (RB) version is not the same as rldicl because bmext is a right shift by RC, where rldicl is a left rotate. for the immediate version this does not matter, so a bmexti is not required.
260 for bmrev, however, there is no direct equivalent and consequently a bmrevi is required.
261
262 bmset (register for mask amount) is particularly useful for creating
263 predicate masks where the length is a dynamic runtime quantity.
264 bmset(RA=0, RB=0, RC=mask) will produce a run of ones of length "mask" in a single instruction without needing to initialise or depend on any other registers.
265
266 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31| name |
267 | -- | -- | --- | --- | --- | ------- |--| ----- |
268 | NN | RT | RA | RB | RC | mode 010 |Rc| bm* |
269
270
271 ```
272 uint_xlen_t bmset(RA, RB, sh)
273 {
274 int shamt = RB & (XLEN - 1);
275 mask = (2<<sh)-1;
276 return RA | (mask << shamt);
277 }
278
279 uint_xlen_t bmclr(RA, RB, sh)
280 {
281 int shamt = RB & (XLEN - 1);
282 mask = (2<<sh)-1;
283 return RA & ~(mask << shamt);
284 }
285
286 uint_xlen_t bminv(RA, RB, sh)
287 {
288 int shamt = RB & (XLEN - 1);
289 mask = (2<<sh)-1;
290 return RA ^ (mask << shamt);
291 }
292
293 uint_xlen_t bmext(RA, RB, sh)
294 {
295 int shamt = RB & (XLEN - 1);
296 mask = (2<<sh)-1;
297 return mask & (RA >> shamt);
298 }
299 ```
300
301 bitmask extract with reverse. can be done by bitinverting all of RA and getting bits of RA from the opposite end.
302
303 ```
304 msb = rb[5:0];
305 rev[0:msb] = ra[msb:0];
306 rt = ZE(rev[msb:0]);
307
308 uint_xlen_t bmextrev(RA, RB, sh)
309 {
310 int shamt = (RB & (XLEN - 1));
311 shamt = (XLEN-1)-shamt; # shift other end
312 bra = bitreverse(RA) # swap LSB-MSB
313 mask = (2<<sh)-1;
314 return mask & (bra >> shamt);
315 }
316 ```
317
318 | 0.5|6.10|11.15|16.20|21.26| 27..30 |31| name |
319 | -- | -- | --- | --- | --- | ------- |--| ------ |
320 | NN | RT | RA | RB | sh | 1 011 |Rc| bmrevi |
321
322
323 # grevlut
324
325 generalised reverse combined with a pair of LUT2s and allowing
326 zero when RA=0 provides a wide range of instructions
327 and a means to set regular 64 bit patterns in one
328 32 bit instruction.
329
330 the two LUT2s are applied left-half (when not swapping)
331 and right-half (when swapping) so as to allow a wider
332 range of options
333
334 <img src="/openpower/sv/grevlut2x2.jpg" width=700 />
335
336 ```
337 lut2(imm, a, b):
338 idx = b << 1 | a
339 return imm[idx] # idx by LSB0 order
340
341 dorow(imm8, step_i, chunk_size):
342 for j in 0 to 63:
343 if (j&chunk_size) == 0
344 imm = imm8[0..3]
345 else
346 imm = imm8[4..7]
347 step_o[j] = lut2(imm, step_i[j], step_i[j ^ chunk_size])
348 return step_o
349
350 uint64_t grevlut64(uint64_t RA, uint64_t RB, uint8 imm)
351 {
352 uint64_t x = RA;
353 int shamt = RB & 63;
354 for i in 0 to 6
355 step = 1<<i
356 if (shamt & step) x = dorow(imm, x, step)
357 return x;
358 }
359
360 ```
361
362 # grev
363
364 based on RV bitmanip, this is also known as a butterfly network. however
365 where a butterfly network allows setting of every crossbar setting in
366 every row and every column, generalised-reverse (grev) only allows
367 a per-row decision: every entry in the same row must either switch or
368 not-switch.
369
370 <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/8/8c/Butterfly_Network.jpg/474px-Butterfly_Network.jpg" />
371
372 ```
373 uint64_t grev64(uint64_t RA, uint64_t RB)
374 {
375 uint64_t x = RA;
376 int shamt = RB & 63;
377 if (shamt & 1) x = ((x & 0x5555555555555555LL) << 1) |
378 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
379 if (shamt & 2) x = ((x & 0x3333333333333333LL) << 2) |
380 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
381 if (shamt & 4) x = ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
382 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
383 if (shamt & 8) x = ((x & 0x00FF00FF00FF00FFLL) << 8) |
384 ((x & 0xFF00FF00FF00FF00LL) >> 8);
385 if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
386 ((x & 0xFFFF0000FFFF0000LL) >> 16);
387 if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
388 ((x & 0xFFFFFFFF00000000LL) >> 32);
389 return x;
390 }
391
392 ```
393
394 # shuffle / unshuffle
395
396 based on RV bitmanip
397
398 ```
399 uint32_t shfl32(uint32_t RA, uint32_t RB)
400 {
401 uint32_t x = RA;
402 int shamt = RB & 15;
403 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
404 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
405 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
406 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
407 return x;
408 }
409 uint32_t unshfl32(uint32_t RA, uint32_t RB)
410 {
411 uint32_t x = RA;
412 int shamt = RB & 15;
413 if (shamt & 1) x = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
414 if (shamt & 2) x = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
415 if (shamt & 4) x = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
416 if (shamt & 8) x = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
417 return x;
418 }
419
420 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
421 {
422 uint64_t x = src & ~(maskL | maskR);
423 x |= ((src << N) & maskL) | ((src >> N) & maskR);
424 return x;
425 }
426 uint64_t shfl64(uint64_t RA, uint64_t RB)
427 {
428 uint64_t x = RA;
429 int shamt = RB & 31;
430 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
431 0x00000000ffff0000LL, 16);
432 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
433 0x0000ff000000ff00LL, 8);
434 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
435 0x00f000f000f000f0LL, 4);
436 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
437 0x0c0c0c0c0c0c0c0cLL, 2);
438 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
439 0x2222222222222222LL, 1);
440 return x;
441 }
442 uint64_t unshfl64(uint64_t RA, uint64_t RB)
443 {
444 uint64_t x = RA;
445 int shamt = RB & 31;
446 if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
447 0x2222222222222222LL, 1);
448 if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
449 0x0c0c0c0c0c0c0c0cLL, 2);
450 if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
451 0x00f000f000f000f0LL, 4);
452 if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
453 0x0000ff000000ff00LL, 8);
454 if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
455 0x00000000ffff0000LL, 16);
456 return x;
457 }
458 ```
459
460 # xperm
461
462 based on RV bitmanip
463
464 ```
465 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
466 {
467 uint_xlen_t r = 0;
468 uint_xlen_t sz = 1LL << sz_log2;
469 uint_xlen_t mask = (1LL << sz) - 1;
470 for (int i = 0; i < XLEN; i += sz) {
471 uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
472 if (pos < XLEN)
473 r |= ((RA >> pos) & mask) << i;
474 }
475 return r;
476 }
477 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
478 { return xperm(RA, RB, 2); }
479 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
480 { return xperm(RA, RB, 3); }
481 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
482 { return xperm(RA, RB, 4); }
483 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
484 { return xperm(RA, RB, 5); }
485 ```
486
487 # gorc
488
489 based on RV bitmanip
490
491 ```
492 uint32_t gorc32(uint32_t RA, uint32_t RB)
493 {
494 uint32_t x = RA;
495 int shamt = RB & 31;
496 if (shamt & 1) x |= ((x & 0x55555555) << 1) | ((x & 0xAAAAAAAA) >> 1);
497 if (shamt & 2) x |= ((x & 0x33333333) << 2) | ((x & 0xCCCCCCCC) >> 2);
498 if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4) | ((x & 0xF0F0F0F0) >> 4);
499 if (shamt & 8) x |= ((x & 0x00FF00FF) << 8) | ((x & 0xFF00FF00) >> 8);
500 if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) | ((x & 0xFFFF0000) >> 16);
501 return x;
502 }
503 uint64_t gorc64(uint64_t RA, uint64_t RB)
504 {
505 uint64_t x = RA;
506 int shamt = RB & 63;
507 if (shamt & 1) x |= ((x & 0x5555555555555555LL) << 1) |
508 ((x & 0xAAAAAAAAAAAAAAAALL) >> 1);
509 if (shamt & 2) x |= ((x & 0x3333333333333333LL) << 2) |
510 ((x & 0xCCCCCCCCCCCCCCCCLL) >> 2);
511 if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL) << 4) |
512 ((x & 0xF0F0F0F0F0F0F0F0LL) >> 4);
513 if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL) << 8) |
514 ((x & 0xFF00FF00FF00FF00LL) >> 8);
515 if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL) << 16) |
516 ((x & 0xFFFF0000FFFF0000LL) >> 16);
517 if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL) << 32) |
518 ((x & 0xFFFFFFFF00000000LL) >> 32);
519 return x;
520 }
521
522 ```
523
524 # Galois Field 2^M
525
526 see:
527
528 * <https://courses.csail.mit.edu/6.857/2016/files/ffield.py>
529 * <https://engineering.purdue.edu/kak/compsec/NewLectures/Lecture7.pdf>
530 * <https://foss.heptapod.net/math/libgf2/-/blob/branch/default/src/libgf2/gf2.py>
531
532 ## SPRs to set modulo and degree
533
534 to save registers and make operations orthogonal with standard
535 arithmetic the modulo is to be set in an SPR
536
537 ## Twin Butterfly (Cooley-Tukey) Mul-add-sub
538
539 used in combination with SV FFT REMAP to perform
540 a full NTT in-place. possible by having 3-in 2-out,
541 to avoid the need for a temp register. RS is written
542 to as well as RT.
543
544 gffmadd RT,RA,RC,RB (Rc=0)
545 gffmadd. RT,RA,RC,RB (Rc=1)
546
547 Pseudo-code:
548
549 RT <- GFADD(GFMUL(RA, RC), RB)
550 RS <- GFADD(GFMUL(RA, RC), RB)
551
552
553 ## Multiply
554
555 with the modulo and degree being in an SPR, multiply can be made
556 identical in form to standard integer add
557
558 RS = GFMUL(RA, RB)
559
560 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31|
561 | -- | -- | --- | --- | --- | ------ |--|
562 | NN | RT | RA | RB |11000| 01110 |Rc|
563
564
565
566 ```
567 from functools import reduce
568
569 def gf_degree(a) :
570 res = 0
571 a >>= 1
572 while (a != 0) :
573 a >>= 1;
574 res += 1;
575 return res
576
577 # constants used in the multGF2 function
578 mask1 = mask2 = polyred = None
579
580 def setGF2(irPoly):
581 """Define parameters of binary finite field GF(2^m)/g(x)
582 - irPoly: coefficients of irreducible polynomial g(x)
583 """
584 # degree: extension degree of binary field
585 degree = gf_degree(irPoly)
586
587 def i2P(sInt):
588 """Convert an integer into a polynomial"""
589 return [(sInt >> i) & 1
590 for i in reversed(range(sInt.bit_length()))]
591
592 global mask1, mask2, polyred
593 mask1 = mask2 = 1 << degree
594 mask2 -= 1
595 polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])
596
597 def multGF2(p1, p2):
598 """Multiply two polynomials in GF(2^m)/g(x)"""
599 p = 0
600 while p2:
601 # standard long-multiplication: check LSB and add
602 if p2 & 1:
603 p ^= p1
604 p1 <<= 1
605 # standard modulo: check MSB and add polynomial
606 if p1 & mask1:
607 p1 ^= polyred
608 p2 >>= 1
609 return p & mask2
610
611 if __name__ == "__main__":
612
613 # Define binary field GF(2^3)/x^3 + x + 1
614 setGF2(0b1011) # degree 3
615
616 # Evaluate the product (x^2 + x + 1)(x^2 + 1)
617 print("{:02x}".format(multGF2(0b111, 0b101)))
618
619 # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
620 # (used in the Advanced Encryption Standard-AES)
621 setGF2(0b100011011) # degree 8
622
623 # Evaluate the product (x^7)(x^7 + x + 1)
624 print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
625 ```
626 ## GF div and mod
627
628 ```
629 def gf_degree(a) :
630 res = 0
631 a >>= 1
632 while (a != 0) :
633 a >>= 1;
634 res += 1;
635 return res
636
637 def FullDivision(self, f, v):
638 """
639 Takes two arguments, f, v
640 fDegree and vDegree are the degrees of the field elements
641 f and v represented as a polynomials.
642 This method returns the field elements a and b such that
643
644 f(x) = a(x) * v(x) + b(x).
645
646 That is, a is the divisor and b is the remainder, or in
647 other words a is like floor(f/v) and b is like f modulo v.
648 """
649
650 fDegree, vDegree = gf_degree(f), gf_degree(v)
651 res, rem = 0, f
652 i = fDegree
653 mask = 1 << i
654 while (i >= vDegree):
655 if (mask & rem): # check MSB
656 res ^= (1 << (i - vDegree))
657 rem ^= (v << (i - vDegree))
658 i -= 1
659 mask >>= 1
660 return (res, rem)
661 ```
662
663 | 0.5|6.10|11.15|16.20|21.25| 26..30 |31| name |
664 | -- | -- | --- | --- | --- | ------- |--| ----- |
665 | NN | RS | RA | deg | RC | 0 1 011 |Rc| gfaddi |
666 | NN | RS | RA | RB | RC | 1 1 111 |Rc| gfadd |
667
668 GFMOD is a pseudo-op where RA=0
669
670 ## carryless mul
671
672 based on RV bitmanip
673 see <https://en.wikipedia.org/wiki/CLMUL_instruction_set>
674
675 these are GF2 operations with the modulo set to 2^degree.
676 they are worth adding as their own non-overwrite operations
677 (in the same pipeline).
678
679 ```
680 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
681 {
682 uint_xlen_t x = 0;
683 for (int i = 0; i < XLEN; i++)
684 if ((RB >> i) & 1)
685 x ^= RA << i;
686 return x;
687 }
688 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
689 {
690 uint_xlen_t x = 0;
691 for (int i = 1; i < XLEN; i++)
692 if ((RB >> i) & 1)
693 x ^= RA >> (XLEN-i);
694 return x;
695 }
696 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
697 {
698 uint_xlen_t x = 0;
699 for (int i = 0; i < XLEN; i++)
700 if ((RB >> i) & 1)
701 x ^= RA >> (XLEN-i-1);
702 return x;
703 }
704 ```
705
706 # bitmatrix
707
708 ```
709 uint64_t bmatflip(uint64_t RA)
710 {
711 uint64_t x = RA;
712 x = shfl64(x, 31);
713 x = shfl64(x, 31);
714 x = shfl64(x, 31);
715 return x;
716 }
717 uint64_t bmatxor(uint64_t RA, uint64_t RB)
718 {
719 // transpose of RB
720 uint64_t RBt = bmatflip(RB);
721 uint8_t u[8]; // rows of RA
722 uint8_t v[8]; // cols of RB
723 for (int i = 0; i < 8; i++) {
724 u[i] = RA >> (i*8);
725 v[i] = RBt >> (i*8);
726 }
727 uint64_t x = 0;
728 for (int i = 0; i < 64; i++) {
729 if (pcnt(u[i / 8] & v[i % 8]) & 1)
730 x |= 1LL << i;
731 }
732 return x;
733 }
734 uint64_t bmator(uint64_t RA, uint64_t RB)
735 {
736 // transpose of RB
737 uint64_t RBt = bmatflip(RB);
738 uint8_t u[8]; // rows of RA
739 uint8_t v[8]; // cols of RB
740 for (int i = 0; i < 8; i++) {
741 u[i] = RA >> (i*8);
742 v[i] = RBt >> (i*8);
743 }
744 uint64_t x = 0;
745 for (int i = 0; i < 64; i++) {
746 if ((u[i / 8] & v[i % 8]) != 0)
747 x |= 1LL << i;
748 }
749 return x;
750 }
751
752 ```
753
754 # Already in POWER ISA
755
756 ## count leading/trailing zeros with mask
757
758 in v3.1 p105
759
760 ```
761 count = 0
762 do i = 0 to 63
763     if ((RB)i=1) then do if ((RS)i=1) then break; count ← count + 1; end
764 RA ← EXTZ64(count)
765 ```
766
767 ## bit deposit
768
769 vpdepd VRT,VRA,VRB, identical to RV bitmamip bdep, found already in v3.1 p106
770
771 m ← 0; k ← 0; do while(m < 64)
772 if VSR[VRB+32].dword[i].bit[63-m]=1 then do
773 result = VSR[VRA+32].dword[i].bit[63-k]
774 VSR[VRT+32].dword[i].bit[63-m] = result
775 k = k + 1
776 m = m + 1
777
778 ```
779
780 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
781 {
782 uint_xlen_t r = 0;
783 for (int i = 0, j = 0; i < XLEN; i++)
784 if ((RB >> i) & 1) {
785 if ((RA >> j) & 1)
786 r |= uint_xlen_t(1) << i;
787 j++;
788 }
789 return r;
790 }
791
792 ```
793
794 # bit extract
795
796 other way round: identical to RV bext, found in v3.1 p196
797
798 ```
799 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
800 {
801 uint_xlen_t r = 0;
802 for (int i = 0, j = 0; i < XLEN; i++)
803 if ((RB >> i) & 1) {
804 if ((RA >> i) & 1)
805 r |= uint_xlen_t(1) << j;
806 j++;
807 }
808 return r;
809 }
810 ```
811
812 # centrifuge
813
814 found in v3.1 p106 so not to be added here
815
816 ```
817 ptr0 = 0
818 ptr1 = 0
819 do i = 0 to 63
820     if ((RB)i = 0) then do
821         result(ptr0) = (RS)i
822         ptr0 = ptr0 + 1
823     end
824     if ((RB)(63-i) = 1) then do
825         result(63-ptr1) = (RS)(63-i)
826         ptr1 = ptr1 + 1
827     end
828 RA = result
829 ```
830