openpower/sv/bitmanip.mdwn

   1 [[!tag standards]]
   2
   3 # bitmanipulation
   4
   5 **DRAFT STATUS**
   6
   7 this extension amalgamates bitnanipulation primitives from many sources, including RISC-V bitmanip, Packed SIMD, AVX-512 and OpenPOWER VSX.  Vectorisation and SIMD are removed: these are straight scalar (element) operations.  Vectorisation Context is provided by [[openpower/sv]].
   8
   9 ternaryv is experimental and is the only operation that may be considered a "Packed SIMD".  It is added as a variant of the already well-justified ternary operation (done in AVX512 as an immediate only) "because it looks fun". As it is based on the LUT4 concept it will allow accelerated emulation of FPGAs.  Other vendors of ISAs are buying FPGA companies to achieve a similar objective.
  10
  11 general-purpose Galois Field operations are added so as to avoid huge opcode proliferation across many areas of Computer Science.  however for convenience and also to avoid setup costs, some of the more common operations (clmul, crc32) are also added.  The expectation is that these operations would all be covered by the same pipeline.
  12
  13 # summary
  14
  15 minor opcode allocation
  16
  17     |  28.30 |31| name      |
  18     | ------ |--| --------- |
  19     |   00   |Rc| ternaryi  |
  20     |  001   |Rc| ternary   |
  21     |  010   |Rc| bitmask   |
  22     |  011   |Rc| gf*       |
  23     |  101   |1 | ternaryv  |
  24     |  101   |0 | ternarycr |
  25     |  110   |Rc| 1/2-op    |
  26     |  111   |Rc| bitmaski  |
  27
  28 1-op and variants
  29
  30 | dest | src1 | subop | op       |
  31 | ---- | ---- | ----- | -------- |
  32 | RT   | RA   | ..    | bmatflip |
  33
  34 2-op and variants
  35
  36 | dest | src1 | src2 | subop | op       |
  37 | ---- | ---- | ---- | ----- | -------- |
  38 | RT   | RA   | RB   | or    | bmatflip |
  39 | RT   | RA   | RB   | xor   | bmatflip |
  40 | RT   | RA   | RB   | bdep  | dep/ext  |
  41 | RT   | RA   | RB   | bext  | dep/ext  |
  42 | RT   | RA   | RB   |       | grev  |
  43 | RT   | RA   | RB   |       | clmul*  |
  44 | RT   | RA   | RB   |       | gorc |
  45 | RT   | RA   | RB   | shuf  | shuffle |
  46 | RT   | RA   | RB   | unshuf| shuffle |
  47 | RT   | RA   | RB   | width | xperm  |
  48 | RT   | RA   | RB   | type | minmax |
  49 | RT   | RA   | RB   |  |  |
  50 | RT   | RA   | RB   |  |  |
  51 | RT   | RA   | RB   |  |  |
  52
  53 3 ops
  54
  55 * bitmask set/extract
  56 * ternary bitops
  57 * GF
  58
  59 | 0.5|6.10|11.15|16.20|21..25 | 26....30 |31| name |
  60 | -- | -- | --- | --- | ----- | -------- |--| ------ |
  61 | NN | RT | RA  | RB  | RC    | mode 001 |Rc| ternary |
  62 | NN | RT | RA  | RB  | im0-4 | im5-7 00 |Rc| ternaryi |
  63 | NN | RS | RA  | RB  | deg   | 00  011  |Rc| gfmul |
  64 | NN | RS | RA  | RB  | deg   | 01  011  |Rc| gfadd |
  65 | NN | RT | RA  | RB  | deg   | 10  011  |Rc| gfinv |
  66 | NN | RS | RA  | RB  | deg   | 11  011  |Rc| gfmod |
  67
  68 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31| name |
  69 | -- | -- | --- | ----- | ---- | ----- |--| ------ |
  70 | NN | RT | RA  | imm   | mask | 101   |1 | ternaryv |
  71
  72 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31| name |
  73 | -- | -- | --- | --- |- |-----|----- | -----|--| -------|
  74 | NN | BA | BB  | BC  |0 |imm  | mask | 101  |0 | ternarycr |
  75
  76 ops
  77
  78 | 0.5|6.10|11.15|16.20| 21.22 | 23 | 24....30 |31| name |
  79 | -- | -- | --- | --- | ----- | -- | -------- |--| ---- |
  80 | NN | RA | RB  |     |       | 0  | 0000 110 |Rc| rsvd   |
  81 | NN | RA | RB  | RC  | itype | 1  | 0000 110 |Rc| xperm |
  82 | NN | RA | RB  | RC  | itype | 0  | 0100 110 |Rc| minmax |
  83 | NN | RA | RB  |     |       | 1  | 0100 110 |Rc| rsvd |
  84 | NN | RA | RB  | sh  | itype | SH | 1000 110 |Rc| bmopsi |
  85 | NN | RA | RB  |     |       |    | 1100 110 |Rc| rsvd |
  86 | NN | RA | RB  |     |       | 0  | 0001 110 |Rc| rsvd |
  87 | NN | RA | RB  |     |       | 0  | 0101 110 |Rc| rsvd |
  88 | NN | RA | RB  | RC  | 00    | 0  | 0010 110 |Rc| gorc |
  89 | NN | RA | RB  | sh  | 00    | SH | 1010 110 |Rc| gorci |
  90 | NN | RA | RB  | RC  | 00    | 0  | 0110 110 |Rc| gorcw |
  91 | NN | RA | RB  | sh  | 00    | 0  | 1110 110 |Rc| gorcwi |
  92 | NN | RA | RB  | RC  | 00    | 1  | 1110 110 |Rc| bmator  |
  93 | NN | RA | RB  | RC  | 01    | 0  | 0010 110 |Rc| grev |
  94 | NN | RA | RB  | RC  | 01    | 1  | 0010 110 |Rc| clmul |
  95 | NN | RA | RB  | sh  | 01    | SH | 1010 110 |Rc| grevi |
  96 | NN | RA | RB  | RC  | 01    | 0  | 0110 110 |Rc| grevw |
  97 | NN | RA | RB  | sh  | 01    | 0  | 1110 110 |Rc| grevwi |
  98 | NN | RA | RB  | RC  | 01    | 1  | 1110 110 |Rc| bmatxor   |
  99 | NN | RA | RB  | RC  | 10    | 0  | 0010 110 |Rc| shfl |
 100 | NN | RA | RB  | sh  | 10    | SH | 1010 110 |Rc| shfli |
 101 | NN | RA | RB  | RC  | 10    | 0  | 0110 110 |Rc| shflw |
 102 | NN | RA | RB  | RC  | 10    | 0  | 1110 110 |Rc| bdep   |
 103 | NN | RA | RB  | RC  | 10    | 1  | 1110 110 |Rc| bext  |
 104 | NN | RA | RB  | RC  | 11    | 0  | 1110 110 |Rc| clmulr  |
 105 | NN | RA | RB  | RC  | 11    | 1  | 1110 110 |Rc| clmulh  |
 106 | NN | RA | RB  |     |       |    | NN11 110 |Rc| rsvd  |
 107
 108 # bit to byte permute
 109
 110 similar to matrix permute in RV bitmanip, which has XOR and OR variants
 111
 112     do j = 0 to 7
 113       do k = 0 to 7
 114          b = VSR[VRB+32].dword[i].byte[k].bit[j]
 115          VSR[VRT+32].dword[i].byte[j].bit[k] = b
 116
 117 # vector bit deposit
 118
 119 vpdepd VRT,VRA,VRB, identical to RV bitmamip bdep
 120
 121     do while(m < 64)
 122        if VSR[VRB+32].dword[i].bit[63-m]=1 then do
 123           result = VSR[VRA+32].dword[i].bit[63-k]
 124           VSR[VRT+32].dword[i].bit[63-m] = result
 125           k = k + 1
 126        m = m + 1
 127
 128 ```
 129
 130 uint_xlen_t bdep(uint_xlen_t RA, uint_xlen_t RB)
 131 {
 132     uint_xlen_t r = 0;
 133     for (int i = 0, j = 0; i < XLEN; i++)
 134         if ((RB >> i) & 1) {
 135             if ((RA >> j) & 1)
 136                 r |= uint_xlen_t(1) << i;
 137             j++;
 138         }
 139     return r;
 140 }
 141
 142 ```
 143
 144 # vector bit extract
 145
 146 other way round: identical to RV bext
 147
 148 ```
 149 uint_xlen_t bext(uint_xlen_t RA, uint_xlen_t RB)
 150 {
 151     uint_xlen_t r = 0;
 152     for (int i = 0, j = 0; i < XLEN; i++)
 153         if ((RB >> i) & 1) {
 154             if ((RA >> i) & 1)
 155                 r |= uint_xlen_t(1) << j;
 156             j++;
 157         }
 158     return r;
 159 }
 160 ```
 161
 162 # int min/max
 163
 164 signed and unsigned min/max for integer.  this is sort-of partly synthesiseable in [[sv/svp64]] with pred-result as long as the dest reg is one of the sources, but not both signed and unsigned.  when the dest is also one of the srces and the mv fails due to the CR bittest failing this will only overwrite the dest where the src is greater (or less).
 165
 166 signed/unsigned min/max gives more flexibility.
 167
 168 ```
 169 uint_xlen_t min(uint_xlen_t rs1, uint_xlen_t rs2)
 170 { return (int_xlen_t)rs1 < (int_xlen_t)rs2 ? rs1 : rs2;
 171 }
 172 uint_xlen_t max(uint_xlen_t rs1, uint_xlen_t rs2)
 173 { return (int_xlen_t)rs1 > (int_xlen_t)rs2 ? rs1 : rs2;
 174 }
 175 uint_xlen_t minu(uint_xlen_t rs1, uint_xlen_t rs2)
 176 { return rs1 < rs2 ? rs1 : rs2;
 177 }
 178 uint_xlen_t maxu(uint_xlen_t rs1, uint_xlen_t rs2)
 179 { return rs1 > rs2 ? rs1 : rs2;
 180 }
 181 ```
 182
 183
 184 # ternary bitops
 185
 186 Similar to FPGA LUTs: for every bit perform a lookup into a table using an 8bit immediate, or in another register
 187
 188 | 0.5|6.10|11.15|16.20| 21..25| 26..30   |31|
 189 | -- | -- | --- | --- | ----- | -------- |--|
 190 | NN | RT | RA  | RB  | im0-4 | im5-7 00 |Rc|
 191
 192     for i in range(64):
 193         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 194         RT[i] = (imm & (1<<idx)) != 0
 195
 196 bits 21..22 may be used to specify a mode, such as treating the whole integer zero/nonzero and putting 1/0 in the result, rather than bitwise test.
 197
 198 a 4 operand variant which becomes more along the lines of an FPGA:
 199
 200 | 0.5|6.10|11.15|16.20|21.25| 26...30  |31|
 201 | -- | -- | --- | --- | --- | -------- |--|
 202 | NN | RT | RA  | RB  | RC  | mode 001 |Rc|
 203
 204     for i in range(64):
 205         idx = RT[i] << 2 | RA[i] << 1 | RB[i]
 206         RT[i] = (RC & (1<<idx)) != 0
 207
 208 mode (2 bit) may be used to do inversion of ordering, similar to carryless mul,
 209 3 modes.
 210
 211 also, another possible variant involving swizzle and vec4:
 212
 213 | 0.5|6.10|11.15| 16.23 |24.27 | 28.30 |31|
 214 | -- | -- | --- | ----- | ---- | ----- |--|
 215 | NN | RT | RA  | imm   | mask | 101   |1 |
 216
 217     for i in range(8):
 218         idx = RA.x[i] << 2 | RA.y[i] << 1 | RA.z[i]
 219         res = (imm & (1<<idx)) != 0
 220         for j in range(3):
 221              if mask[j]: RT[i+j*8] = res
 222
 223 another mode selection would be CRs not Ints.
 224
 225 | 0.5|6.8 | 9.11|12.14|15|16.23|24.27 | 28.30|31|
 226 | -- | -- | --- | --- |- |-----|----- | -----|--|
 227 | NN | BA | BB  | BC  |0 |imm  | mask | 101  |0 |
 228
 229     for i in range(4):
 230         if not mask[i] continue
 231         idx = crregs[BA][i] << 2 |
 232               crregs[BB][i] << 1 |
 233               crregs[BC][i]
 234         crregs[BA][i] = (imm & (1<<idx)) != 0
 235
 236 # bitmask set
 237
 238 based on RV bitmanip singlebit set, instruction format similar to shift
 239 [[isa/fixedshift]].  bmext is actually covered already (shift-with-mask rldicl but only immediate version).
 240 however bitmask-invert is not, and set/clr are not covered, although they can use the same Shift ALU.
 241
 242 bmext (RB) version is not the same as rldicl because bmext is a right shift by RC, where rldicl is a left rotate.  for the immediate version this does not matter, so a bmexti is not required.
 243 bmrev however there is no direct equivalent and consequently a bmrevi is required.
 244
 245 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31| name  |
 246 | -- | -- | --- | --- | --- | ------- |--| ----- |
 247 | NN | RT | RA  | RB  | RC  | mode 010 |Rc| bm*   |
 248 | NN | RT | RA  | RB  | RC  | 0 1  111 |Rc| bmrev |
 249 | NN |    |     |     |     | 1 1  111 |Rc| rsvd |
 250
 251 ```
 252 uint_xlen_t bmset(RA, RB, sh)
 253 {
 254     int shamt = RB & (XLEN - 1);
 255     mask = (2<<sh)-1;
 256     return RA | (mask << shamt);
 257 }
 258
 259 uint_xlen_t bmclr(RA, RB, sh)
 260 {
 261     int shamt = RB & (XLEN - 1);
 262     mask = (2<<sh)-1;
 263     return RA & ~(mask << shamt);
 264 }
 265
 266 uint_xlen_t bminv(RA, RB, sh)
 267 {
 268     int shamt = RB & (XLEN - 1);
 269     mask = (2<<sh)-1;
 270     return RA ^ (mask << shamt);
 271 }
 272
 273 uint_xlen_t bmext(RA, RB, sh)
 274 {
 275     int shamt = RB & (XLEN - 1);
 276     mask = (2<<sh)-1;
 277     return mask & (RA >> shamt);
 278 }
 279 ```
 280
 281 bitmask extract with reverse
 282
 283 ```
 284 msb = rb[5:0];
 285 rev[0:msb] = ra[msb:0];
 286 rt = ZE(rev[msb:0]);
 287 ```
 288
 289 | 0.5|6.10|11.15|16.20|21.26| 27..30  |31| name   |
 290 | -- | -- | --- | --- | --- | ------- |--| ------ |
 291 | NN | RT | RA  | RB  | sh  | 0   111 |Rc| bmrevi |
 292
 293
 294
 295 # grev
 296
 297 based on RV bitmanip
 298
 299 ```
 300 uint64_t grev64(uint64_t RA, uint64_t RB)
 301 {
 302     uint64_t x = RA;
 303     int shamt = RB & 63;
 304     if (shamt & 1) x = ((x &  0x5555555555555555LL) <<  1) |
 305                         ((x & 0xAAAAAAAAAAAAAAAALL) >>  1);
 306     if (shamt & 2) x = ((x &  0x3333333333333333LL) <<  2) |
 307                         ((x & 0xCCCCCCCCCCCCCCCCLL) >>  2);
 308     if (shamt & 4) x = ((x &  0x0F0F0F0F0F0F0F0FLL) <<  4) |
 309                         ((x & 0xF0F0F0F0F0F0F0F0LL) >>  4);
 310     if (shamt & 8) x = ((x &  0x00FF00FF00FF00FFLL) <<  8) |
 311                         ((x & 0xFF00FF00FF00FF00LL) >>  8);
 312     if (shamt & 16) x = ((x & 0x0000FFFF0000FFFFLL) << 16) |
 313                         ((x & 0xFFFF0000FFFF0000LL) >> 16);
 314     if (shamt & 32) x = ((x & 0x00000000FFFFFFFFLL) << 32) |
 315                         ((x & 0xFFFFFFFF00000000LL) >> 32);
 316     return x;
 317 }
 318
 319 ```
 320
 321 # shuffle / unshuffle
 322
 323 based on RV bitmanip
 324
 325 ```
 326 uint32_t shfl32(uint32_t RA, uint32_t RB)
 327 {
 328     uint32_t x = RA;
 329     int shamt = RB & 15;
 330     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
 331     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 332     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 333     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
 334     return x;
 335 }
 336 uint32_t unshfl32(uint32_t RA, uint32_t RB)
 337 {
 338     uint32_t x = RA;
 339     int shamt = RB & 15;
 340     if (shamt & 1) x  = shuffle32_stage(x, 0x44444444, 0x22222222, 1);
 341     if (shamt & 2) x  = shuffle32_stage(x, 0x30303030, 0x0c0c0c0c, 2);
 342     if (shamt & 4) x  = shuffle32_stage(x, 0x0f000f00, 0x00f000f0, 4);
 343     if (shamt & 8) x  = shuffle32_stage(x, 0x00ff0000, 0x0000ff00, 8);
 344     return x;
 345 }
 346
 347 uint64_t shuffle64_stage(uint64_t src, uint64_t maskL, uint64_t maskR, int N)
 348 {
 349     uint64_t x = src & ~(maskL | maskR);
 350     x |= ((src << N) & maskL) | ((src >> N) & maskR);
 351     return x;
 352 }
 353 uint64_t shfl64(uint64_t RA, uint64_t RB)
 354 {
 355     uint64_t x = RA;
 356     int shamt = RB & 31;
 357     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 358                                            0x00000000ffff0000LL, 16);
 359     if (shamt & 8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 360                                            0x0000ff000000ff00LL, 8);
 361     if (shamt & 4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 362                                            0x00f000f000f000f0LL, 4);
 363     if (shamt & 2) x = shuffle64_stage(x, 0x3030303030303030LL,
 364                                            0x0c0c0c0c0c0c0c0cLL, 2);
 365     if (shamt & 1) x = shuffle64_stage(x, 0x4444444444444444LL,
 366                                            0x2222222222222222LL, 1);
 367     return x;
 368 }
 369 uint64_t unshfl64(uint64_t RA, uint64_t RB)
 370 {
 371     uint64_t x = RA;
 372     int shamt = RB & 31;
 373     if (shamt &  1) x = shuffle64_stage(x, 0x4444444444444444LL,
 374                                            0x2222222222222222LL, 1);
 375     if (shamt &  2) x = shuffle64_stage(x, 0x3030303030303030LL,
 376                                            0x0c0c0c0c0c0c0c0cLL, 2);
 377     if (shamt &  4) x = shuffle64_stage(x, 0x0f000f000f000f00LL,
 378                                            0x00f000f000f000f0LL, 4);
 379     if (shamt &  8) x = shuffle64_stage(x, 0x00ff000000ff0000LL,
 380                                            0x0000ff000000ff00LL, 8);
 381     if (shamt & 16) x = shuffle64_stage(x, 0x0000ffff00000000LL,
 382                                            0x00000000ffff0000LL, 16);
 383     return x;
 384 }
 385 ```
 386
 387 # xperm
 388
 389 based on RV bitmanip
 390
 391 ```
 392 uint_xlen_t xperm(uint_xlen_t RA, uint_xlen_t RB, int sz_log2)
 393 {
 394     uint_xlen_t r = 0;
 395     uint_xlen_t sz = 1LL << sz_log2;
 396     uint_xlen_t mask = (1LL << sz) - 1;
 397     for (int i = 0; i < XLEN; i += sz) {
 398         uint_xlen_t pos = ((RB >> i) & mask) << sz_log2;
 399         if (pos < XLEN)
 400             r |= ((RA >> pos) & mask) << i;
 401     }
 402     return r;
 403 }
 404 uint_xlen_t xperm_n (uint_xlen_t RA, uint_xlen_t RB)
 405 {  return xperm(RA, RB, 2); }
 406 uint_xlen_t xperm_b (uint_xlen_t RA, uint_xlen_t RB)
 407 {  return xperm(RA, RB, 3); }
 408 uint_xlen_t xperm_h (uint_xlen_t RA, uint_xlen_t RB)
 409 {  return xperm(RA, RB, 4); }
 410 uint_xlen_t xperm_w (uint_xlen_t RA, uint_xlen_t RB)
 411 {  return xperm(RA, RB, 5); }
 412 ```
 413
 414 # gorc
 415
 416 based on RV bitmanip
 417
 418 ```
 419 uint32_t gorc32(uint32_t RA, uint32_t RB)
 420 {
 421     uint32_t x = RA;
 422     int shamt = RB & 31;
 423     if (shamt & 1) x |= ((x & 0x55555555) << 1)   |  ((x &  0xAAAAAAAA) >> 1);
 424     if (shamt & 2) x |= ((x & 0x33333333) << 2)   |  ((x &  0xCCCCCCCC) >> 2);
 425     if (shamt & 4) x |= ((x & 0x0F0F0F0F) << 4)   |  ((x &  0xF0F0F0F0) >> 4);
 426     if (shamt & 8) x |= ((x & 0x00FF00FF) << 8)   |  ((x &  0xFF00FF00) >> 8);
 427     if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) |  ((x &  0xFFFF0000) >> 16);
 428     return x;
 429 }
 430 uint64_t gorc64(uint64_t RA, uint64_t RB)
 431 {
 432     uint64_t x = RA;
 433     int shamt = RB & 63;
 434     if (shamt & 1) x |= ((x & 0x5555555555555555LL)   <<   1) |
 435                          ((x & 0xAAAAAAAAAAAAAAAALL)  >>  1);
 436     if (shamt & 2) x |= ((x & 0x3333333333333333LL)   <<   2) |
 437                          ((x & 0xCCCCCCCCCCCCCCCCLL)  >>  2);
 438     if (shamt & 4) x |= ((x & 0x0F0F0F0F0F0F0F0FLL)   <<   4) |
 439                          ((x & 0xF0F0F0F0F0F0F0F0LL)  >>  4);
 440     if (shamt & 8) x |= ((x & 0x00FF00FF00FF00FFLL)   <<   8) |
 441                          ((x & 0xFF00FF00FF00FF00LL)  >>  8);
 442     if (shamt & 16) x |= ((x & 0x0000FFFF0000FFFFLL)  << 16) |
 443                          ((x & 0xFFFF0000FFFF0000LL)  >> 16);
 444     if (shamt & 32) x |= ((x & 0x00000000FFFFFFFFLL)  << 32) |
 445                          ((x & 0xFFFFFFFF00000000LL)  >> 32);
 446     return x;
 447 }
 448
 449 ```
 450
 451 # cmix
 452
 453 based on RV bitmanip, covered by ternary bitops
 454
 455 ```
 456 uint_xlen_t cmix(uint_xlen_t RA, uint_xlen_t RB, uint_xlen_t RC) {
 457     return (RA & RB) | (RC & ~RB);
 458 }
 459 ```
 460
 461 # carryless mul
 462
 463 based on RV bitmanip
 464 see https://en.wikipedia.org/wiki/CLMUL_instruction_set
 465
 466 ```
 467 uint_xlen_t clmul(uint_xlen_t RA, uint_xlen_t RB)
 468 {
 469     uint_xlen_t x = 0;
 470     for (int i = 0; i < XLEN; i++)
 471         if ((RB >> i) & 1)
 472             x ^= RA << i;
 473     return x;
 474 }
 475 uint_xlen_t clmulh(uint_xlen_t RA, uint_xlen_t RB)
 476 {
 477     uint_xlen_t x = 0;
 478     for (int i = 1; i < XLEN; i++)
 479         if ((RB >> i) & 1)
 480             x ^= RA >> (XLEN-i);
 481     return x;
 482 }
 483 uint_xlen_t clmulr(uint_xlen_t RA, uint_xlen_t RB)
 484 {
 485     uint_xlen_t x = 0;
 486     for (int i = 0; i < XLEN; i++)
 487         if ((RB >> i) & 1)
 488             x ^= RA >> (XLEN-i-1);
 489     return x;
 490 }
 491 ```
 492 # Galois Field
 493
 494 ## Multiply
 495
 496 this requires 3 parameters and a "degree"
 497
 498     RT = GFMUL(RA, RB, gfdegree, modulo=RC)
 499
 500 realistically with the degree also needing to be an immediate it should be brought down to an overwrite version:
 501
 502     RS = GFMUL(RS, RA, gfdegree, modulo=RB)
 503
 504 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
 505 | -- | -- | --- | --- | --- | ------- |--|
 506 | NN | RS | RA  | RB  | deg | 00  011 |Rc|
 507
 508 where the SimpleV variant may override RS-as-src differently from RS-as-dest
 509
 510
 511
 512 ```
 513 from functools import reduce
 514
 515 # constants used in the multGF2 function
 516 mask1 = mask2 = polyred = None
 517
 518 def setGF2(degree, irPoly):
 519     """Define parameters of binary finite field GF(2^m)/g(x)
 520        - degree: extension degree of binary field
 521        - irPoly: coefficients of irreducible polynomial g(x)
 522     """
 523     def i2P(sInt):
 524         """Convert an integer into a polynomial"""
 525         return [(sInt >> i) & 1
 526                 for i in reversed(range(sInt.bit_length()))]
 527
 528     global mask1, mask2, polyred
 529     mask1 = mask2 = 1 << degree
 530     mask2 -= 1
 531     polyred = reduce(lambda x, y: (x << 1) + y, i2P(irPoly)[1:])
 532
 533 def multGF2(p1, p2):
 534     """Multiply two polynomials in GF(2^m)/g(x)"""
 535     p = 0
 536     while p2:
 537         if p2 & 1:
 538             p ^= p1
 539         p1 <<= 1
 540         if p1 & mask1:
 541             p1 ^= polyred
 542         p2 >>= 1
 543     return p & mask2
 544
 545 if __name__ == "__main__":
 546
 547     # Define binary field GF(2^3)/x^3 + x + 1
 548     setGF2(3, 0b1011)
 549
 550     # Evaluate the product (x^2 + x + 1)(x^2 + 1)
 551     print("{:02x}".format(multGF2(0b111, 0b101)))
 552
 553     # Define binary field GF(2^8)/x^8 + x^4 + x^3 + x + 1
 554     # (used in the Advanced Encryption Standard-AES)
 555     setGF2(8, 0b100011011)
 556
 557     # Evaluate the product (x^7)(x^7 + x + 1)
 558     print("{:02x}".format(multGF2(0b10000000, 0b10000011)))
 559 ```
 560 ## GF add
 561
 562     RS = GFADD(RS, RA|0, gfdegree, modulo=RB)
 563
 564 | 0.5|6.10|11.15|16.20|21.25| 26..30  |31|
 565 | -- | -- | --- | --- | --- | ------- |--|
 566 | NN | RS | RA  | RB  | deg | 01  011 |Rc|
 567
 568 ## gf invert
 569
 570 ```
 571 def gf_degree(a) :
 572   res = 0
 573   a >>= 1
 574   while (a != 0) :
 575     a >>= 1;
 576     res += 1;
 577   return res
 578
 579 def gf_invert(a, mod=0x1B) :
 580   v = mod
 581   g1 = 1
 582   g2 = 0
 583   j = gf_degree(a) - 8
 584
 585   while (a != 1) :
 586     if (j < 0) :
 587       a, v = v, a
 588       g1, g2 = g2, g1
 589       j = -j
 590
 591     a ^= v << j
 592     g1 ^= g2 << j
 593
 594     a %= 256  # Emulating 8-bit overflow
 595     g1 %= 256 # Emulating 8-bit overflow
 596
 597     j = gf_degree(a) - gf_degree(v)
 598
 599   return g1
 600 ```
 601
 602 ## GF mod
 603
 604 modulo arithmetic to ensure a polynomial is in range seems sensible to have.
 605
 606 # bitmatrix
 607
 608 ```
 609 uint64_t bmatflip(uint64_t RA)
 610 {
 611     uint64_t x = RA;
 612     x = shfl64(x, 31);
 613     x = shfl64(x, 31);
 614     x = shfl64(x, 31);
 615     return x;
 616 }
 617 uint64_t bmatxor(uint64_t RA, uint64_t RB)
 618 {
 619     // transpose of RB
 620     uint64_t RBt = bmatflip(RB);
 621     uint8_t u[8]; // rows of RA
 622     uint8_t v[8]; // cols of RB
 623     for (int i = 0; i < 8; i++) {
 624         u[i] = RA >> (i*8);
 625         v[i] = RBt >> (i*8);
 626     }
 627     uint64_t x = 0;
 628     for (int i = 0; i < 64; i++) {
 629         if (pcnt(u[i / 8] & v[i % 8]) & 1)
 630             x |= 1LL << i;
 631     }
 632     return x;
 633 }
 634 uint64_t bmator(uint64_t RA, uint64_t RB)
 635 {
 636     // transpose of RB
 637     uint64_t RBt = bmatflip(RB);
 638     uint8_t u[8]; // rows of RA
 639     uint8_t v[8]; // cols of RB
 640     for (int i = 0; i < 8; i++) {
 641         u[i] = RA >> (i*8);
 642         v[i] = RBt >> (i*8);
 643     }
 644     uint64_t x = 0;
 645     for (int i = 0; i < 64; i++) {
 646         if ((u[i / 8] & v[i % 8]) != 0)
 647             x |= 1LL << i;
 648     }
 649     return x;
 650 }
 651
 652 ```