Main loop for xchacha_hchacha20:

for (i = 0; i < 10; i++){
    QUARTERROUND(x0, x4, x8, x12);
    QUARTERROUND(x1, x5, x9, x13);
    QUARTERROUND(x2, x6, x10, x14);
    QUARTERROUND(x3, x7, x11, x15);
    QUARTERROUND(x0, x5, x10, x15);
    QUARTERROUND(x1, x6, x11, x12);
    QUARTERROUND(x2, x7, x8, x13);
    QUARTERROUND(x3, x4, x9, x14);
}

#define QUARTERROUND(a,b,c,d) \
    a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \
    c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \
    a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \
    c = PLUS(c,d); b = ROTATE(XOR(b,c), 7);
We see that the loop is split into two groups of QUARTERROUND calls, one with step=4:

QUARTERROUND(x0, x4, x8, x12);
QUARTERROUND(x1, x5, x9, x13);
QUARTERROUND(x2, x6, x10, x14);
QUARTERROUND(x3, x7, x11, x15);

and another with step=5:

QUARTERROUND(x0, x5, x10, x15);
QUARTERROUND(x1, x6, x11, x12);
QUARTERROUND(x2, x7, x8, x13);
QUARTERROUND(x3, x4, x9, x14);
Let's start with the first group of QUARTERROUNDs. Unrolling it essentially results in the following instructions:

x0 = x0 + x4; x12 = ROTATE(x12 ^ x0, 16);
x8 = x8 + x12; x4 = ROTATE(x4 ^ x8, 12);
x0 = x0 + x4; x12 = ROTATE(x12 ^ x0, 8);
x8 = x8 + x12; x4 = ROTATE(x4 ^ x8, 7);
x1 = x1 + x5; x13 = ROTATE(x13 ^ x1, 16);
x9 = x9 + x13; x5 = ROTATE(x5 ^ x9, 12);
x1 = x1 + x5; x13 = ROTATE(x13 ^ x1, 8);
x9 = x9 + x13; x5 = ROTATE(x5 ^ x9, 7);
x2 = x2 + x6; x14 = ROTATE(x14 ^ x2, 16);
x10 = x10 + x14; x6 = ROTATE(x6 ^ x10, 12);
x2 = x2 + x6; x14 = ROTATE(x14 ^ x2, 8);
x10 = x10 + x14; x6 = ROTATE(x6 ^ x10, 7);
x3 = x3 + x7; x15 = ROTATE(x15 ^ x3, 16);
x11 = x11 + x15; x7 = ROTATE(x7 ^ x11, 12);
x3 = x3 + x7; x15 = ROTATE(x15 ^ x3, 8);
x11 = x11 + x15; x7 = ROTATE(x7 ^ x11, 7);
The second group of QUARTERROUNDs, unrolled:

x0 = x0 + x5; x15 = ROTATE(x15 ^ x0, 16);
x10 = x10 + x15; x5 = ROTATE(x5 ^ x10, 12);
x0 = x0 + x5; x15 = ROTATE(x15 ^ x0, 8);
x10 = x10 + x15; x5 = ROTATE(x5 ^ x10, 7);
x1 = x1 + x6; x12 = ROTATE(x12 ^ x1, 16);
x11 = x11 + x12; x6 = ROTATE(x6 ^ x11, 12);
x1 = x1 + x6; x12 = ROTATE(x12 ^ x1, 8);
x11 = x11 + x12; x6 = ROTATE(x6 ^ x11, 7);
x2 = x2 + x7; x13 = ROTATE(x13 ^ x2, 16);
x8 = x8 + x13; x7 = ROTATE(x7 ^ x8, 12);
x2 = x2 + x7; x13 = ROTATE(x13 ^ x2, 8);
x8 = x8 + x13; x7 = ROTATE(x7 ^ x8, 7);
x3 = x3 + x4; x14 = ROTATE(x14 ^ x3, 16);
x9 = x9 + x14; x4 = ROTATE(x4 ^ x9, 12);
x3 = x3 + x4; x14 = ROTATE(x14 ^ x3, 8);
x9 = x9 + x14; x4 = ROTATE(x4 ^ x9, 7);
Let's list the additions only:

x0 = x0 + x4;  x8 = x8 + x12;   x0 = x0 + x4;  x8 = x8 + x12;
x1 = x1 + x5;  x9 = x9 + x13;   x1 = x1 + x5;  x9 = x9 + x13;
x2 = x2 + x6;  x10 = x10 + x14; x2 = x2 + x6;  x10 = x10 + x14;
x3 = x3 + x7;  x11 = x11 + x15; x3 = x3 + x7;  x11 = x11 + x15;
x0 = x0 + x5;  x10 = x10 + x15; x0 = x0 + x5;  x10 = x10 + x15;
x1 = x1 + x6;  x11 = x11 + x12; x1 = x1 + x6;  x11 = x11 + x12;
x2 = x2 + x7;  x8 = x8 + x13;   x2 = x2 + x7;  x8 = x8 + x13;
x3 = x3 + x4;  x9 = x9 + x14;   x3 = x3 + x4;  x9 = x9 + x14;
Since we're going to use Vertical-First mode, the additions will be executed one by one, and we need to note the indices that are going to be used for each operation. Recall that sv.add is the instruction that will be executed, in the form:

sv.add RT, RA, RB # RT = RA + RB
Let's assume the values x reside in the register file starting from GPR 24:

GPR 24 | x0  | x1  | x2  | x3  |
GPR 28 | x4  | x5  | x6  | x7  |
GPR 32 | x8  | x9  | x10 | x11 |
GPR 36 | x12 | x13 | x14 | x15 |
So, for the addition in Vertical-First mode, the RT indices (and RA, as they are the same) are, in terms of x:

| 0 | 8  | 0 | 8  | 1 | 9  | 1 | 9  |
| 2 | 10 | 2 | 10 | 3 | 11 | 3 | 11 |
| 0 | 10 | 0 | 10 | 1 | 11 | 1 | 11 |
| 2 | 8  | 2 | 8  | 3 | 9  | 3 | 9  |

However, since the indices are small values, using a whole 64-bit register for a single index would be a waste, so we will compress them: 8 indices per 64-bit register. The RT indices will thus fit inside these 4 registers (in Little-Endian format):

SVSHAPE0: | 0x901090108000800 | 0xb030b030a020a02 | 0xb010b010a000a00 | 0x903090308020802 |
Similarly, we find the RB indices:

| 4 | 12 | 4 | 12 | 5 | 13 | 5 | 13 |
| 6 | 14 | 6 | 14 | 7 | 15 | 7 | 15 |
| 5 | 15 | 5 | 15 | 6 | 12 | 6 | 12 |
| 7 | 13 | 7 | 13 | 4 | 14 | 4 | 14 |

Using the same packing method, we find the final 4 registers with the RB indices:

SVSHAPE1: | 0xd050d050c040c04 | 0xf070f070e060e06 | 0xc060c060f050f05 | 0xe040e040d070d07 |
Now, we can construct the Vertical-First loop:

svindex 4, 0, 1, 3, 0, 1, 0  # SVSHAPE0, add RA/RT indices
svindex 6, 1, 1, 3, 0, 1, 0  # SVSHAPE1, add RB indices
setvl 0, 0, 32, 0, 1, 1      # MAXVL=VL=32
# set r22 from VL, set vertical-first
setvl 22, 0, 32, 1, 0, 1     # vertical-first mode
svremap 31, 1, 0, 0, 0, 0, 0 # RA=1, RB=0, RT=0 (0b01011)
sv.add/w=32 *x, *x, *x       # RT, RB will use SHAPE0, RA will use SHAPE1
svstep. 16, 1, 0             # step to next in-regs element
What this code snippet does is the following:

The first instruction

svindex 4, 0, 1, 3, 0, 1, 0

loads the add RT indices into SVSHAPE0, starting at GPR #8. You will note that 4 is listed, but that's because svindex only works on even-numbered registers, so in order to save a bit, the actual register number is double the one listed. Since the 32 8-bit indices occupy 4 64-bit registers, SVSHAPE0 will reside in GPRs 8-11. The value 3 indicates that the elements will be 8 bits long (0=64-bit, 1=32-bit, 2=16-bit, 3=8-bit).

The next instruction

svindex 6, 1, 1, 3, 0, 1, 0

loads the add RB indices into SVSHAPE1. Again, even though we list 6, the indices will actually be loaded starting from GPR #12, occupying GPRs 12-15; 8-bit elements are used here as well.
Next, the setvl instructions:

setvl 0, 0, 32, 0, 1, 1
setvl 22, 0, 32, 1, 0, 1

We have to call setvl twice: the first call sets MAXVL and VL to 32; the second stores VL to register 22 and also configures Vertical-First mode.
Afterwards, we have to declare the way we intend to use the indices, and we do this using svremap.

svremap 31, 1, 0, 0, 0, 0, 0

svremap basically instructs the scheduler to use SVSHAPE0 for RT and RB, and SVSHAPE1 for RA. Since addition is commutative, it makes no difference that the second-operand table we derived is applied to RA rather than RB: the destination RT and the operand that shares its indices both map to SVSHAPE0.
The next instruction performs the *actual* addition:

sv.add/w=32 *x, *x, *x

Note the /w=32 suffix. This instructs the adder to perform the operation on elements of w=32 bits. Since the Power CPU is a 64-bit CPU, this means that 2 32-bit elements are packed in each register.

Also note that all three operands use *x as the argument. This instructs the scheduler to act on the registers as a vector, i.e. a sequence of elements. Even though the three operands look identical, their element indices will be taken from the SVSHAPE0/SVSHAPE1 indices as defined previously. The indices are relative to the first register of the vector. So, if *x starts at GPR 24 for example, in essence this instruction will issue the following sequence of instructions:
add/w=32 24 + 0, 24 + 4, 24 + 0
add/w=32 24 + 8, 24 + 12, 24 + 8
add/w=32 24 + 0, 24 + 4, 24 + 0
add/w=32 24 + 8, 24 + 12, 24 + 8
add/w=32 24 + 1, 24 + 5, 24 + 1
add/w=32 24 + 9, 24 + 13, 24 + 9
add/w=32 24 + 1, 24 + 5, 24 + 1
add/w=32 24 + 9, 24 + 13, 24 + 9
Finally, the svstep. instruction steps to the next set of indices.

We have shown how to do the additions in Vertical-First mode. Now let's add the rest of the instructions of the QUARTERROUNDs. For the XOR instructions of both QUARTERROUND groups only, assuming that d = XOR(d, a):

x12 = x12 ^ x0; x4 = x4 ^ x8;   x12 = x12 ^ x0; x4 = x4 ^ x8;
x13 = x13 ^ x1; x5 = x5 ^ x9;   x13 = x13 ^ x1; x5 = x5 ^ x9;
x14 = x14 ^ x2; x6 = x6 ^ x10;  x14 = x14 ^ x2; x6 = x6 ^ x10;
x15 = x15 ^ x3; x7 = x7 ^ x11;  x15 = x15 ^ x3; x7 = x7 ^ x11;
x15 = x15 ^ x0; x5 = x5 ^ x10;  x15 = x15 ^ x0; x5 = x5 ^ x10;
x12 = x12 ^ x1; x6 = x6 ^ x11;  x12 = x12 ^ x1; x6 = x6 ^ x11;
x13 = x13 ^ x2; x7 = x7 ^ x8;   x13 = x13 ^ x2; x7 = x7 ^ x8;
x14 = x14 ^ x3; x4 = x4 ^ x9;   x14 = x14 ^ x3; x4 = x4 ^ x9;
We will need to create another set of indices for the XOR instructions. We will only need one set, as the other set of indices is the same as the RT indices for sv.add (SVSHAPE0). So, remembering that our XOR target indices are:

| 12 | 4 | 12 | 4 | 13 | 5 | 13 | 5 |
| 14 | 6 | 14 | 6 | 15 | 7 | 15 | 7 |
| 15 | 5 | 15 | 5 | 12 | 6 | 12 | 6 |
| 13 | 7 | 13 | 7 | 14 | 4 | 14 | 4 |

we can pack them in the same manner:

SVSHAPE2: | 0x50d050d040c040c | 0x70f070f060e060e | 0x60c060c050f050f | 0x40e040e070d070d |
The next operation is the ROTATE, which takes as operands the result of the XOR and a shift argument. You can easily see that the indices used in this case are the same as those of the XOR. However, the shift values cycle every 4 elements: 16, 12, 8, 7. For the indices we can again use svindex, like this:

svindex 8, 2, 1, 3, 0, 1, 0

Which again means SVSHAPE2, operating on 8-bit elements, starting from GPR #16 (8*2). For the shift values cycling every 4 elements, the svshape2 instruction will be used:

svshape2 0, 0, 3, 4, 0, 1

This will create an SVSHAPE3 which applies a modulo 4 to all of its element indices. Now we can list both the XOR and ROTATE instructions in assembly, together with their respective svremap instructions:
svremap 31, 2, 0, 2, 2, 0, 0 # RA=2, RB=0, RS=2 (0b00111)
sv.xor/w=32 *x, *x, *x
svremap 31, 0, 3, 2, 2, 0, 0 # RA=2, RB=3, RS=2 (0b01110)
sv.rldcl/w=32 *x, *x, *SHIFTS, 0

So, in a similar fashion, we instruct XOR (sv.xor) to use SVSHAPE2 for RA and RS and SVSHAPE0 for RB, again on 32-bit elements, while ROTATE (sv.rldcl) will also use SVSHAPE2 for RA and RS, but SVSHAPE3 for RB (the shift values, which cycle every 4 elements). Note that the shift values referenced through SVSHAPE3 have to be stored as 32-bit elements:

SHIFTS: | 0x0000000c00000010 | 0x0000000700000008 |
The complete algorithm for a loop with 10 iterations is as follows:

li 7, 10                       # Load value 10 into GPR #7
mtctr 7                        # Set up counter on GPR #7

# set up VL=32 vertical-first, and SVSHAPEs 0-2
setvl 0, 0, 32, 0, 1, 1        # MAXVL=VL=32
# set r22 from VL, set vertical-first
setvl 22, 0, 32, 1, 0, 1       # vertical-first mode
# SHAPE0, used by sv.add, starts at GPR #8
svindex 8/2, 0, 1, 3, 0, 1, 0  # SVSHAPE0, a
# SHAPE1, used by sv.xor, starts at GPR #12
svindex 12/2, 1, 1, 3, 0, 1, 0 # SVSHAPE1, b
# SHAPE2, used by sv.rldcl, starts at GPR #16
svindex 16/2, 2, 1, 3, 0, 1, 0 # SVSHAPE2, c
# SHAPE3, also used by sv.rldcl to hold the shift values, starts at GPR #20
# The inner loop will do 32 iterations, but there are only 4 shift values,
# so we apply modulo 4
svshape2 0, 0, 3, 4, 0, 1      # SVSHAPE3, shift amount, mod 4

.outer:
# outer loop begins here (standard CTR loop)
setvl 22, 22, 32, 1, 1, 0      # vertical-first, set VL from r22
.inner:
# inner loop begins here. add-xor-rotl32 with remap, step, branch
svremap 31, 1, 0, 0, 0, 0, 0   # RA=1, RB=0, RT=0 (0b01011)
sv.add/w=32 *x, *x, *x
svremap 31, 2, 0, 2, 2, 0, 0   # RA=2, RB=0, RS=2 (0b00111)
sv.xor/w=32 *x, *x, *x
svremap 31, 0, 3, 2, 2, 0, 0   # RA=2, RB=3, RS=2 (0b01110)
sv.rldcl/w=32 *x, *x, *SHIFTS, 0
# svstep. needs a destination register: GPR #7 is used here, since
# using GPR #16 would overlap SHAPE2 and corrupt its first 8 indices
svstep. 7, 1, 0                # step to next in-regs element
bc 6, 3, .inner                # loop to .inner while svstep. (Rc=1) indicates elements remain
# inner loop done: decrement CTR and loop back for the next round
bdnz .outer                    # Loop until CTR is zero