Main loop for xchacha_hchacha20:

    for (i = 0; i < 10; i++){
        QUARTERROUND(x0, x4, x8, x12);
        QUARTERROUND(x1, x5, x9, x13);
        QUARTERROUND(x2, x6, x10, x14);
        QUARTERROUND(x3, x7, x11, x15);
        QUARTERROUND(x0, x5, x10, x15);
        QUARTERROUND(x1, x6, x11, x12);
        QUARTERROUND(x2, x7, x8, x13);
        QUARTERROUND(x3, x4, x9, x14);
    }

    #define QUARTERROUND(a,b,c,d) \
        a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \
        c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \
        a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \
        c = PLUS(c,d); b = ROTATE(XOR(b,c), 7);

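The PLUS, XOR and ROTATE helpers are not shown above; as a point of
reference, a minimal C sketch of the usual ChaCha20 definitions (operating
on uint32_t values, with ROTATE being a left rotation by c bits) would be:

    #include <stdint.h>

    #define PLUS(a, b)   ((uint32_t)((a) + (b)))
    #define XOR(a, b)    ((uint32_t)((a) ^ (b)))
    /* rotate a 32-bit value left by c bits (0 < c < 32 in ChaCha20) */
    #define ROTATE(v, c) ((uint32_t)(((v) << (c)) | ((v) >> (32 - (c)))))
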
We see that the loop is split into two groups of QUARTERROUND calls,
one with step=4:

    QUARTERROUND(x0, x4, x8, x12);
    QUARTERROUND(x1, x5, x9, x13);
    QUARTERROUND(x2, x6, x10, x14);
    QUARTERROUND(x3, x7, x11, x15);

and another with step=5:

    QUARTERROUND(x0, x5, x10, x15);
    QUARTERROUND(x1, x6, x11, x12);
    QUARTERROUND(x2, x7, x8, x13);
    QUARTERROUND(x3, x4, x9, x14);

Let's start with the first group of QUARTERROUNDs. Unrolling it results
in the following instructions:

    x0 = x0 + x4; x12 = ROTATE(x12 ^ x0, 16);
    x8 = x8 + x12; x4 = ROTATE(x4 ^ x8, 12);
    x0 = x0 + x4; x12 = ROTATE(x12 ^ x0, 8);
    x8 = x8 + x12; x4 = ROTATE(x4 ^ x8, 7);
    x1 = x1 + x5; x13 = ROTATE(x13 ^ x1, 16);
    x9 = x9 + x13; x5 = ROTATE(x5 ^ x9, 12);
    x1 = x1 + x5; x13 = ROTATE(x13 ^ x1, 8);
    x9 = x9 + x13; x5 = ROTATE(x5 ^ x9, 7);
    x2 = x2 + x6; x14 = ROTATE(x14 ^ x2, 16);
    x10 = x10 + x14; x6 = ROTATE(x6 ^ x10, 12);
    x2 = x2 + x6; x14 = ROTATE(x14 ^ x2, 8);
    x10 = x10 + x14; x6 = ROTATE(x6 ^ x10, 7);
    x3 = x3 + x7; x15 = ROTATE(x15 ^ x3, 16);
    x11 = x11 + x15; x7 = ROTATE(x7 ^ x11, 12);
    x3 = x3 + x7; x15 = ROTATE(x15 ^ x3, 8);
    x11 = x11 + x15; x7 = ROTATE(x7 ^ x11, 7);

Second group of QUARTERROUNDs, unrolled:

    x0 = x0 + x5; x15 = ROTATE(x15 ^ x0, 16);
    x10 = x10 + x15; x5 = ROTATE(x5 ^ x10, 12);
    x0 = x0 + x5; x15 = ROTATE(x15 ^ x0, 8);
    x10 = x10 + x15; x5 = ROTATE(x5 ^ x10, 7);
    x1 = x1 + x6; x12 = ROTATE(x12 ^ x1, 16);
    x11 = x11 + x12; x6 = ROTATE(x6 ^ x11, 12);
    x1 = x1 + x6; x12 = ROTATE(x12 ^ x1, 8);
    x11 = x11 + x12; x6 = ROTATE(x6 ^ x11, 7);
    x2 = x2 + x7; x13 = ROTATE(x13 ^ x2, 16);
    x8 = x8 + x13; x7 = ROTATE(x7 ^ x8, 12);
    x2 = x2 + x7; x13 = ROTATE(x13 ^ x2, 8);
    x8 = x8 + x13; x7 = ROTATE(x7 ^ x8, 7);
    x3 = x3 + x4; x14 = ROTATE(x14 ^ x3, 16);
    x9 = x9 + x14; x4 = ROTATE(x4 ^ x9, 12);
    x3 = x3 + x4; x14 = ROTATE(x14 ^ x3, 8);
    x9 = x9 + x14; x4 = ROTATE(x4 ^ x9, 7);

Let's list the additions only:

    x0 = x0 + x4
    x8 = x8 + x12
    x0 = x0 + x4
    x8 = x8 + x12
    x1 = x1 + x5
    x9 = x9 + x13
    x1 = x1 + x5
    x9 = x9 + x13
    x2 = x2 + x6
    x10 = x10 + x14
    x2 = x2 + x6
    x10 = x10 + x14
    x3 = x3 + x7
    x11 = x11 + x15
    x3 = x3 + x7
    x11 = x11 + x15
    x0 = x0 + x5
    x10 = x10 + x15
    x0 = x0 + x5
    x10 = x10 + x15
    x1 = x1 + x6
    x11 = x11 + x12
    x1 = x1 + x6
    x11 = x11 + x12
    x2 = x2 + x7
    x8 = x8 + x13
    x2 = x2 + x7
    x8 = x8 + x13
    x3 = x3 + x4
    x9 = x9 + x14
    x3 = x3 + x4
    x9 = x9 + x14

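Written as an illustrative indexed loop (plain C, not part of the assembly;
a_idx and b_idx are hypothetical arrays holding the left-hand and right-hand
operand numbers from the list above), every addition has the same shape:

    /* x[] holds x0..x15; each of the 32 additions is x[a] += x[b] */
    for (int i = 0; i < 32; i++)
        x[a_idx[i]] = x[a_idx[i]] + x[b_idx[i]];
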
Since we're going to use Vertical-First mode (instructions are executed
first, one element at a time, and an explicit "svstep" moves on to the
next element), the additions will be executed one by one, and we need to
note the indices that are going to be used for each operation. Recall that
sv.add is the instruction that will be executed, in the form:

    sv.add RT, RA, RB # RT = RA + RB

Let's assume the values x0-x15 are stored in registers starting at GPR 24:

    GPR 24 | x0  | x1  | x2  | x3  |
    GPR 28 | x4  | x5  | x6  | x7  |
    GPR 32 | x8  | x9  | x10 | x11 |
    GPR 36 | x12 | x13 | x14 | x15 |

So for the addition in Vertical-First mode, the RT indices (and RA, as they
are the same) are, in terms of x:

    | 0 | 8  | 0 | 8  | 1 | 9  | 1 | 9  |
    | 2 | 10 | 2 | 10 | 3 | 11 | 3 | 11 |
    | 0 | 10 | 0 | 10 | 1 | 11 | 1 | 11 |
    | 2 | 8  | 2 | 8  | 3 | 9  | 3 | 9  |

However, since the indices are small values, using a whole 64-bit register
for a single index would be a waste, so we compress them: 8 indices per
64-bit register. The RT indices then fit inside these 4 registers (in
Little-Endian format):

    SVSHAPE0: | 0x901090108000800 | 0xb030b030a020a02 | 0xb010b010a000a00 | 0x903090308020802 |

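To see how the packing works, here is a small stand-alone C check (not part
of the repository) that packs the first row of RT indices into a 64-bit
Little-Endian value; it prints 0x0901090108000800, matching the first
SVSHAPE0 word above (shown there without the leading zero):

    #include <stdint.h>
    #include <stdio.h>
    #include <inttypes.h>

    /* pack 8 byte-wide indices, index 0 in the least significant byte */
    static uint64_t pack8(const uint8_t idx[8]) {
        uint64_t v = 0;
        for (int i = 7; i >= 0; i--)
            v = (v << 8) | idx[i];
        return v;
    }

    int main(void) {
        const uint8_t rt_row0[8] = { 0, 8, 0, 8, 1, 9, 1, 9 };
        printf("0x%016" PRIx64 "\n", pack8(rt_row0));
        return 0;
    }
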
Similarly we find the RB indices:

    | 4 | 12 | 4 | 12 | 5 | 13 | 5 | 13 |
    | 6 | 14 | 6 | 14 | 7 | 15 | 7 | 15 |
    | 5 | 15 | 5 | 15 | 6 | 12 | 6 | 12 |
    | 7 | 13 | 7 | 13 | 4 | 14 | 4 | 14 |

Using the same packing method, we find the final 4 registers with the RB
indices:

    SVSHAPE1: | 0xd050d050c040c04 | 0xf070f070e060e06 | 0xc060c060f050f05 | 0xe040e040d070d07 |

Now, we can construct the Vertical-First loop:

    svindex 4, 0, 1, 3, 0, 1, 0  # SVSHAPE0, add RA/RT indices
    svindex 6, 1, 1, 3, 0, 1, 0  # SVSHAPE1, add RB indices
    setvl 0, 0, 32, 1, 1, 1      # MAXVL=VL=32, VF=1
    svremap 31, 1, 0, 0, 0, 0, 0 # RA=1, RB=0, RT=0 (0b01011)
    sv.add/w=32 *x, *x, *x       # RT, RB: SHAPE0. RA: SHAPE1
    svstep. 16, 1, 0             # step to next in-regs element

What this code snippet does is the following:

The first instruction

    svindex 4, 0, 1, 3, 0, 1, 0

loads the add RT indices into SVSHAPE0, taking them from the GPRs starting
at register 8. You will note that 4 is listed, but that is because svindex
only works on even register numbers, so in order to save a bit the value is
halved: we have to double it to get the actual starting register. So, the
SVSHAPE0 indices are held in GPRs 8-11. The number 3 denotes that the index
elements are 8 bits long (0=64-bit, 1=32-bit, 2=16-bit, 3=8-bit).

The next instruction

    svindex 6, 1, 1, 3, 0, 1, 0

loads the add RB indices into SVSHAPE1. Again, even though we list 6, the
indices are actually read from the GPRs starting at #12 (6*2), and again
8-bit index elements are used.

Next, the setvl instruction:

    setvl 0, 0, 32, 1, 1, 1

We have to call setvl to set MAXVL and VL to 32 and also to configure
Vertical-First mode. Afterwards, we have to specify how we intend to use
the indices, and we do this using svremap.

    svremap 31, 1, 0, 0, 0, 0, 0

svremap basically instructs the scheduler to use SVSHAPE0 for RT and RB,
and SVSHAPE1 for RA. (Note that this is the mirror image of the derivation
above, where SVSHAPE1 held the RB indices; since addition is commutative,
swapping the RA and RB shapes gives the same result.) The next instruction
performs the *actual* addition:

    sv.add/w=32 *x, *x, *x

Note the /w=32 suffix. This instructs the adder to perform the operation
on elements of w=32 bits. Since the Power CPU is a 64-bit CPU, this means
that we have 2 32-bit elements packed in each register. Also, note that
all operands use *x as the argument. This instructs the scheduler to act
on the registers as a vector, i.e. a sequence of elements. Even though all
three operands name the same vector, their element indices will be taken
from the SVSHAPE0/SVSHAPE1 indices as defined previously. Also note that
the indices are relative to the first register used. So, if *x starts at
GPR 24 for example, in essence this instruction will issue the following
sequence of instructions:

    add/w=32 24 + 0, 24 + 4, 24 + 0
    add/w=32 24 + 8, 24 + 12, 24 + 8
    add/w=32 24 + 0, 24 + 4, 24 + 0
    add/w=32 24 + 8, 24 + 12, 24 + 8
    add/w=32 24 + 1, 24 + 5, 24 + 1
    add/w=32 24 + 9, 24 + 13, 24 + 9
    add/w=32 24 + 1, 24 + 5, 24 + 1
    add/w=32 24 + 9, 24 + 13, 24 + 9
    ...

Finally, the svstep. instruction steps to the next element, i.e. the next
set of indices.

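As a cross-check, the 32 remapped additions can be modelled in plain C
(an illustrative sketch only, not generated code), using the SVSHAPE0 and
SVSHAPE1 index values derived above; RT and RB take their element numbers
from SVSHAPE0, RA from SVSHAPE1:

    #include <stdint.h>

    static const uint8_t shape0_idx[32] = {   /* a-indices: RT and RB */
         0,  8,  0,  8,  1,  9,  1,  9,  2, 10,  2, 10,  3, 11,  3, 11,
         0, 10,  0, 10,  1, 11,  1, 11,  2,  8,  2,  8,  3,  9,  3,  9 };
    static const uint8_t shape1_idx[32] = {   /* b-indices: RA */
         4, 12,  4, 12,  5, 13,  5, 13,  6, 14,  6, 14,  7, 15,  7, 15,
         5, 15,  5, 15,  6, 12,  6, 12,  7, 13,  7, 13,  4, 14,  4, 14 };

    /* x[16] holds x0..x15; one loop iteration models one svstep. */
    static void add_pass(uint32_t x[16])
    {
        for (int i = 0; i < 32; i++)
            x[shape0_idx[i]] = x[shape1_idx[i]] + x[shape0_idx[i]];
    }
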
We have shown how to do the additions in Vertical-First mode. Now let's
add the rest of the instructions in the QUARTERROUNDs. For the XOR
instructions of both QUARTERROUND groups only, assuming that
d = XOR(d, a):

    x12 = x12 ^ x0
    x4 = x4 ^ x8
    x12 = x12 ^ x0
    x4 = x4 ^ x8
    x13 = x13 ^ x1
    x5 = x5 ^ x9
    x13 = x13 ^ x1
    x5 = x5 ^ x9
    x14 = x14 ^ x2
    x6 = x6 ^ x10
    x14 = x14 ^ x2
    x6 = x6 ^ x10
    x15 = x15 ^ x3
    x7 = x7 ^ x11
    x15 = x15 ^ x3
    x7 = x7 ^ x11
    x15 = x15 ^ x0
    x5 = x5 ^ x10
    x15 = x15 ^ x0
    x5 = x5 ^ x10
    x12 = x12 ^ x1
    x6 = x6 ^ x11
    x12 = x12 ^ x1
    x6 = x6 ^ x11
    x13 = x13 ^ x2
    x7 = x7 ^ x8
    x13 = x13 ^ x2
    x7 = x7 ^ x8
    x14 = x14 ^ x3
    x4 = x4 ^ x9
    x14 = x14 ^ x3
    x4 = x4 ^ x9

We will need to create another set of indices for the XOR instructions.
We only need one new set, as the other operand's indices are the same as
the RT indices of sv.add (SHAPE0). So, remembering that our XOR target
indices (used for both RA and RS), in terms of x, are:

    | 12 | 4 | 12 | 4 | 13 | 5 | 13 | 5 |
    | 14 | 6 | 14 | 6 | 15 | 7 | 15 | 7 |
    | 15 | 5 | 15 | 5 | 12 | 6 | 12 | 6 |
    | 13 | 7 | 13 | 7 | 14 | 4 | 14 | 4 |

Again, packing 8 indices per register, we find:

    SVSHAPE2: | 0x50d050d040c040c | 0x70f070f060e060e | 0x60c060c050f050f | 0x40e040e070d070d |

The next operation is the ROTATE, which takes as operands the result of
the XOR and a shift amount. You can easily see that the indices used in
this case are the same as for the XOR. However, the shift values cycle
every 4 elements: 16, 12, 8, 7. For the indices we can again use svindex,
like this:

    svindex 8, 2, 1, 3, 0, 1, 0

which again means SVSHAPE2, operating on 8-bit index elements, starting
from GPR #16 (8*2). For the shift values cycling every 4 elements,
the svshape2 instruction will be used:

    svshape2 0, 0, 3, 4, 0, 1

This will create SVSHAPE3, which applies a modulo-4 pattern to all of its
element indices. Now we can list both the XOR and ROTATE instructions in
assembly, together with the respective svremap instructions:

    svremap 31, 2, 0, 2, 2, 0, 0 # RA=2, RB=0, RS=2 (0b00111)
    sv.xor/w=32 *x, *x, *x
    svremap 31, 0, 3, 2, 2, 0, 0 # RA=2, RB=3, RS=2 (0b01110)
    sv.rldcl/w=32 *x, *x, *SHIFTS, 0

So, in a similar fashion, we instruct XOR (sv.xor) to use SVSHAPE2 for
RA and RS and SVSHAPE0 for RB, again for 32-bit elements, while ROTATE
(sv.rldcl) will also use SVSHAPE2 for RA and RS, but SVSHAPE3 for RB
(the shift values, which cycle every 4 elements). Note that the shift
values indexed through SVSHAPE3 have to be stored as 32-bit elements:

    SHIFTS: | 0x0000000c00000010 | 0x0000000700000008 |

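Since SVSHAPE3 selects 32-bit elements, the four rotate amounts 16, 12, 8
and 7 are packed two per 64-bit register, again Little-Endian (element 0
in the low half). A quick C check (not part of the repository) of the two
SHIFTS words above:

    #include <stdint.h>

    /* element 0 (the value 16) goes into the low 32 bits of word 0 */
    static const uint64_t shifts_word0 = ((uint64_t)12 << 32) | 16; /* 0x0000000c00000010 */
    static const uint64_t shifts_word1 = ((uint64_t) 7 << 32) |  8; /* 0x0000000700000008 */
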
The complete algorithm for a loop with 10 iterations is as follows:

    li 7, 10                       # Load value 10 into GPR #7
    mtctr 7                        # Set up counter on GPR #7

    # set up VL=32 vertical-first, and SVSHAPEs 0-2
    setvl 0, 0, 32, 1, 1, 1
    # SHAPE0, used by sv.add and sv.xor, starts at GPR #8
    svindex 8/2, 0, 1, 3, 0, 1, 0  # SVSHAPE0, a
    # SHAPE1, used by sv.add, starts at GPR #12
    svindex 12/2, 1, 1, 3, 0, 1, 0 # SVSHAPE1, b
    # SHAPE2, used by sv.xor and sv.rldcl, starts at GPR #16
    svindex 16/2, 2, 1, 3, 0, 1, 0 # SVSHAPE2, c
    # SHAPE3, used by sv.rldcl to index the shift values (SHIFTS),
    # which start at GPR #20.
    # The inner loop will do 32 iterations, but there are only
    # 4 shift values, so we mod by 4, and can cycle through them
    svshape2 0, 0, 3, 4, 0, 1      # SVSHAPE3, shift amount, mod4

    .outer:
    # outer loop begins here (standard CTR loop)
    setvl 0, 0, 32, 1, 1, 1        # MAXVL=VL=32, VF=1
    # inner loop begins here. add-xor-rotl32 with remap, step, branch
    .inner:
    svremap 31, 1, 0, 0, 0, 0, 0   # RA=1, RB=0, RT=0 (0b01011)
    sv.add/w=32 *x, *x, *x
    svremap 31, 2, 0, 2, 2, 0, 0   # RA=2, RB=0, RS=2 (0b00111)
    sv.xor/w=32 *x, *x, *x
    svremap 31, 0, 3, 2, 2, 0, 0   # RA=2, RB=3, RS=2 (0b01110)
    sv.rldcl/w=32 *x, *x, *SHIFTS, 0
    # GPR #7 (free after mtctr) receives the result of svstep.;
    # using GPR #16 here would overlap with SHAPE2's indices
    # (GPRs 16-19) and corrupt the first 8 of them.
    svstep. 7, 1, 0                # step to next in-regs element
    bc 6, 3, .inner                # svstep. Rc=1 loop-end-condition?
    # inner-loop done: outer loop standard CTR-decrement to setvl again
    bdnz .outer                    # Loop until CTR is zero
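
For reference, here is a compact C model (an illustrative sketch only, not
part of the repository) of what one outer-loop iteration computes: 32
vertical-first steps, each doing one remapped add, xor and 32-bit rotate,
using the index tables derived above. Calling double_round() ten times over
the state x0-x15 corresponds to the ten CTR iterations of the outer loop:

    #include <stdint.h>

    #define ROTL32(v, c) ((uint32_t)(((v) << (c)) | ((v) >> (32 - (c)))))

    /* element numbers packed into SVSHAPE0 (a), SVSHAPE1 (b), SVSHAPE2 (d) */
    static const uint8_t a_idx[32] = {
         0,  8,  0,  8,  1,  9,  1,  9,  2, 10,  2, 10,  3, 11,  3, 11,
         0, 10,  0, 10,  1, 11,  1, 11,  2,  8,  2,  8,  3,  9,  3,  9 };
    static const uint8_t b_idx[32] = {
         4, 12,  4, 12,  5, 13,  5, 13,  6, 14,  6, 14,  7, 15,  7, 15,
         5, 15,  5, 15,  6, 12,  6, 12,  7, 13,  7, 13,  4, 14,  4, 14 };
    static const uint8_t d_idx[32] = {
        12,  4, 12,  4, 13,  5, 13,  5, 14,  6, 14,  6, 15,  7, 15,  7,
        15,  5, 15,  5, 12,  6, 12,  6, 13,  7, 13,  7, 14,  4, 14,  4 };
    static const uint32_t shift_amounts[4] = { 16, 12, 8, 7 };  /* SHIFTS */

    /* one iteration of the .outer loop: both QUARTERROUND groups */
    static void double_round(uint32_t x[16])
    {
        for (int i = 0; i < 32; i++) {                     /* 32 svsteps */
            x[a_idx[i]] = x[a_idx[i]] + x[b_idx[i]];       /* sv.add     */
            x[d_idx[i]] = x[d_idx[i]] ^ x[a_idx[i]];       /* sv.xor     */
            x[d_idx[i]] = ROTL32(x[d_idx[i]],              /* sv.rldcl   */
                                 shift_amounts[i % 4]);
        }
    }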