From 14a3722e345878d5b41d592c87e526ee1b622ac5 Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Wed, 26 Apr 2023 14:31:49 +0000
Subject: [PATCH] WIP: initial commit of chacha20 SVP64 page

---
 openpower/sv/cookbook/chacha20.mdwn | 338 ++++++++++++++++++++++++++++
 1 file changed, 338 insertions(+)
 create mode 100644 openpower/sv/cookbook/chacha20.mdwn

diff --git a/openpower/sv/cookbook/chacha20.mdwn b/openpower/sv/cookbook/chacha20.mdwn
new file mode 100644
index 000000000..74c7d9627
--- /dev/null
+++ b/openpower/sv/cookbook/chacha20.mdwn
@@ -0,0 +1,338 @@
+[[!tag svp64_cookbook]]
+
+# ChaCha20 SVP64 Implementation Analysis
+
+Main loop for xchacha_hchacha20:
+
+    for (i = 0; i < 10; i++){
+        QUARTERROUND(x0, x4, x8, x12);
+        QUARTERROUND(x1, x5, x9, x13);
+        QUARTERROUND(x2, x6, x10, x14);
+        QUARTERROUND(x3, x7, x11, x15);
+        QUARTERROUND(x0, x5, x10, x15);
+        QUARTERROUND(x1, x6, x11, x12);
+        QUARTERROUND(x2, x7, x8, x13);
+        QUARTERROUND(x3, x4, x9, x14);
+    }
+
+    #define QUARTERROUND(a,b,c,d) \
+        a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \
+        c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \
+        a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \
+        c = PLUS(c,d); b = ROTATE(XOR(b,c), 7);
+
+We see that the loop is split into two groups of QUARTERROUND calls,
+one with step=4:
+
+    QUARTERROUND(x0, x4, x8, x12);
+    QUARTERROUND(x1, x5, x9, x13);
+    QUARTERROUND(x2, x6, x10, x14);
+    QUARTERROUND(x3, x7, x11, x15);
+
+and another with step=5:
+
+    QUARTERROUND(x0, x5, x10, x15);
+    QUARTERROUND(x1, x6, x11, x12);
+    QUARTERROUND(x2, x7, x8, x13);
+    QUARTERROUND(x3, x4, x9, x14);
+
+Let's start with the first group of QUARTERROUNDs. Unrolling it
+essentially results in the following instructions:
+
+    x0 = x0 + x4; x12 = ROTATE(x12 ^ x0, 16);
+    x8 = x8 + x12; x4 = ROTATE(x4 ^ x8, 12);
+    x0 = x0 + x4; x12 = ROTATE(x12 ^ x0, 8);
+    x8 = x8 + x12; x4 = ROTATE(x4 ^ x8, 7);
+    x1 = x1 + x5; x13 = ROTATE(x13 ^ x1, 16);
+    x9 = x9 + x13; x5 = ROTATE(x5 ^ x9, 12);
+    x1 = x1 + x5; x13 = ROTATE(x13 ^ x1, 8);
+    x9 = x9 + x13; x5 = ROTATE(x5 ^ x9, 7);
+    x2 = x2 + x6; x14 = ROTATE(x14 ^ x2, 16);
+    x10 = x10 + x14; x6 = ROTATE(x6 ^ x10, 12);
+    x2 = x2 + x6; x14 = ROTATE(x14 ^ x2, 8);
+    x10 = x10 + x14; x6 = ROTATE(x6 ^ x10, 7);
+    x3 = x3 + x7; x15 = ROTATE(x15 ^ x3, 16);
+    x11 = x11 + x15; x7 = ROTATE(x7 ^ x11, 12);
+    x3 = x3 + x7; x15 = ROTATE(x15 ^ x3, 8);
+    x11 = x11 + x15; x7 = ROTATE(x7 ^ x11, 7);
+
+The second group of QUARTERROUNDs, unrolled:
+
+    x0 = x0 + x5; x15 = ROTATE(x15 ^ x0, 16);
+    x10 = x10 + x15; x5 = ROTATE(x5 ^ x10, 12);
+    x0 = x0 + x5; x15 = ROTATE(x15 ^ x0, 8);
+    x10 = x10 + x15; x5 = ROTATE(x5 ^ x10, 7);
+    x1 = x1 + x6; x12 = ROTATE(x12 ^ x1, 16);
+    x11 = x11 + x12; x6 = ROTATE(x6 ^ x11, 12);
+    x1 = x1 + x6; x12 = ROTATE(x12 ^ x1, 8);
+    x11 = x11 + x12; x6 = ROTATE(x6 ^ x11, 7);
+    x2 = x2 + x7; x13 = ROTATE(x13 ^ x2, 16);
+    x8 = x8 + x13; x7 = ROTATE(x7 ^ x8, 12);
+    x2 = x2 + x7; x13 = ROTATE(x13 ^ x2, 8);
+    x8 = x8 + x13; x7 = ROTATE(x7 ^ x8, 7);
+    x3 = x3 + x4; x14 = ROTATE(x14 ^ x3, 16);
+    x9 = x9 + x14; x4 = ROTATE(x4 ^ x9, 12);
+    x3 = x3 + x4; x14 = ROTATE(x14 ^ x3, 8);
+    x9 = x9 + x14; x4 = ROTATE(x4 ^ x9, 7);
+
+Let's list the additions only:
+
+    x0 = x0 + x4
+    x8 = x8 + x12
+    x0 = x0 + x4
+    x8 = x8 + x12
+    x1 = x1 + x5
+    x9 = x9 + x13
+    x1 = x1 + x5
+    x9 = x9 + x13
+    x2 = x2 + x6
+    x10 = x10 + x14
+    x2 = x2 + x6
+    x10 = x10 + x14
+    x3 = x3 + x7
+    x11 = x11 + x15
+    x3 = x3 + x7
+    x11 = x11 + x15
+    x0 = x0 + x5
+    x10 = x10 + x15
+    x0 = x0 + x5
+    x10 = x10 + x15
+    x1 = x1 + x6
+    x11 = x11 + x12
+    x1 = x1 + x6
+    x11 = x11 + x12
+    x2 = x2 + x7
+    x8 = x8 + x13
+    x2 = x2 + x7
+    x8 = x8 + x13
+    x3 = x3 + x4
+    x9 = x9 + x14
+    x3 = x3 + x4
+    x9 = x9 + x14
+
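+As a cross-check (not part of the original page), these per-operation
+element indices can be derived mechanically from the QUARTERROUND call
+pattern. A minimal C helper, using the x0-x15 numbering above:
+
+    #include <stdio.h>
+
+    /* the two groups of QUARTERROUND(a,b,c,d) calls, in program order */
+    static const int qr[8][4] = {
+        {0, 4,  8, 12}, {1, 5,  9, 13}, {2, 6, 10, 14}, {3, 7, 11, 15},
+        {0, 5, 10, 15}, {1, 6, 11, 12}, {2, 7,  8, 13}, {3, 4,  9, 14},
+    };
+
+    int main(void) {
+        /* each QUARTERROUND contains four adds: a+=b, c+=d, a+=b, c+=d */
+        printf("RT/RA indices:");
+        for (int i = 0; i < 8; i++)
+            printf(" %d %d %d %d", qr[i][0], qr[i][2], qr[i][0], qr[i][2]);
+        printf("\nRB indices:   ");
+        for (int i = 0; i < 8; i++)
+            printf(" %d %d %d %d", qr[i][1], qr[i][3], qr[i][1], qr[i][3]);
+        printf("\n");
+        return 0;
+    }
+
+Its output lists, in order, the a (RT/RA) and b (RB) element indices
+that are used below.
+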
+Since we're going to use Vertical-First mode (instructions are executed
+first, and an explicit "svstep" moves on to the next element), the
+additions will be executed one by one and we need to note the indices
+that are going to be used for each operation. Recall that sv.add is
+the instruction that will be executed, in the form:
+
+    sv.add RT, RA, RB # RT = RA + RB
+
+Let's assume the x values are loaded in GPRs starting at 24:
+
+    GPR 24 | x0  | x1  | x2  | x3  |
+    GPR 28 | x4  | x5  | x6  | x7  |
+    GPR 32 | x8  | x9  | x10 | x11 |
+    GPR 36 | x12 | x13 | x14 | x15 |
+
+So for the addition in Vertical-First mode, the RT (and RA, as they are
+the same) indices are (in terms of x):
+
+    | 0 | 8 | 0 | 8 | 1 | 9 | 1 | 9 |
+    | 2 | 10 | 2 | 10 | 3 | 11 | 3 | 11 |
+    | 0 | 10 | 0 | 10 | 1 | 11 | 1 | 11 |
+    | 2 | 8 | 2 | 8 | 3 | 9 | 3 | 9 |
+
+However, since the indices are small values, using a whole 64-bit
+register for a single index value would be a waste, so we compress
+them, 8 indices to a 64-bit register. The RT indices therefore fit
+inside these 4 registers (in Little-Endian format):
+
+    SVSHAPE0: | 0x901090108000800 | 0xb030b030a020a02 | 0xb010b010a000a00 | 0x903090308020802 |
+
+Similarly we find the RB indices:
+
+    | 4 | 12 | 4 | 12 | 5 | 13 | 5 | 13 |
+    | 6 | 14 | 6 | 14 | 7 | 15 | 7 | 15 |
+    | 5 | 15 | 5 | 15 | 6 | 12 | 6 | 12 |
+    | 7 | 13 | 7 | 13 | 4 | 14 | 4 | 14 |
+
+Using the same method, we find the final 4 registers with the RB indices:
+
+    SVSHAPE1: | 0xd050d050c040c04 | 0xf070f070e060e06 | 0xc060c060f050f05 | 0xe040e040d070d07 |
+
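+The packing can be sanity-checked with a few lines of C (not part of
+the original page): pack eight 8-bit indices per 64-bit word, least
+significant byte first, and print the resulting constants:
+
+    #include <stdio.h>
+    #include <stdint.h>
+
+    /* pack eight 8-bit indices into one 64-bit word, LSB first */
+    static uint64_t pack8(const int *idx) {
+        uint64_t v = 0;
+        for (int i = 0; i < 8; i++)
+            v |= (uint64_t)idx[i] << (8 * i);
+        return v;
+    }
+
+    int main(void) {
+        const int rt[32] = { 0, 8, 0, 8, 1, 9, 1, 9,   2, 10, 2, 10, 3, 11, 3, 11,
+                             0, 10, 0, 10, 1, 11, 1, 11,   2, 8, 2, 8, 3, 9, 3, 9 };
+        const int rb[32] = { 4, 12, 4, 12, 5, 13, 5, 13,   6, 14, 6, 14, 7, 15, 7, 15,
+                             5, 15, 5, 15, 6, 12, 6, 12,   7, 13, 7, 13, 4, 14, 4, 14 };
+        for (int i = 0; i < 4; i++)
+            printf("SVSHAPE0[%d] = 0x%016llx\n", i, (unsigned long long)pack8(&rt[8 * i]));
+        for (int i = 0; i < 4; i++)
+            printf("SVSHAPE1[%d] = 0x%016llx\n", i, (unsigned long long)pack8(&rb[8 * i]));
+        return 0;
+    }
+
+The printed values match the SVSHAPE0 and SVSHAPE1 constants shown
+above.
+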
+Now, we can construct the Vertical-First loop:
+
+    svindex 4, 0, 1, 3, 0, 1, 0  # SVSHAPE0, add RA/RT indices
+    svindex 6, 1, 1, 3, 0, 1, 0  # SVSHAPE1, add RB indices
+    setvl 0, 0, 32, 1, 1, 1      # MAXVL=VL=32, VF=1
+    svremap 31, 1, 0, 0, 0, 0, 0 # RA=1, RB=0, RT=0 (0b01011)
+    sv.add/w=32 *x, *x, *x       # RT, RB: SHAPE0. RA: SHAPE1
+    svstep. 16, 1, 0             # step to next in-regs element
+
+What this code snippet does is the following:
+
+The first instruction
+
+    svindex 4, 0, 1, 3, 0, 1, 0
+
+sets up SVSHAPE0 with the add RT indices, held starting at register 8.
+You will note that 4 is listed, but that's because svindex addresses
+even-numbered registers only; to save a bit in the encoding the register
+number is halved, so we double the listed value to get the actual
+register. So, SVSHAPE0's indices will be held in GPRs 8-11 (32 8-bit
+indices in 4 GPRs). The number 3 indicates that the elements are 8 bits
+long: 0=64-bit, 1=32-bit, 2=16-bit, 3=8-bit.
+
+The next instruction
+
+    svindex 6, 1, 1, 3, 0, 1, 0
+
+loads the add RB indices into SVSHAPE1. Again, even though we list 6,
+the actual indices are held starting at GPR #12 (6*2), and again the
+use of 8-bit elements is denoted.
+
+Next, the setvl instruction:
+
+    setvl 0, 0, 32, 1, 1, 1
+
+We have to call setvl to set MAXVL and VL to 32 and also to configure
+Vertical-First mode. Afterwards, we have to specify the way we intend
+to use the indices, and we do this using svremap:
+
+    svremap 31, 1, 0, 0, 0, 0, 0
+
+svremap basically instructs the scheduler to use SVSHAPE0 for RT and RB,
+and SVSHAPE1 for RA. The next instruction performs the *actual* addition:
+
+    sv.add/w=32 *x, *x, *x
+
+Note the /w=32 suffix. This instructs the adder to perform the operation
+on elements of w=32 bits. Since the Power CPU is a 64-bit CPU, this means
+that two 32-bit elements are held in each register. Also, note that in
+all parameters we use *x as the argument. This instructs the scheduler
+to act on the registers as a vector, i.e. a sequence of elements. But
+even though all three arguments are the same, their indices will be
+taken from the SVSHAPE0/SVSHAPE1 indices as defined previously. Also
+note that the indices are relative to the actual register used. So,
+if *x starts at GPR 24 for example, in essence this instruction will
+issue the following sequence of instructions:
+
+    add/w=32 24 + 0, 24 + 4, 24 + 0
+    add/w=32 24 + 8, 24 + 12, 24 + 8
+    add/w=32 24 + 0, 24 + 4, 24 + 0
+    add/w=32 24 + 8, 24 + 12, 24 + 8
+    add/w=32 24 + 1, 24 + 5, 24 + 1
+    add/w=32 24 + 9, 24 + 13, 24 + 9
+    add/w=32 24 + 1, 24 + 5, 24 + 1
+    add/w=32 24 + 9, 24 + 13, 24 + 9
+    ...
+
+Finally, the svstep. instruction steps to the next set of indices.
+
+We have shown how to do the additions in Vertical-First mode. Now
+let's add the rest of the instructions in the QUARTERROUNDs. The XOR
+instructions of both QUARTERROUND groups (d = XOR(d, a) and
+b = XOR(b, c)), listed on their own, are:
+
+    x12 = x12 ^ x0
+    x4 = x4 ^ x8
+    x12 = x12 ^ x0
+    x4 = x4 ^ x8
+    x13 = x13 ^ x1
+    x5 = x5 ^ x9
+    x13 = x13 ^ x1
+    x5 = x5 ^ x9
+    x14 = x14 ^ x2
+    x6 = x6 ^ x10
+    x14 = x14 ^ x2
+    x6 = x6 ^ x10
+    x15 = x15 ^ x3
+    x7 = x7 ^ x11
+    x15 = x15 ^ x3
+    x7 = x7 ^ x11
+    x15 = x15 ^ x0
+    x5 = x5 ^ x10
+    x15 = x15 ^ x0
+    x5 = x5 ^ x10
+    x12 = x12 ^ x1
+    x6 = x6 ^ x11
+    x12 = x12 ^ x1
+    x6 = x6 ^ x11
+    x13 = x13 ^ x2
+    x7 = x7 ^ x8
+    x13 = x13 ^ x2
+    x7 = x7 ^ x8
+    x14 = x14 ^ x3
+    x4 = x4 ^ x9
+    x14 = x14 ^ x3
+    x4 = x4 ^ x9
+
+We will need to create another set of indices for the XOR instructions.
+We only need one new set, as the other operand's indices are the same
+as the RT indices for sv.add (SHAPE0). So, the indices needed for the
+XOR destinations are:
+
+    | 12 | 4 | 12 | 4 | 13 | 5 | 13 | 5 |
+    | 14 | 6 | 14 | 6 | 15 | 7 | 15 | 7 |
+    | 15 | 5 | 15 | 5 | 12 | 6 | 12 | 6 |
+    | 13 | 7 | 13 | 7 | 14 | 4 | 14 | 4 |
+
+Again, we find:
+
+    SVSHAPE2: | 0x50d050d040c040c | 0x70f070f060e060e | 0x60c060c050f050f | 0x40e040e070d070d |
+
+The next operation is the ROTATE, which takes as operands the result of
+the XOR and a shift amount. You can easily see that the indices used in
+this case are the same as for the XOR. However, the shift values cycle
+every 4: 16, 12, 8, 7. For the indices we can again use svindex, like
+this:
+
+    svindex 8, 2, 1, 3, 0, 1, 0
+
+This sets up SVSHAPE2, again with 8-bit elements, starting from
+GPR #16 (8*2). For the shift values cycling every 4 elements, the
+svshape2 instruction will be used:
+
+    svshape2 0, 0, 3, 4, 0, 1
+
+This will create SVSHAPE3, which uses a modulo-4 index for all of its
+elements. Now we can list both the XOR and ROTATE instructions in
+assembly, together with the respective svremap instructions:
+
+    svremap 31, 2, 0, 2, 2, 0, 0 # RA=2, RB=0, RS=2 (0b00111)
+    sv.xor/w=32 *x, *x, *x
+    svremap 31, 0, 3, 2, 2, 0, 0 # RA=2, RB=3, RS=2 (0b01110)
+    sv.rldcl/w=32 *x, *x, *SHIFTS, 0
+
+So, in a similar fashion, we instruct the XOR (sv.xor) to use SVSHAPE2
+for RA and RS and SVSHAPE0 for RB, again on 32-bit elements, while the
+ROTATE (sv.rldcl) will also use SVSHAPE2 for RA and RS, but SVSHAPE3
+for RB (the shift values, which cycle every 4 elements). Note that the
+actual indices for SVSHAPE3 have to be in 32-bit elements:
+
+    SHIFTS: | 0x0000000c00000010 | 0x0000000700000008 |
+
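+To see how the four SVSHAPEs cooperate, here is a scalar C model (not
+part of the original page) of what the 32 vertical-first steps compute.
+The arrays A, B and D (names chosen here for illustration) hold the
+SHAPE0, SHAPE1 and SHAPE2 index tables, and the SH table plays the role
+of SVSHAPE3's modulo-4 cycle:
+
+    #include <stdio.h>
+    #include <stdint.h>
+
+    static uint32_t rotl32(uint32_t v, int n) {
+        return (v << n) | (v >> (32 - n));
+    }
+
+    /* SHAPE0 (a), SHAPE1 (b) and SHAPE2 (d) element indices, as derived above */
+    static const int A[32] = { 0, 8, 0, 8, 1, 9, 1, 9,   2, 10, 2, 10, 3, 11, 3, 11,
+                               0, 10, 0, 10, 1, 11, 1, 11,   2, 8, 2, 8, 3, 9, 3, 9 };
+    static const int B[32] = { 4, 12, 4, 12, 5, 13, 5, 13,   6, 14, 6, 14, 7, 15, 7, 15,
+                               5, 15, 5, 15, 6, 12, 6, 12,   7, 13, 7, 13, 4, 14, 4, 14 };
+    static const int D[32] = { 12, 4, 12, 4, 13, 5, 13, 5,   14, 6, 14, 6, 15, 7, 15, 7,
+                               15, 5, 15, 5, 12, 6, 12, 6,   13, 7, 13, 7, 14, 4, 14, 4 };
+    static const int SH[4] = { 16, 12, 8, 7 };   /* shift values, cycling every 4 */
+
+    /* one pass over the 32 steps: equivalent to the 8 QUARTERROUND calls */
+    static void double_round(uint32_t x[16]) {
+        for (int i = 0; i < 32; i++) {
+            x[A[i]] = x[A[i]] + x[B[i]];            /* sv.add   */
+            x[D[i]] = x[D[i]] ^ x[A[i]];            /* sv.xor   */
+            x[D[i]] = rotl32(x[D[i]], SH[i % 4]);   /* sv.rldcl */
+        }
+    }
+
+    int main(void) {
+        uint32_t x[16];
+        for (int i = 0; i < 16; i++)
+            x[i] = i * 0x01010101u;                 /* arbitrary test state */
+        for (int r = 0; r < 10; r++)                /* ten double rounds */
+            double_round(x);
+        for (int i = 0; i < 16; i++)
+            printf("x%-2d = 0x%08x\n", i, (unsigned)x[i]);
+        return 0;
+    }
+
+Ten passes over the 32 steps correspond to the i < 10 loop of
+xchacha_hchacha20 shown at the top of the page, so the x0-x15 state
+produced by the SVP64 loop below can be checked against this model.
+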
+The complete algorithm for a loop with 10 iterations is as follows:
+
+    li 7, 10                       # Load value 10 into GPR #7
+    mtctr 7                        # Set up counter from GPR #7
+
+    # set up VL=32 vertical-first, and SVSHAPEs 0-2
+    setvl 0, 0, 32, 1, 1, 1
+    # SHAPE0, used by sv.add and sv.xor, starts at GPR #8
+    svindex 8/2, 0, 1, 3, 0, 1, 0  # SVSHAPE0, a
+    # SHAPE1, used by sv.add (RB), starts at GPR #12
+    svindex 12/2, 1, 1, 3, 0, 1, 0 # SVSHAPE1, b
+    # SHAPE2, used by sv.xor and sv.rldcl, starts at GPR #16
+    svindex 16/2, 2, 1, 3, 0, 1, 0 # SVSHAPE2, c
+    # SHAPE3, also used by sv.rldcl for the shift values (SHIFTS, at GPR #20)
+    # The inner loop will do 32 iterations, but there are only
+    # 4 shift values, so we mod by 4, and can cycle through them
+    svshape2 0, 0, 3, 4, 0, 1      # SVSHAPE3, shift amount, mod4
+
+.outer:
+    # outer loop begins here (standard CTR loop)
+    setvl 0, 0, 32, 1, 1, 1        # MAXVL=VL=32, VF=1
+    # inner loop begins here. add-xor-rotl32 with remap, step, branch
+.inner:
+    svremap 31, 1, 0, 0, 0, 0, 0   # RA=1, RB=0, RT=0 (0b01011)
+    sv.add/w=32 *x, *x, *x
+    svremap 31, 2, 0, 2, 2, 0, 0   # RA=2, RB=0, RS=2 (0b00111)
+    sv.xor/w=32 *x, *x, *x
+    svremap 31, 0, 3, 2, 2, 0, 0   # RA=2, RB=3, RS=2 (0b01110)
+    sv.rldcl/w=32 *x, *x, *SHIFTS, 0
+    # note: GPR #16 is not used as the svstep. destination here, as it
+    # overlaps with SHAPE2 (also at GPR #16) and the first 8 indices
+    # would get corrupted
+    svstep. 7, 1, 0                # step to next in-regs element
+    bc 6, 3, .inner                # svstep. Rc=1 loop-end condition
+    # inner-loop done: outer loop standard CTR-decrement to setvl again
+    bdnz .outer                    # Loop until CTR is zero
-- 
2.30.2