Main loop for xchacha_hchacha20:

    for (i = 0; i < 10; i++){
        QUARTERROUND(x0, x4, x8, x12);
        QUARTERROUND(x1, x5, x9, x13);
        QUARTERROUND(x2, x6, x10, x14);
        QUARTERROUND(x3, x7, x11, x15);
        QUARTERROUND(x0, x5, x10, x15);
        QUARTERROUND(x1, x6, x11, x12);
        QUARTERROUND(x2, x7, x8, x13);
        QUARTERROUND(x3, x4, x9, x14);
    }

    #define QUARTERROUND(a,b,c,d) \
        a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \
        c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \
        a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \
        c = PLUS(c,d); b = ROTATE(XOR(b,c), 7);

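The PLUS, XOR and ROTATE helpers are not shown above; as a point of
reference, a minimal C sketch of the usual ChaCha20 definitions (operating
on uint32_t values, with ROTATE being a left rotation by c bits) would be:

    #include <stdint.h>

    #define PLUS(a, b)   ((uint32_t)((a) + (b)))
    #define XOR(a, b)    ((uint32_t)((a) ^ (b)))
    /* rotate a 32-bit value left by c bits (0 < c < 32 in ChaCha20) */
    #define ROTATE(v, c) ((uint32_t)(((v) << (c)) | ((v) >> (32 - (c)))))
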
We see that the loop is split into two groups of QUARTERROUND calls,
one with step=4:

    QUARTERROUND(x0, x4, x8, x12);
    QUARTERROUND(x1, x5, x9, x13);
    QUARTERROUND(x2, x6, x10, x14);
    QUARTERROUND(x3, x7, x11, x15);

and another with step=5:

    QUARTERROUND(x0, x5, x10, x15);
    QUARTERROUND(x1, x6, x11, x12);
    QUARTERROUND(x2, x7, x8, x13);
    QUARTERROUND(x3, x4, x9, x14);

Let's start with the first group of QUARTERROUNDs. Unrolling it results
in the following instructions:

    x0 = x0 + x4; x12 = ROTATE(x12 ^ x0, 16);
    x8 = x8 + x12; x4 = ROTATE(x4 ^ x8, 12);
    x0 = x0 + x4; x12 = ROTATE(x12 ^ x0, 8);
    x8 = x8 + x12; x4 = ROTATE(x4 ^ x8, 7);
    x1 = x1 + x5; x13 = ROTATE(x13 ^ x1, 16);
    x9 = x9 + x13; x5 = ROTATE(x5 ^ x9, 12);
    x1 = x1 + x5; x13 = ROTATE(x13 ^ x1, 8);
    x9 = x9 + x13; x5 = ROTATE(x5 ^ x9, 7);
    x2 = x2 + x6; x14 = ROTATE(x14 ^ x2, 16);
    x10 = x10 + x14; x6 = ROTATE(x6 ^ x10, 12);
    x2 = x2 + x6; x14 = ROTATE(x14 ^ x2, 8);
    x10 = x10 + x14; x6 = ROTATE(x6 ^ x10, 7);
    x3 = x3 + x7; x15 = ROTATE(x15 ^ x3, 16);
    x11 = x11 + x15; x7 = ROTATE(x7 ^ x11, 12);
    x3 = x3 + x7; x15 = ROTATE(x15 ^ x3, 8);
    x11 = x11 + x15; x7 = ROTATE(x7 ^ x11, 7);

Second group of QUARTERROUNDs, unrolled:

    x0 = x0 + x5; x15 = ROTATE(x15 ^ x0, 16);
    x10 = x10 + x15; x5 = ROTATE(x5 ^ x10, 12);
    x0 = x0 + x5; x15 = ROTATE(x15 ^ x0, 8);
    x10 = x10 + x15; x5 = ROTATE(x5 ^ x10, 7);
    x1 = x1 + x6; x12 = ROTATE(x12 ^ x1, 16);
    x11 = x11 + x12; x6 = ROTATE(x6 ^ x11, 12);
    x1 = x1 + x6; x12 = ROTATE(x12 ^ x1, 8);
    x11 = x11 + x12; x6 = ROTATE(x6 ^ x11, 7);
    x2 = x2 + x7; x13 = ROTATE(x13 ^ x2, 16);
    x8 = x8 + x13; x7 = ROTATE(x7 ^ x8, 12);
    x2 = x2 + x7; x13 = ROTATE(x13 ^ x2, 8);
    x8 = x8 + x13; x7 = ROTATE(x7 ^ x8, 7);
    x3 = x3 + x4; x14 = ROTATE(x14 ^ x3, 16);
    x9 = x9 + x14; x4 = ROTATE(x4 ^ x9, 12);
    x3 = x3 + x4; x14 = ROTATE(x14 ^ x3, 8);
    x9 = x9 + x14; x4 = ROTATE(x4 ^ x9, 7);

Let's list the additions only:

    x0 = x0 + x4
    x8 = x8 + x12
    x0 = x0 + x4
    x8 = x8 + x12
    x1 = x1 + x5
    x9 = x9 + x13
    x1 = x1 + x5
    x9 = x9 + x13
    x2 = x2 + x6
    x10 = x10 + x14
    x2 = x2 + x6
    x10 = x10 + x14
    x3 = x3 + x7
    x11 = x11 + x15
    x3 = x3 + x7
    x11 = x11 + x15
    x0 = x0 + x5
    x10 = x10 + x15
    x0 = x0 + x5
    x10 = x10 + x15
    x1 = x1 + x6
    x11 = x11 + x12
    x1 = x1 + x6
    x11 = x11 + x12
    x2 = x2 + x7
    x8 = x8 + x13
    x2 = x2 + x7
    x8 = x8 + x13
    x3 = x3 + x4
    x9 = x9 + x14
    x3 = x3 + x4
    x9 = x9 + x14

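Written as an illustrative indexed loop (plain C, not part of the assembly;
a_idx and b_idx are hypothetical arrays holding the left-hand and right-hand
operand numbers from the list above), every addition has the same shape:

    /* x[] holds x0..x15; each of the 32 additions is x[a] += x[b] */
    for (int i = 0; i < 32; i++)
        x[a_idx[i]] = x[a_idx[i]] + x[b_idx[i]];
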
Since we're going to use Vertical-First mode (instructions are executed
first, one element at a time, and an explicit "svstep" moves on to the
next element), the additions will be executed one by one, and we need to
note the indices that are going to be used for each operation. Recall that
sv.add is the instruction that will be executed, in the form:

    sv.add RT, RA, RB # RT = RA + RB

Let's assume the values x0-x15 are stored in registers starting at GPR 24:

    GPR 24 | x0  | x1  | x2  | x3  |
    GPR 28 | x4  | x5  | x6  | x7  |
    GPR 32 | x8  | x9  | x10 | x11 |
    GPR 36 | x12 | x13 | x14 | x15 |

So for the addition in Vertical-First mode, the RT indices (and RA, as they
are the same) are, in terms of x:

    | 0 | 8  | 0 | 8  | 1 | 9  | 1 | 9  |
    | 2 | 10 | 2 | 10 | 3 | 11 | 3 | 11 |
    | 0 | 10 | 0 | 10 | 1 | 11 | 1 | 11 |
    | 2 | 8  | 2 | 8  | 3 | 9  | 3 | 9  |

However, since the indices are small values, using a whole 64-bit register
for a single index would be a waste, so we compress them: 8 indices per
64-bit register. The RT indices then fit inside these 4 registers (in
Little-Endian format):

    SVSHAPE0: | 0x901090108000800 | 0xb030b030a020a02 | 0xb010b010a000a00 | 0x903090308020802 |

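To see how the packing works, here is a small stand-alone C check (not part
of the repository) that packs the first row of RT indices into a 64-bit
Little-Endian value; it prints 0x0901090108000800, matching the first
SVSHAPE0 word above (shown there without the leading zero):

    #include <stdint.h>
    #include <stdio.h>
    #include <inttypes.h>

    /* pack 8 byte-wide indices, index 0 in the least significant byte */
    static uint64_t pack8(const uint8_t idx[8]) {
        uint64_t v = 0;
        for (int i = 7; i >= 0; i--)
            v = (v << 8) | idx[i];
        return v;
    }

    int main(void) {
        const uint8_t rt_row0[8] = { 0, 8, 0, 8, 1, 9, 1, 9 };
        printf("0x%016" PRIx64 "\n", pack8(rt_row0));
        return 0;
    }
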
Similarly we find the RB indices:

    | 4 | 12 | 4 | 12 | 5 | 13 | 5 | 13 |
    | 6 | 14 | 6 | 14 | 7 | 15 | 7 | 15 |
    | 5 | 15 | 5 | 15 | 6 | 12 | 6 | 12 |
    | 7 | 13 | 7 | 13 | 4 | 14 | 4 | 14 |

Using the same packing method, we find the final 4 registers with the RB
indices:

    SVSHAPE1: | 0xd050d050c040c04 | 0xf070f070e060e06 | 0xc060c060f050f05 | 0xe040e040d070d07 |

Now, we can construct the Vertical-First loop:

    svindex 4, 0, 1, 3, 0, 1, 0  # SVSHAPE0, add RA/RT indices
    svindex 6, 1, 1, 3, 0, 1, 0  # SVSHAPE1, add RB indices
    setvl 0, 0, 32, 1, 1, 1      # MAXVL=VL=32, VF=1
    svremap 31, 1, 0, 0, 0, 0, 0 # RA=1, RB=0, RT=0 (0b01011)
    sv.add/w=32 *x, *x, *x       # RT, RB: SHAPE0. RA: SHAPE1
    svstep. 16, 1, 0             # step to next in-regs element

What this code snippet does is the following:

The first instruction

    svindex 4, 0, 1, 3, 0, 1, 0

loads the add RT indices into SVSHAPE0, taking them from the GPRs starting
at register 8. You will note that 4 is listed, but that is because svindex
only works on even register numbers, so in order to save a bit the value is
halved: we have to double it to get the actual starting register. So, the
SVSHAPE0 indices are held in GPRs 8-11. The number 3 denotes that the index
elements are 8 bits long (0=64-bit, 1=32-bit, 2=16-bit, 3=8-bit).

The next instruction

    svindex 6, 1, 1, 3, 0, 1, 0

loads the add RB indices into SVSHAPE1. Again, even though we list 6, the
indices are actually read from the GPRs starting at #12 (6*2), and again
8-bit index elements are used.

Next, the setvl instruction:

    setvl 0, 0, 32, 1, 1, 1

We have to call setvl to set MAXVL and VL to 32 and also to configure
Vertical-First mode. Afterwards, we have to specify how we intend to use
the indices, and we do this using svremap.

    svremap 31, 1, 0, 0, 0, 0, 0

svremap basically instructs the scheduler to use SVSHAPE0 for RT and RB,
and SVSHAPE1 for RA. (Note that this is the mirror image of the derivation
above, where SVSHAPE1 held the RB indices; since addition is commutative,
swapping the RA and RB shapes gives the same result.) The next instruction
performs the *actual* addition:

    sv.add/w=32 *x, *x, *x

Note the /w=32 suffix. This instructs the adder to perform the operation
on elements of w=32 bits. Since the Power CPU is a 64-bit CPU, this means
that we have 2 32-bit elements packed in each register. Also, note that
all operands use *x as the argument. This instructs the scheduler to act
on the registers as a vector, i.e. a sequence of elements. Even though all
three operands name the same vector, their element indices will be taken
from the SVSHAPE0/SVSHAPE1 indices as defined previously. Also note that
the indices are relative to the first register used. So, if *x starts at
GPR 24 for example, in essence this instruction will issue the following
sequence of instructions:

    add/w=32 24 + 0, 24 + 4, 24 + 0
    add/w=32 24 + 8, 24 + 12, 24 + 8
    add/w=32 24 + 0, 24 + 4, 24 + 0
    add/w=32 24 + 8, 24 + 12, 24 + 8
    add/w=32 24 + 1, 24 + 5, 24 + 1
    add/w=32 24 + 9, 24 + 13, 24 + 9
    add/w=32 24 + 1, 24 + 5, 24 + 1
    add/w=32 24 + 9, 24 + 13, 24 + 9
    ...

Finally, the svstep. instruction steps to the next element, i.e. the next
set of indices.

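As a cross-check, the 32 remapped additions can be modelled in plain C
(an illustrative sketch only, not generated code), using the SVSHAPE0 and
SVSHAPE1 index values derived above; RT and RB take their element numbers
from SVSHAPE0, RA from SVSHAPE1:

    #include <stdint.h>

    static const uint8_t shape0_idx[32] = {   /* a-indices: RT and RB */
         0,  8,  0,  8,  1,  9,  1,  9,  2, 10,  2, 10,  3, 11,  3, 11,
         0, 10,  0, 10,  1, 11,  1, 11,  2,  8,  2,  8,  3,  9,  3,  9 };
    static const uint8_t shape1_idx[32] = {   /* b-indices: RA */
         4, 12,  4, 12,  5, 13,  5, 13,  6, 14,  6, 14,  7, 15,  7, 15,
         5, 15,  5, 15,  6, 12,  6, 12,  7, 13,  7, 13,  4, 14,  4, 14 };

    /* x[16] holds x0..x15; one loop iteration models one svstep. */
    static void add_pass(uint32_t x[16])
    {
        for (int i = 0; i < 32; i++)
            x[shape0_idx[i]] = x[shape1_idx[i]] + x[shape0_idx[i]];
    }
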
We have shown how to do the additions in Vertical-First mode. Now let's
add the rest of the instructions in the QUARTERROUNDs. For the XOR
instructions of both QUARTERROUND groups only, assuming that
d = XOR(d, a):

    x12 = x12 ^ x0
    x4 = x4 ^ x8
    x12 = x12 ^ x0
    x4 = x4 ^ x8
    x13 = x13 ^ x1
    x5 = x5 ^ x9
    x13 = x13 ^ x1
    x5 = x5 ^ x9
    x14 = x14 ^ x2
    x6 = x6 ^ x10
    x14 = x14 ^ x2
    x6 = x6 ^ x10
    x15 = x15 ^ x3
    x7 = x7 ^ x11
    x15 = x15 ^ x3
    x7 = x7 ^ x11
    x15 = x15 ^ x0
    x5 = x5 ^ x10
    x15 = x15 ^ x0
    x5 = x5 ^ x10
    x12 = x12 ^ x1
    x6 = x6 ^ x11
    x12 = x12 ^ x1
    x6 = x6 ^ x11
    x13 = x13 ^ x2
    x7 = x7 ^ x8
    x13 = x13 ^ x2
    x7 = x7 ^ x8
    x14 = x14 ^ x3
    x4 = x4 ^ x9
    x14 = x14 ^ x3
    x4 = x4 ^ x9

We will need to create another set of indices for the XOR instructions.
We only need one new set, as the other operand's indices are the same as
the RT indices of sv.add (SHAPE0). So, remembering that our XOR target
indices (used for both RA and RS), in terms of x, are:

    | 12 | 4 | 12 | 4 | 13 | 5 | 13 | 5 |
    | 14 | 6 | 14 | 6 | 15 | 7 | 15 | 7 |
    | 15 | 5 | 15 | 5 | 12 | 6 | 12 | 6 |
    | 13 | 7 | 13 | 7 | 14 | 4 | 14 | 4 |

Again, packing 8 indices per register, we find:

    SVSHAPE2: | 0x50d050d040c040c | 0x70f070f060e060e | 0x60c060c050f050f | 0x40e040e070d070d |

The next operation is the ROTATE, which takes as operands the result of
the XOR and a shift amount. You can easily see that the indices used in
this case are the same as for the XOR. However, the shift values cycle
every 4 elements: 16, 12, 8, 7. For the indices we can again use svindex,
like this:

    svindex 8, 2, 1, 3, 0, 1, 0

which again means SVSHAPE2, operating on 8-bit index elements, starting
from GPR #16 (8*2). For the shift values cycling every 4 elements,
the svshape2 instruction will be used:

    svshape2 0, 0, 3, 4, 0, 1

This will create SVSHAPE3, which applies a modulo-4 pattern to all of its
element indices. Now we can list both the XOR and ROTATE instructions in
assembly, together with the respective svremap instructions:

    svremap 31, 2, 0, 2, 2, 0, 0 # RA=2, RB=0, RS=2 (0b00111)
    sv.xor/w=32 *x, *x, *x
    svremap 31, 0, 3, 2, 2, 0, 0 # RA=2, RB=3, RS=2 (0b01110)
    sv.rldcl/w=32 *x, *x, *SHIFTS, 0

So, in a similar fashion, we instruct XOR (sv.xor) to use SVSHAPE2 for
RA and RS and SVSHAPE0 for RB, again for 32-bit elements, while ROTATE
(sv.rldcl) will also use SVSHAPE2 for RA and RS, but SVSHAPE3 for RB
(the shift values, which cycle every 4 elements). Note that the shift
values indexed through SVSHAPE3 have to be stored as 32-bit elements:

    SHIFTS: | 0x0000000c00000010 | 0x0000000700000008 |

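Since SVSHAPE3 selects 32-bit elements, the four rotate amounts 16, 12, 8
and 7 are packed two per 64-bit register, again Little-Endian (element 0
in the low half). A quick C check (not part of the repository) of the two
SHIFTS words above:

    #include <stdint.h>

    /* element 0 (the value 16) goes into the low 32 bits of word 0 */
    static const uint64_t shifts_word0 = ((uint64_t)12 << 32) | 16; /* 0x0000000c00000010 */
    static const uint64_t shifts_word1 = ((uint64_t) 7 << 32) |  8; /* 0x0000000700000008 */
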
The complete algorithm for a loop with 10 iterations is as follows:

    li 7, 10                       # Load value 10 into GPR #7
    mtctr 7                        # Set up counter on GPR #7

    # set up VL=32 vertical-first, and SVSHAPEs 0-2
    setvl 0, 0, 32, 1, 1, 1
    # SHAPE0, used by sv.add and sv.xor, starts at GPR #8
    svindex 8/2, 0, 1, 3, 0, 1, 0  # SVSHAPE0, a
    # SHAPE1, used by sv.add, starts at GPR #12
    svindex 12/2, 1, 1, 3, 0, 1, 0 # SVSHAPE1, b
    # SHAPE2, used by sv.xor and sv.rldcl, starts at GPR #16
    svindex 16/2, 2, 1, 3, 0, 1, 0 # SVSHAPE2, c
    # SHAPE3, used by sv.rldcl to index the shift values (SHIFTS),
    # which start at GPR #20.
    # The inner loop will do 32 iterations, but there are only
    # 4 shift values, so we mod by 4, and can cycle through them
    svshape2 0, 0, 3, 4, 0, 1      # SVSHAPE3, shift amount, mod4

    .outer:
    # outer loop begins here (standard CTR loop)
    setvl 0, 0, 32, 1, 1, 1        # MAXVL=VL=32, VF=1
    # inner loop begins here. add-xor-rotl32 with remap, step, branch
    .inner:
    svremap 31, 1, 0, 0, 0, 0, 0   # RA=1, RB=0, RT=0 (0b01011)
    sv.add/w=32 *x, *x, *x
    svremap 31, 2, 0, 2, 2, 0, 0   # RA=2, RB=0, RS=2 (0b00111)
    sv.xor/w=32 *x, *x, *x
    svremap 31, 0, 3, 2, 2, 0, 0   # RA=2, RB=3, RS=2 (0b01110)
    sv.rldcl/w=32 *x, *x, *SHIFTS, 0
    # GPR #7 (free after mtctr) receives the result of svstep.;
    # using GPR #16 here would overlap with SHAPE2's indices
    # (GPRs 16-19) and corrupt the first 8 of them.
    svstep. 7, 1, 0                # step to next in-regs element
    bc 6, 3, .inner                # svstep. Rc=1 loop-end-condition?
    # inner-loop done: outer loop standard CTR-decrement to setvl again
    bdnz .outer                    # Loop until CTR is zero
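
For reference, here is a compact C model (an illustrative sketch only, not
part of the repository) of what one outer-loop iteration computes: 32
vertical-first steps, each doing one remapped add, xor and 32-bit rotate,
using the index tables derived above. Calling double_round() ten times over
the state x0-x15 corresponds to the ten CTR iterations of the outer loop:

    #include <stdint.h>

    #define ROTL32(v, c) ((uint32_t)(((v) << (c)) | ((v) >> (32 - (c)))))

    /* element numbers packed into SVSHAPE0 (a), SVSHAPE1 (b), SVSHAPE2 (d) */
    static const uint8_t a_idx[32] = {
         0,  8,  0,  8,  1,  9,  1,  9,  2, 10,  2, 10,  3, 11,  3, 11,
         0, 10,  0, 10,  1, 11,  1, 11,  2,  8,  2,  8,  3,  9,  3,  9 };
    static const uint8_t b_idx[32] = {
         4, 12,  4, 12,  5, 13,  5, 13,  6, 14,  6, 14,  7, 15,  7, 15,
         5, 15,  5, 15,  6, 12,  6, 12,  7, 13,  7, 13,  4, 14,  4, 14 };
    static const uint8_t d_idx[32] = {
        12,  4, 12,  4, 13,  5, 13,  5, 14,  6, 14,  6, 15,  7, 15,  7,
        15,  5, 15,  5, 12,  6, 12,  6, 13,  7, 13,  7, 14,  4, 14,  4 };
    static const uint32_t shift_amounts[4] = { 16, 12, 8, 7 };  /* SHIFTS */

    /* one iteration of the .outer loop: both QUARTERROUND groups */
    static void double_round(uint32_t x[16])
    {
        for (int i = 0; i < 32; i++) {                     /* 32 svsteps */
            x[a_idx[i]] = x[a_idx[i]] + x[b_idx[i]];       /* sv.add     */
            x[d_idx[i]] = x[d_idx[i]] ^ x[a_idx[i]];       /* sv.xor     */
            x[d_idx[i]] = ROTL32(x[d_idx[i]],              /* sv.rldcl   */
                                 shift_amounts[i % 4]);
        }
    }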