Main loop for xchacha_hchacha20:

for (i = 0; i < 10; i++) {
    QUARTERROUND(x0, x4, x8, x12);
    QUARTERROUND(x1, x5, x9, x13);
    QUARTERROUND(x2, x6, x10, x14);
    QUARTERROUND(x3, x7, x11, x15);
    QUARTERROUND(x0, x5, x10, x15);
    QUARTERROUND(x1, x6, x11, x12);
    QUARTERROUND(x2, x7, x8, x13);
    QUARTERROUND(x3, x4, x9, x14);
}

#define QUARTERROUND(a,b,c,d) \
    a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \
    c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \
    a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \
    c = PLUS(c,d); b = ROTATE(XOR(b,c), 7);

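For completeness, the PLUS, XOR and ROTATE helpers used by QUARTERROUND are not defined in this file; a minimal C sketch, assuming the usual 32-bit reference-implementation definitions (rotate-left), would be:

#include <stdint.h>

/* assumed helper macros: 32-bit add, xor and rotate-left,
   in the style of the reference implementation */
#define PLUS(a, b)   ((uint32_t)((a) + (b)))
#define XOR(a, b)    ((uint32_t)((a) ^ (b)))
#define ROTATE(v, n) ((uint32_t)(((v) << (n)) | ((v) >> (32 - (n)))))
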
We see that the loop is split into two groups of QUARTERROUND calls, one with step=4:

QUARTERROUND(x0, x4, x8, x12);
QUARTERROUND(x1, x5, x9, x13);
QUARTERROUND(x2, x6, x10, x14);
QUARTERROUND(x3, x7, x11, x15);

and another with step=5:

QUARTERROUND(x0, x5, x10, x15);
QUARTERROUND(x1, x6, x11, x12);
QUARTERROUND(x2, x7, x8, x13);
QUARTERROUND(x3, x4, x9, x14);

Let's start with the first group of QUARTERROUNDs. Unrolling it essentially results in the following operations:

x0 = x0 + x4; x12 = ROTATE(x12 ^ x0, 16);
x8 = x8 + x12; x4 = ROTATE(x4 ^ x8, 12);
x0 = x0 + x4; x12 = ROTATE(x12 ^ x0, 8);
x8 = x8 + x12; x4 = ROTATE(x4 ^ x8, 7);
x1 = x1 + x5; x13 = ROTATE(x13 ^ x1, 16);
x9 = x9 + x13; x5 = ROTATE(x5 ^ x9, 12);
x1 = x1 + x5; x13 = ROTATE(x13 ^ x1, 8);
x9 = x9 + x13; x5 = ROTATE(x5 ^ x9, 7);
x2 = x2 + x6; x14 = ROTATE(x14 ^ x2, 16);
x10 = x10 + x14; x6 = ROTATE(x6 ^ x10, 12);
x2 = x2 + x6; x14 = ROTATE(x14 ^ x2, 8);
x10 = x10 + x14; x6 = ROTATE(x6 ^ x10, 7);
x3 = x3 + x7; x15 = ROTATE(x15 ^ x3, 16);
x11 = x11 + x15; x7 = ROTATE(x7 ^ x11, 12);
x3 = x3 + x7; x15 = ROTATE(x15 ^ x3, 8);
x11 = x11 + x15; x7 = ROTATE(x7 ^ x11, 7);

Second group of QUARTERROUNDs, unrolled:
x0 = x0 + x5; x15 = ROTATE(x15 ^ x0, 16);
x10 = x10 + x15; x5 = ROTATE(x5 ^ x10, 12);
x0 = x0 + x5; x15 = ROTATE(x15 ^ x0, 8);
x10 = x10 + x15; x5 = ROTATE(x5 ^ x10, 7);
x1 = x1 + x6; x12 = ROTATE(x12 ^ x1, 16);
x11 = x11 + x12; x6 = ROTATE(x6 ^ x11, 12);
x1 = x1 + x6; x12 = ROTATE(x12 ^ x1, 8);
x11 = x11 + x12; x6 = ROTATE(x6 ^ x11, 7);
x2 = x2 + x7; x13 = ROTATE(x13 ^ x2, 16);
x8 = x8 + x13; x7 = ROTATE(x7 ^ x8, 12);
x2 = x2 + x7; x13 = ROTATE(x13 ^ x2, 8);
x8 = x8 + x13; x7 = ROTATE(x7 ^ x8, 7);
x3 = x3 + x4; x14 = ROTATE(x14 ^ x3, 16);
x9 = x9 + x14; x4 = ROTATE(x4 ^ x9, 12);
x3 = x3 + x4; x14 = ROTATE(x14 ^ x3, 8);
x9 = x9 + x14; x4 = ROTATE(x4 ^ x9, 7);

Let's list the additions only:

x0 = x0 + x4
x8 = x8 + x12
x0 = x0 + x4
x8 = x8 + x12
x1 = x1 + x5
x9 = x9 + x13
x1 = x1 + x5
x9 = x9 + x13
x2 = x2 + x6
x10 = x10 + x14
x2 = x2 + x6
x10 = x10 + x14
x3 = x3 + x7
x11 = x11 + x15
x3 = x3 + x7
x11 = x11 + x15
x0 = x0 + x5
x10 = x10 + x15
x0 = x0 + x5
x10 = x10 + x15
x1 = x1 + x6
x11 = x11 + x12
x1 = x1 + x6
x11 = x11 + x12
x2 = x2 + x7
x8 = x8 + x13
x2 = x2 + x7
x8 = x8 + x13
x3 = x3 + x4
x9 = x9 + x14
x3 = x3 + x4
x9 = x9 + x14

Since we're going to use Vertical-First mode, the additions will be executed one by one, so we need to note the indices that will be used for each operation.
Recall that sv.add is the instruction that will be executed, in the form:

sv.add RT, RA, RB # RT = RA + RB

Let's assume the values x reside in registers 24-36:

GPR 24 | x0  | x1  | x2  | x3  |
GPR 28 | x4  | x5  | x6  | x7  |
GPR 32 | x8  | x9  | x10 | x11 |
GPR 36 | x12 | x13 | x14 | x15 |

So for the addition in Vertical-First mode, RT (and RA as they are the same) indices are (in terms of x):

| 0 | 8  | 0 | 8  | 1 | 9  | 1 | 9  |
| 2 | 10 | 2 | 10 | 3 | 11 | 3 | 11 |
| 0 | 10 | 0 | 10 | 1 | 11 | 1 | 11 |
| 2 | 8  | 2 | 8  | 3 | 9  | 3 | 9  |

However, since the indices are small values, using a whole 64-bit register for a single index value would be a waste, so we will compress them: 8 indices in each 64-bit register.
So, the RT indices will fit inside these 4 registers (in Little-Endian format):

SVSHAPE0: | 0x901090108000800 | 0xb030b030a020a02 | 0xb010b010a000a00 | 0x903090308020802 |

Similarly we find the RB indices:

| 4 | 12 | 4 | 12 | 5 | 13 | 5 | 13 |
| 6 | 14 | 6 | 14 | 7 | 15 | 7 | 15 |
| 5 | 15 | 5 | 15 | 6 | 12 | 6 | 12 |
| 7 | 13 | 7 | 13 | 4 | 14 | 4 | 14 |

Using a similar method, we find the final 4 registers with the RB indices:

SVSHAPE1: | 0xd050d050c040c04 | 0xf070f070e060e06 | 0xc060c060f050f05 | 0xe040e040d070d07 |

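As a cross-check, the SVSHAPE0/SVSHAPE1 constants above can be reproduced by packing each group of 8 indices into a 64-bit value in Little-Endian byte order. A small C sketch (the index tables are copied verbatim from the tables above):

#include <stdio.h>
#include <stdint.h>

/* RT/RA indices (SVSHAPE0) and RB indices (SVSHAPE1), copied from above */
static const uint8_t rt_idx[32] = {
    0,  8, 0,  8, 1,  9, 1,  9,
    2, 10, 2, 10, 3, 11, 3, 11,
    0, 10, 0, 10, 1, 11, 1, 11,
    2,  8, 2,  8, 3,  9, 3,  9,
};
static const uint8_t rb_idx[32] = {
    4, 12, 4, 12, 5, 13, 5, 13,
    6, 14, 6, 14, 7, 15, 7, 15,
    5, 15, 5, 15, 6, 12, 6, 12,
    7, 13, 7, 13, 4, 14, 4, 14,
};

/* pack 8 consecutive 8-bit indices into one 64-bit value, Little-Endian */
static uint64_t pack8(const uint8_t *idx)
{
    uint64_t v = 0;
    for (int i = 0; i < 8; i++)
        v |= (uint64_t)idx[i] << (8 * i);
    return v;
}

int main(void)
{
    for (int r = 0; r < 4; r++)
        printf("SVSHAPE0[%d] = 0x%016llx  SVSHAPE1[%d] = 0x%016llx\n",
               r, (unsigned long long)pack8(rt_idx + 8 * r),
               r, (unsigned long long)pack8(rb_idx + 8 * r));
    return 0;
}
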
Now, we can construct the Vertical-First loop:

svindex 4, 0, 1, 3, 0, 1, 0     # SVSHAPE0, add RA/RT indices
svindex 6, 1, 1, 3, 0, 1, 0     # SVSHAPE1, add RB indices
setvl 0, 0, 32, 0, 1, 1         # MAXVL=VL=32
# set r22 from VL, set vertical-first
setvl 22, 0, 32, 1, 0, 1        # vertical-first mode
svremap 31, 1, 0, 0, 0, 0, 0    # RA=1, RB=0, RT=0 (0b01011)
sv.add/w=32 *x, *x, *x          # RT, RB will use SHAPE0, RA will use SHAPE1
svstep. 16, 1, 0                # step to next in-regs element

What this code snippet does is the following:

The first instruction

svindex 4, 0, 1, 3, 0, 1, 0

loads the add RT indices into SVSHAPE0, starting at register 8. You will note that 4 is listed, but that is because svindex only works on even register numbers, so to save a bit in the encoding the operand has to be doubled to get the actual register. So, SVSHAPE0 will occupy GPRs 8-11. The number 3 indicates that the elements will be 8 bits long (0=64-bit, 1=32-bit, 2=16-bit, 3=8-bit).

The next svindex instruction

svindex 6, 1, 1, 3, 0, 1, 0

loads the add RB indices into SVSHAPE1. Again, even though 6 is listed, the indices will actually be loaded starting at GPR #12 (6*2), and again 8-bit elements are used.
Next, the setvl instructions:

setvl 0, 0, 32, 0, 1, 1
setvl 22, 0, 32, 1, 0, 1

We have to call setvl twice: the first one sets MAXVL and VL to 32; the second stores VL to register 22 and also enables Vertical-First mode.
Afterwards, we have to specify how we intend to use the indices, and we do this using svremap.

svremap 31, 1, 0, 0, 0, 0, 0

svremap basically instructs the scheduler to use SVSHAPE0 for RT and RB, SVSHAPE1 for RA.
The next instruction performs the *actual* addition:

sv.add/w=32 *x, *x, *x

Note the /w=32 suffix. This instructs the adder to perform the operation on elements of w=32 bits. Since the Power CPU is a 64-bit CPU, this means that two 32-bit elements are packed in each register. Also note that all the operands use *x as the argument. This instructs the scheduler to act on the registers as a vector, that is, as a sequence of elements. Even though the operands look identical, their indices will be taken from the SVSHAPE0/SVSHAPE1 indices as defined previously. Note also that the indices are relative to the actual register used. So, if *x starts at GPR 24 for example, in essence this instruction will issue the following sequence of instructions:

add/w=32 24 + 0, 24 + 4, 24 + 0
add/w=32 24 + 8, 24 + 12, 24 + 8
add/w=32 24 + 0, 24 + 4, 24 + 0
add/w=32 24 + 8, 24 + 12, 24 + 8
add/w=32 24 + 1, 24 + 5, 24 + 1
add/w=32 24 + 9, 24 + 13, 24 + 9
add/w=32 24 + 1, 24 + 5, 24 + 1
add/w=32 24 + 9, 24 + 13, 24 + 9
...

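To make the remapping concrete, the expansion above can be regenerated with a short C sketch from the two index tables (RT and RB take their indices from SVSHAPE0, RA from SVSHAPE1, and the vector *x is assumed to start at GPR 24):

#include <stdio.h>
#include <stdint.h>

/* SVSHAPE0 (RT/RB) and SVSHAPE1 (RA) index tables from above */
static const uint8_t shape0[32] = {
    0, 8, 0, 8, 1, 9, 1, 9, 2, 10, 2, 10, 3, 11, 3, 11,
    0, 10, 0, 10, 1, 11, 1, 11, 2, 8, 2, 8, 3, 9, 3, 9,
};
static const uint8_t shape1[32] = {
    4, 12, 4, 12, 5, 13, 5, 13, 6, 14, 6, 14, 7, 15, 7, 15,
    5, 15, 5, 15, 6, 12, 6, 12, 7, 13, 7, 13, 4, 14, 4, 14,
};

int main(void)
{
    const int base = 24;            /* *x assumed to start at GPR 24 */
    for (int i = 0; i < 32; i++)    /* VL = 32 element steps */
        printf("add/w=32 %d + %d, %d + %d, %d + %d\n",
               base, shape0[i], base, shape1[i], base, shape0[i]);
    return 0;
}
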
Finally, the svstep. instruction steps to the next set of indices.

We have shown how to do the additions in Vertical-First mode. Now let's add the rest of the instructions in the QUARTERROUNDs.
For the XOR instructions of both QUARTERROUND groups only, written in the form d = XOR(d, a):

x12 = x12 ^ x0
x4 = x4 ^ x8
x12 = x12 ^ x0
x4 = x4 ^ x8
x13 = x13 ^ x1
x5 = x5 ^ x9
x13 = x13 ^ x1
x5 = x5 ^ x9
x14 = x14 ^ x2
x6 = x6 ^ x10
x14 = x14 ^ x2
x6 = x6 ^ x10
x15 = x15 ^ x3
x7 = x7 ^ x11
x15 = x15 ^ x3
x7 = x7 ^ x11
x15 = x15 ^ x0
x5 = x5 ^ x10
x15 = x15 ^ x0
x5 = x5 ^ x10
x12 = x12 ^ x1
x6 = x6 ^ x11
x12 = x12 ^ x1
x6 = x6 ^ x11
x13 = x13 ^ x2
x7 = x7 ^ x8
x13 = x13 ^ x2
x7 = x7 ^ x8
x14 = x14 ^ x3
x4 = x4 ^ x9
x14 = x14 ^ x3
x4 = x4 ^ x9

We will need to create another set of indices for the XOR instructions. We only need one new set, as the other set of indices is the same as the RT indices of sv.add (SHAPE0). So, remembering our register layout, the RA/RS indices for the XORs (in terms of x) are:

| 12 | 4 | 12 | 4 | 13 | 5 | 13 | 5 |
| 14 | 6 | 14 | 6 | 15 | 7 | 15 | 7 |
| 15 | 5 | 15 | 5 | 12 | 6 | 12 | 6 |
| 13 | 7 | 13 | 7 | 14 | 4 | 14 | 4 |

Again, we find:

SVSHAPE2: | 0x50d050d040c040c | 0x70f070f060e060e | 0x60c060c050f050f | 0x40e040e070d070d |

The next operation is the ROTATE, which takes as operands the result of the XOR and a shift amount. You can easily see that the indices used in this case are the same as for the XOR. However, the shift values cycle every 4: 16, 12, 8, 7. For the indices we can again use svindex, like this:

svindex 8, 2, 1, 3, 0, 1, 0

This again means SVSHAPE2, operating on 8-bit elements, starting from GPR #16 (8*2). For the shift values cycling every 4 elements, the svshape2 instruction will be used:

svshape2 0, 0, 3, 4, 0, 1

This will create an SVSHAPE3, which will use a modulo 4 for all of its elements. Now we can list both XOR and ROTATE instructions in assembly, together with the respective svremap instructions:

svremap 31, 2, 0, 2, 2, 0, 0    # RA=2, RB=0, RS=2 (0b00111)
sv.xor/w=32 *x, *x, *x
svremap 31, 0, 3, 2, 2, 0, 0    # RA=2, RB=3, RS=2 (0b01110)
sv.rldcl/w=32 *x, *x, *SHIFTS, 0

So, in a similar fashion, we instruct XOR (sv.xor) to use SVSHAPE2 for RA and RS and SVSHAPE0 for RB, again for 32-bit elements, while ROTATE (sv.rldcl) will also use SVSHAPE2 for RA and RS, but SVSHAPE3 for RB (the shift values, which cycle every 4 elements). Note that the actual indices for SVSHAPE3 will have to be in 32-bit elements:

SHIFTS: | 0x0000000c00000010 | 0x0000000700000008 |

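The SHIFTS constants can be checked the same way: the four shift amounts 16, 12, 8, 7 are packed as 32-bit Little-Endian elements, two per 64-bit register, and SVSHAPE3 simply selects element i mod 4. A small C sketch:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* shift amounts cycle every 4 elements: 16, 12, 8, 7 */
    static const uint32_t shifts[4] = { 16, 12, 8, 7 };
    uint64_t packed[2] = { 0, 0 };

    /* pack as 32-bit Little-Endian elements, two per 64-bit register */
    for (int i = 0; i < 4; i++)
        packed[i / 2] |= (uint64_t)shifts[i] << (32 * (i % 2));

    printf("SHIFTS: 0x%016llx 0x%016llx\n",
           (unsigned long long)packed[0], (unsigned long long)packed[1]);

    /* SVSHAPE3 (mod 4) selects shifts[i % 4] for each element step */
    for (int i = 0; i < 8; i++)
        printf("element %d -> rotate by %u\n", i, shifts[i % 4]);
    return 0;
}
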
The complete algorithm for a loop with 10 iterations is as follows:

li 7, 10                        # Load value 10 into GPR #7
mtctr 7                         # Set up counter on GPR #7

# set up VL=32 vertical-first, and SVSHAPEs 0-2
# set VL/MAXVL first
setvl 0, 0, 32, 0, 1, 1         # MAXVL=VL=32
# set r22 from VL, set vertical-first
setvl 22, 0, 32, 1, 0, 1        # vertical-first mode
# SHAPE0, used by sv.add starts at GPR #8
svindex 8/2, 0, 1, 3, 0, 1, 0   # SVSHAPE0, a
# SHAPE1, used by sv.add, starts at GPR #12
svindex 12/2, 1, 1, 3, 0, 1, 0  # SVSHAPE1, b
# SHAPE2, used by sv.rldcl starts at GPR #16
svindex 16/2, 2, 1, 3, 0, 1, 0  # SVSHAPE2, c
# SHAPE3, used also by sv.rldcl to hold the shift values starts at GPR #20
# The inner loop will do 32 iterations, but there are only 4 shift values, so we mod 4
svshape2 0, 0, 3, 4, 0, 1       # SVSHAPE3, shift amount, mod 4

.outer:
# outer loop begins here (standard CTR loop)
setvl 22, 22, 32, 1, 1, 0       # vertical-first, set VL from r22
# inner loop begins here. add-xor-rotl32 with remap, step, branch
.inner:
svremap 31, 1, 0, 0, 0, 0, 0    # RA=1, RB=0, RT=0 (0b01011)
sv.add/w=32 *x, *x, *x
svremap 31, 2, 0, 2, 2, 0, 0    # RA=2, RB=0, RS=2 (0b00111)
sv.xor/w=32 *x, *x, *x
svremap 31, 0, 3, 2, 2, 0, 0    # RA=2, RB=3, RS=2 (0b01110)
sv.rldcl/w=32 *x, *x, *SHIFTS, 0
# GPR 7 is used as the destination for the result of svstep.;
# GPR 16 would overlap with SHAPE2 (which also starts at 16) and the
# first 8 indices would get corrupted.
svstep. 7, 1, 0                 # step to next in-regs element
bc 6, 3, .inner                 # svstep. Rc=1 loop-end-condition?
# inner-loop done: outer loop standard CTR-decrement to setvl again
bdnz .outer                     # Loop until CTR is zero
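
For verification, a plain C reference model of the same 10-iteration double-round loop (a sketch with a hypothetical helper name, covering only the rounds shown at the top of this file, not the full xchacha_hchacha20 construction) can be run against the SVP64 result:

#include <stdio.h>
#include <stdint.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

#define QR(a, b, c, d)                      \
    do {                                    \
        a += b; d = ROTL32(d ^ a, 16);      \
        c += d; b = ROTL32(b ^ c, 12);      \
        a += b; d = ROTL32(d ^ a,  8);      \
        c += d; b = ROTL32(b ^ c,  7);      \
    } while (0)

/* hypothetical helper: the 10-iteration double-round loop from the top
   of this file, operating in place on a 16-word state */
static void chacha_rounds_ref(uint32_t x[16])
{
    for (int i = 0; i < 10; i++) {
        QR(x[0], x[4], x[ 8], x[12]);
        QR(x[1], x[5], x[ 9], x[13]);
        QR(x[2], x[6], x[10], x[14]);
        QR(x[3], x[7], x[11], x[15]);
        QR(x[0], x[5], x[10], x[15]);
        QR(x[1], x[6], x[11], x[12]);
        QR(x[2], x[7], x[ 8], x[13]);
        QR(x[3], x[4], x[ 9], x[14]);
    }
}

int main(void)
{
    uint32_t x[16];
    for (int i = 0; i < 16; i++)
        x[i] = (uint32_t)i;     /* arbitrary test state */
    chacha_rounds_ref(x);
    for (int i = 0; i < 16; i++)
        printf("x%-2d = 0x%08x\n", i, x[i]);
    return 0;
}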