1 """Implementation of chacha20 core in SVP64
2 Copyright (C) 2022,2023 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
3 Licensed under the LGPLv3+
4 Funded by NLnet NGI-ASSURE under EU grant agreement No 957073.
5 * https://nlnet.nl/project/LibreSOC-GigabitRouter/
6 * https://bugs.libre-soc.org/show_bug.cgi?id=965
7 * https://libre-soc.org/openpower/sv/cookbook/chacha20/
11 from copy
import deepcopy
13 from nmutil
.formaltest
import FHDLTestCase
14 from openpower
.decoder
.isa
.caller
import SVP64State
, set_masked_reg
15 from openpower
.decoder
.isa
.test_caller
import run_tst
16 from openpower
.decoder
.selectable_int
import SelectableInt
17 from openpower
.simulator
.program
import Program
18 from openpower
.insndb
.asm
import SVP64Asm
21 # originally from https://github.com/pts/chacha20
# the function is turned into a "schedule" of the
23 # operations to be applied, where the add32 xor32 rotl32
24 # are actually carried out by the sthth_round
25 # higher-order-function. this "split-out" (code-morph)
26 # of the original code by pts@fazekas.hu allows us to
27 # share the "schedule" between the pure-python chacha20
28 # and the SVP64 implementation. the schedule is static:
29 # it can be printed out and loaded as "magic constants"
30 # into registers. more details:
31 # https://libre-soc.org/openpower/sv/cookbook/chacha20/
def quarter_round_schedule(x, a, b, c, d):
    """Collate the list of reg-offsets for use with svindex/svremap.

    Appends to x one (src1, src2, dst, rot) tuple per add32/xor32/rotl32
    step of a chacha20 quarter-round.  The tuples can either be replayed
    by sthth_round (pure-python) or printed out and loaded as "magic
    constants" into registers for the SVP64 implementation.
    """
    # each entry below encodes one add/xor/rotl triple:
    #   x[dst1] = (x[dst1] + x[src]) & 0xffffffff   - add32
    #   x[dst2] = x[dst2] ^ x[dst1]                 - xor32
    #   x[dst2] = rotate(x[dst2], rot)              - rotl32
    # rotate amounts 16, 12, 8, 7 are the chacha20 standard constants
    x.extend([(a, b, d, 16),
              (c, d, b, 12),
              (a, b, d, 8),
              (c, d, b, 7)])
58 res
= ((v
<< c
) & 0xffffffff) | v
>> (32 - c
)
59 print("op rotl32", hex(res
), hex(v
), hex(c
))
64 res
= (a
+ b
) & 0xffffffff
65 print("op add32", hex(res
), hex(a
), hex(b
))
71 print("op xor32", hex(res
), hex(a
), hex(b
))
75 # originally in pts's code there were 4 of these, explicitly loop-unrolled.
76 # the common constants were extracted (a,b,c,d,rot) and this is what is left
def sthth_round(x, a, b, d, rot):
    """Apply one add32/xor32/rotl32 step of a chacha20 quarter-round.

    Mutates x in place: x[a] becomes (x[a] + x[b]) mod 2^32, then x[d]
    is xored with the new x[a] and rotated left by rot bits.  add32,
    xor32 and rotl32 also print an operation trace as a side effect.
    """
    summed = add32(x[a], x[b])
    x[a] = summed
    mixed = xor32(x[d], summed)
    x[d] = rotl32(mixed, rot)
82 # pts's version of quarter_round has the add/xor/rot explicitly
83 # loop-unrolled four times. instead we call the 16th-round function
84 # with the appropriate offsets/rot-magic-constants.
def quarter_round(x, a, b, c, d):
    """Perform one full chacha20 quarter-round on x, in place.

    Invokes sthth_round four times with the standard chacha20 index
    permutations and rotate amounts (16, 12, 8, 7) — the same static
    schedule that quarter_round_schedule collates, but executed.
    """
    sthth_round(x, a, b, d, 16)
    sthth_round(x, c, d, b, 12)
    sthth_round(x, a, b, d, 8)
    sthth_round(x, c, d, b, 7)
94 # again in pts's version, this is what was originally
95 # the loop around quarter_round. we can either pass in
96 # a function that simply collates the indices *or*
97 # actually do the same job as pts's original code,
98 # just by passing in a different fn.
99 def chacha_idx_schedule(x
, fn
=quarter_round_schedule
):
110 class SVSTATETestCase(FHDLTestCase
):
112 def _check_regs(self
, sim
, expected
):
116 self
.assertEqual(sim
.gpr(i
), SelectableInt(expected
[i
], 64),
117 "GPR %d %x expected %x" % \
118 (i
, sim
.gpr(i
).value
, expected
[i
]))
120 def test_1_sv_chacha20_main_rounds(self
):
121 """chacha20 main rounds
123 RA, RB, RS and RT are set up via Indexing to perform the *individual*
124 add/xor/rotl32 operations (with elwidth=32)
126 the inner loop uses "svstep." which detects src/dst-step reaching
127 the end of the loop, setting CR0.eq=1. no need for an additional
128 counter-register-with-a-decrement. this has the side-effect of
129 freeing up CTR for use as a straight decrement-counter.
131 both loops are 100% deterministic meaning that there should be
132 *ZERO* branch-prediction misses, obviating a need for loop-unrolling.
135 nrounds
= 2 # should be 10 for full algorithm
137 block
= 24 # register for block of 16
138 vl
= 22 # copy of VL placed in here
142 shifts
= 20 # registers for 4 32-bit shift amounts
143 ctr
= 7 # register for CTR
146 # set up VL=32 vertical-first, and SVSHAPEs 0-2
147 # vertical-first, set MAXVL (and r17)
148 'setvl 0, 0, 32, 1, 1, 1', # vertical-first, set VL
149 'svindex %d, 0, 1, 3, 0, 1, 0' % (SHAPE0
//2), # SVSHAPE0, a
150 'svindex %d, 1, 1, 3, 0, 1, 0' % (SHAPE1
//2), # SVSHAPE1, b
151 'svindex %d, 2, 1, 3, 0, 1, 0' % (SHAPE2
//2), # SVSHAPE2, c
152 'svshape2 0, 0, 3, 4, 0, 1', # SVSHAPE3, shift amount, mod 4
153 # establish CTR for outer round count
154 'addi %d, 0, %d' % (ctr
, nrounds
), # set number of rounds
155 'mtspr 9, %d' % ctr
, # set CTR to number of rounds
156 # outer loop begins here (standard CTR loop)
157 'setvl 0, 0, 32, 1, 1, 1', # vertical-first, set VL
158 # inner loop begins here. add-xor-rotl32 with remap, step, branch
159 'svremap 31, 1, 0, 0, 0, 0, 0', # RA=1, RB=0, RT=0 (0b01011)
160 'sv.add/w=32 *%d, *%d, *%d' % (block
, block
, block
),
161 'svremap 31, 2, 0, 2, 2, 0, 0', # RA=2, RB=0, RS=2 (0b00111)
162 'sv.xor/w=32 *%d, *%d, *%d' % (block
, block
, block
),
163 'svremap 31, 0, 3, 2, 2, 0, 0', # RA=2, RB=3, RS=2 (0b01110)
164 'sv.rldcl/w=32 *%d, *%d, *%d, 0' % (block
, block
, shifts
),
165 'svstep. %d, 0, 1, 0' % ctr
, # step to next in-regs element
166 'bc 6, 3, -0x28', # svstep. Rc=1 loop-end-condition?
167 # inner-loop done: outer loop standard CTR-decrement to setvl again
171 print("listing", lst
)
174 chacha_idx_schedule(schedule
, fn
=quarter_round_schedule
)
176 # initial values in GPR regfile
177 initial_regs
= [0] * 128
180 for i
, (a
, b
, c
, d
) in enumerate(schedule
):
181 print ("chacha20 schedule", i
, hex(a
), hex(b
), hex(c
), hex(d
))
182 set_masked_reg(initial_regs
, SHAPE0
, i
, ew_bits
=8, value
=a
)
183 set_masked_reg(initial_regs
, SHAPE1
, i
, ew_bits
=8, value
=b
)
184 set_masked_reg(initial_regs
, SHAPE2
, i
, ew_bits
=8, value
=c
)
186 # offsets for d (modulo 4 shift amount)
187 shiftvals
= [16, 12, 8, 7] # chacha20 shifts
189 set_masked_reg(initial_regs
, shifts
, i
, ew_bits
=32,
192 # set up input test vector then pack it into regs
211 # use packing function which emulates element-width overrides @ 32-bit
213 set_masked_reg(initial_regs
, block
, i
, ew_bits
=32, value
=x
[i
])
216 svstate
= SVP64State()
217 #svstate.vl = 32 # VL
218 #svstate.maxvl = 32 # MAXVL
219 print("SVSTATE", bin(svstate
.asint()))
221 # copy before running, compute expected results
222 expected_regs
= deepcopy(initial_regs
)
223 expected_regs
[ctr
] = 0 # reaches zero
224 #expected_regs[vl] = 32 # gets set to MAXVL
225 expected
= deepcopy(x
)
226 # use the pts-derived quarter_round function to
227 # compute a pure-python version of chacha20
228 for i
in range(nrounds
):
229 chacha_idx_schedule(expected
, fn
=quarter_round
)
231 set_masked_reg(expected_regs
, block
, i
, ew_bits
=32,
234 with
Program(lst
, bigendian
=False) as program
:
235 sim
= self
.run_tst_program(program
, initial_regs
, svstate
=svstate
)
237 # print out expected: 16 values @ 32-bit ea -> QTY8 64-bit regs
239 RS
= sim
.gpr(i
+block
).value
240 print("expected", i
+block
, hex(RS
), hex(expected_regs
[i
+block
]))
243 SVSHAPE0
= sim
.spr
['SVSHAPE0']
244 SVSHAPE1
= sim
.spr
['SVSHAPE1']
245 print("SVSTATE after", bin(sim
.svstate
.asint()))
246 print(" vl", bin(sim
.svstate
.vl
))
247 print(" mvl", bin(sim
.svstate
.maxvl
))
248 print(" srcstep", bin(sim
.svstate
.srcstep
))
249 print(" dststep", bin(sim
.svstate
.dststep
))
250 print(" RMpst", bin(sim
.svstate
.RMpst
))
251 print(" SVme", bin(sim
.svstate
.SVme
))
252 print(" mo0", bin(sim
.svstate
.mo0
))
253 print(" mo1", bin(sim
.svstate
.mo1
))
254 print(" mi0", bin(sim
.svstate
.mi0
))
255 print(" mi1", bin(sim
.svstate
.mi1
))
256 print(" mi2", bin(sim
.svstate
.mi2
))
257 print("STATE0svgpr", hex(SVSHAPE0
.svgpr
))
258 print("STATE0 xdim", SVSHAPE0
.xdimsz
)
259 print("STATE0 ydim", SVSHAPE0
.ydimsz
)
260 print("STATE0 skip", bin(SVSHAPE0
.skip
))
261 print("STATE0 inv", SVSHAPE0
.invxyz
)
262 print("STATE0order", SVSHAPE0
.order
)
263 print(sim
.gpr
.dump())
264 self
._check
_regs
(sim
, expected_regs
)
265 self
.assertEqual(sim
.svstate
.RMpst
, 0)
266 self
.assertEqual(sim
.svstate
.SVme
, 0b11111)
267 self
.assertEqual(sim
.svstate
.mi0
, 0)
268 self
.assertEqual(sim
.svstate
.mi1
, 3)
269 self
.assertEqual(sim
.svstate
.mi2
, 2)
270 self
.assertEqual(sim
.svstate
.mo0
, 2)
271 self
.assertEqual(sim
.svstate
.mo1
, 0)
272 #self.assertEqual(SVSHAPE0.svgpr, 22)
273 #self.assertEqual(SVSHAPE1.svgpr, 30)
275 def run_tst_program(self
, prog
, initial_regs
=None,
277 if initial_regs
is None:
278 initial_regs
= [0] * 32
279 simulator
= run_tst(prog
, initial_regs
, svstate
=svstate
)
284 if __name__
== "__main__":