First working version of SVP64 asm xchacha_hchacha20() function
[openpower-isa.git] / crypto / chacha20 / src / xchacha20_svp64.s
1 .set out_ptr, 3 # GPR holding the destination pointer used by the final sv.std stores
2 .set in_ptr, 4 # GPR holding the base address of the second sv.ld (fills x+6 onwards)
3 .set k_ptr, 5 # GPR holding the base address of the first sv.ld (fills x+2 onwards)
4 .set ctr, 7 # scratch GPR: seeds CTR with the outer round count, later receives the svstep. result
5 .set SHAPE0, 8 # first of the 4 GPRs loaded with the SHAPE0 index table
6 .set SHAPE1, 12 # first of the 4 GPRs loaded with the SHAPE1 index table
7 .set SHAPE2, 16 # first of the 4 GPRs loaded with the SHAPE2 index table
8 .set SHIFTS, 20 # first of the 2 GPRs loaded with the four 32-bit rotate amounts
9 .set x, 24 # first GPR of the state vector x (two 32-bit words x[2n], x[2n+1] per GPR)
10
11 .macro lwi rD, const # lwi rD, const: load a constant that fits in 32 bits into rD (1 or 2 instructions)
12 .if (\const >= -0x8000) && (\const <= 0x7fff)
13 li \rD, \const # fits a signed 16-bit immediate: a single li is enough
14 .else
15 lis \rD, \const@h # plain high half (@h, not @ha): the low half is OR-ed in below, so no carry adjustment is wanted
16 ori \rD, \rD, \const@l # merge the low 16 bits; ori does not sign-extend its immediate
17 .endif
18 .endm
19
20 .macro ldi rD, const # ldi rD, const: load a 64-bit constant into rD
21 .if (\const >= -0x80000000) && (\const <= 0x7fffffff)
22 lwi \rD, \const # fits a signed 32-bit value: defer to lwi
23 .else
24 # build the high word of const in the low word of rD first
25 lis \rD,\const@highest # load const bits 48-63 into rD bits 16-31
26 ori \rD,\rD,\const@higher # load const bits 32-47 into rD bits 0-15
27
28 rldicr \rD,\rD,32,31 # rotate rD's low word into rD's high word, clearing the low word
29
30 # load low word into the low word of rD
31 oris \rD,\rD,\const@h # load const bits 16-31 into rD bits 16-31
32 ori \rD,\rD,\const@l # load const bits 0-15 into rD bits 0-15
33 .endif
34 .endm
35
36 .machine libresoc
37 .file "xchacha20_svp64.s"
38 .abiversion 2
39 .section ".text"
40 .align 2
41 .globl xchacha_hchacha20_svp64_real
42 .type xchacha_hchacha20_svp64_real, @function
43 xchacha_hchacha20_svp64_real: # reads 4 doublewords at k_ptr and 2 at in_ptr, runs 10 outer rounds over x, writes 4 doublewords to out_ptr
44 .LFB0:
45 .cfi_startproc
46 # load x[0] = 0x61707865, x[1] = 0x3320646e
47 ldi x+0, 0x3320646e61707865
48 # load x[2] = 0x79622d32, x[3] = 0x6b206574
49 ldi x+1, 0x6b20657479622d32
50 # load SHAPE0 indices (one byte per index, 8 indices per GPR)
51 ldi SHAPE0+0, 0x901090108000800
52 ldi SHAPE0+1, 0xb030b030a020a02
53 ldi SHAPE0+2, 0xb010b010a000a00
54 ldi SHAPE0+3, 0x903090308020802
55 # load SHAPE1 indices
56 ldi SHAPE1+0, 0xd050d050c040c04
57 ldi SHAPE1+1, 0xf070f070e060e06
58 ldi SHAPE1+2, 0xc060c060f050f05
59 ldi SHAPE1+3, 0xe040e040d070d07
60 # load SHAPE2 indices
61 ldi SHAPE2+0, 0x50d050d040c040c
62 ldi SHAPE2+1, 0x70f070f060e060e
63 ldi SHAPE2+2, 0x60c060c050f050f
64 ldi SHAPE2+3, 0x40e040e070d070d
65 # shift values: 16 and 12 in SHIFTS+0, 8 and 7 in SHIFTS+1 (low word first)
66 ldi SHIFTS+0, 0x0000000c00000010
67 ldi SHIFTS+1, 0x0000000700000008
68
69 # Load 8 32-bit words (4 doublewords) from k_ptr into x[4..11]
70 setvl 0,0,4,0,1,1 # Set VL to 4 (doubleword elements, i.e. 8 words)
71 sv.ld *x+2, 0(k_ptr)
72
73 # Load 4 32-bit words (2 doublewords) from in_ptr into x[12..15]
74 setvl 0,0,2,0,1,1 # Set VL to 2 (doubleword elements, i.e. 4 words)
75 sv.ld *x+6, 0(in_ptr)
76
77 # set up VL=32 vertical-first, and SVSHAPEs 0-2
78 # first set MAXVL=VL=32, then switch to vertical-first mode (result also goes to r22)
79 setvl 0, 0, 32, 0, 1, 1 # MAXVL=VL=32
80 setvl 22, 0, 32, 1, 0, 1 # vertical-first mode
81 # SHAPE0, used by sv.add starts at GPR #8
82 svindex 4, 0, 1, 3, 0, 1, 0 # SVSHAPE0, a
83 # SHAPE1, used by sv.xor starts at GPR #12
84 svindex 6, 1, 1, 3, 0, 1, 0 # SVSHAPE1, b
85 # SHAPE2, used by sv.rldcl starts at GPR #16
86 svindex 8, 2, 1, 3, 0, 1, 0 # SVSHAPE2, c
87 # SHAPE3, used also by sv.rldcl to hold the shift values starts at GPR #20
88 # The inner loop will do 32 iterations, but there are only 4 shift values, so we mod 4
89 svshape2 0, 0, 3, 4, 0, 1 # SVSHAPE3, shift amount, mod 4
90
91 # establish CTR for outer round count
92 li ctr, 10
93 mtctr ctr # Set up counter; GPR ctr is free for reuse after this
94
95 .outer:
96 # outer loop begins here (standard CTR loop)
97 setvl 22, 22, 32, 1, 1, 0 # vertical-first, set VL from r22
98 # inner loop begins here. add-xor-rotl32 with remap, step, branch
99 .inner:
100 svremap 31, 1, 0, 0, 0, 0, 0 # RA=1, RB=0, RT=0 (0b01011)
101 sv.add/w=32 *x, *x, *x
102 svremap 31, 2, 0, 2, 2, 0, 0 # RA=2, RB=0, RS=2 (0b00111)
103 sv.xor/w=32 *x, *x, *x
104 svremap 31, 0, 3, 2, 2, 0, 0 # RA=2, RB=3, RS=2 (0b01110)
105 sv.rldcl/w=32 *x, *x, *SHIFTS, 0
106 # GPR ctr (r7) is the destination for the result of svstep.
107 # it was only needed to seed CTR via mtctr, and it sits below SHAPE0
108 # (GPRs 8-11), so none of the index tables are overwritten.
109 svstep. ctr, 1, 0 # step to next in-regs element
110 bc 6, 3, .inner # svstep. Rc=1 loop-end-condition?
111 # inner-loop done: outer loop standard CTR-decrement to setvl again
112 bdnz .outer # Loop until CTR is zero
113
114 # store x0-x3 directly to *out_ptr
115 setvl 0,0,2,0,1,1 # Set VL to 2 (doubleword elements, i.e. 4 words)
116 sv.std *x, 0(out_ptr)
117 # store x12-x15 to *out_ptr + 16
118 sv.std *x+6, 16(out_ptr)
119 blr
120 .long 0 # NOTE(review): this and the next line look like a compiler-style traceback table - confirm
121 .byte 0,0,0,0,0,3,0,0
122 .cfi_endproc
123
124 .LFE0:
125 .size xchacha_hchacha20_svp64_real,.-xchacha_hchacha20_svp64_real
126 .ident "GCC: (Debian 8.3.0-6) 8.3.0"
127 .section .note.GNU-stack,"",@progbits