# Fragment of a load-immediate macro body (operands: rD, const). It selects an
# instruction sequence from the magnitude of const.
# NOTE(review): the .macro header and the matching .else/.endif/.endm lines are
# not among the lines shown here - confirm against the full file.
# Case: const fits in a signed 16-bit immediate.
12 .if (\const >= -0x8000) && (\const <= 0x7fff)
16 ori \rD, \rD, \const@l # OR the low 16 bits of const into rD
# Case: const fits in a signed 32-bit immediate. The instructions for this case
# are not visible here; the sequence below presumably belongs to the general
# 64-bit case - confirm.
21 .if (\const >= -0x80000000) && (\const <= 0x7fffffff)
24 # load high word into the high word of rD
25 lis \rD,\const@highest # load const bits 48-63 into rD bits 16-31
26 ori \rD,\rD,\const@higher # load const bits 32-47 into rD bits 0-15
28 rldicr \rD,\rD,32,31 # rotate rD's low word into rD's high word (low word is cleared)
30 # load low word into the low word of rD
31 oris \rD,\rD,\const@h # load const bits 16-31 into rD bits 16-31
32 ori \rD,\rD,\const@l # load const bits 0-15 into rD bits 0-15
37 .file "xchacha20_svp64.s"
# xchacha_hchacha20_svp64_real - exported function symbol (.globl / .type @function).
# What the visible lines do, in order:
#   1. load four 32-bit words into x+0 and x+1 (two per 64-bit register); their
#      little-endian bytes spell the ASCII string "expand 32-byte k"
#   2. load the SHAPE0..SHAPE2 tables and the SHIFTS rotate amounts
#   3. program VL and the SVSHAPE0-3 remap state (setvl / svindex / svshape2)
#   4. run an add / xor / rotate sequence in a doubly nested loop
#      (svstep. + bc inside, CTR + bdnz outside)
#   5. store two groups of registers at out_ptr and out_ptr + 16
# x, SHAPE0, SHAPE1, SHAPE2, SHIFTS, ctr and out_ptr are symbols defined outside
# the lines shown here; ldi is presumably the load-immediate macro defined
# earlier in the file - confirm.
# NOTE(review): the .inner/.outer labels targeted by bc/bdnz, the loads that
# should follow the two "Load ... values" setvl lines, and the function return
# are not among the lines shown here.
41 .globl xchacha_hchacha20_svp64_real
42 .type xchacha_hchacha20_svp64_real, @function
43 xchacha_hchacha20_svp64_real:
46 # load x[0] = 0x61707865 ("expa"), x[1] = 0x3320646e ("nd 3")
47 ldi x+0, 0x3320646e61707865
48 # load x[2] = 0x79622d32 ("2-by"), x[3] = 0x6b206574 ("te k")
49 ldi x+1, 0x6b20657479622d32
# SHAPE0 table: four doublewords, presumably packing eight byte-sized index
# values each (consumed through SVSHAPE0 below) - confirm
51 ldi SHAPE0+0, 0x901090108000800
52 ldi SHAPE0+1, 0xb030b030a020a02
53 ldi SHAPE0+2, 0xb010b010a000a00
54 ldi SHAPE0+3, 0x903090308020802
# SHAPE1 table, same layout (consumed through SVSHAPE1 below)
56 ldi SHAPE1+0, 0xd050d050c040c04
57 ldi SHAPE1+1, 0xf070f070e060e06
58 ldi SHAPE1+2, 0xc060c060f050f05
59 ldi SHAPE1+3, 0xe040e040d070d07
# SHAPE2 table, same layout (consumed through SVSHAPE2 below)
61 ldi SHAPE2+0, 0x50d050d040c040c
62 ldi SHAPE2+1, 0x70f070f060e060e
63 ldi SHAPE2+2, 0x60c060c050f050f
64 ldi SHAPE2+3, 0x40e040e070d070d
# SHIFTS: four 32-bit rotate amounts, two per doubleword:
# SHIFTS+0 holds 16 (low word) and 12 (high word), SHIFTS+1 holds 8 and 7
66 ldi SHIFTS+0, 0x0000000c00000010
67 ldi SHIFTS+1, 0x0000000700000008
69 # Load 8 values from k_ptr
70 setvl 0,0,4,0,1,1 # VL=4 (third operand, as in "MAXVL=VL=32" below); presumably 4 doublewords = 8 32-bit values - confirm
73 # Load 4 values from in_ptr
74 setvl 0,0,2,0,1,1 # VL=2; presumably 2 doublewords = 4 32-bit values - confirm
77 # set up VL=32 vertical-first, and SVSHAPEs 0-2
78 # set MAXVL, then vertical-first mode (the second setvl has 22 as its first operand, i.e. r22)
79 setvl 0, 0, 32, 0, 1, 1 # MAXVL=VL=32
80 setvl 22, 0, 32, 1, 0, 1 # vertical-first mode
81 # SHAPE0, used by sv.add starts at GPR #8
82 svindex 4, 0, 1, 3, 0, 1, 0 # SVSHAPE0, a
83 # SHAPE1, used by sv.xor starts at GPR #12
84 svindex 6, 1, 1, 3, 0, 1, 0 # SVSHAPE1, b
85 # SHAPE2, used by sv.rldcl starts at GPR #16
86 svindex 8, 2, 1, 3, 0, 1, 0 # SVSHAPE2, c
87 # SHAPE3, used also by sv.rldcl to hold the shift values starts at GPR #20
88 # The inner loop will do 32 iterations, but there are only 4 shift values, so we mod 4
89 svshape2 0, 0, 3, 4, 0, 1 # SVSHAPE3, shift amount, mod 4
91 # establish CTR for outer round count
93 mtctr ctr # Set up counter
96 # outer loop begins here (standard CTR loop)
97 setvl 22, 22, 32, 1, 1, 0 # vertical-first, set VL from r22
98 # inner loop begins here. add-xor-rotl32 with remap, step, branch
100 svremap 31, 1, 0, 0, 0, 0, 0 # RA=1, RB=0, RT=0 (0b01011)
101 sv.add/w=32 *x, *x, *x
102 svremap 31, 2, 0, 2, 2, 0, 0 # RA=2, RB=0, RS=2 (0b00111)
103 sv.xor/w=32 *x, *x, *x
104 svremap 31, 0, 3, 2, 2, 0, 0 # RA=2, RB=3, RS=2 (0b01110)
105 sv.rldcl/w=32 *x, *x, *SHIFTS, 0
106 # The register named by ctr receives the result of svstep.
107 # NOTE(review): the original remark said this destination is GPR 16, which
108 # overlaps SHAPE2 (also 16) so its first 8 indices get corrupted - confirm what ctr resolves to.
109 svstep. ctr, 1, 0 # step to next in-regs element
110 bc 6, 3, .inner # svstep. Rc=1 loop-end-condition?
111 # inner-loop done: outer loop standard CTR-decrement to setvl again
112 bdnz .outer # Loop until CTR is zero
114 # store x0-x3 directly to *out_ptr
115 setvl 0,0,2,0,1,1 # VL=2; presumably 2 doublewords = 4 32-bit values - confirm
116 sv.std *x, 0(out_ptr)
117 # store x12-x15 to *out_ptr + 16
118 sv.std *x+6, 16(out_ptr) # two 32-bit words per register, so x+6 starts at x[12]
121 .byte 0,0,0,0,0,3,0,0 # NOTE(review): raw data bytes after the code; purpose not visible here (traceback-table style?) - confirm
125 .size xchacha_hchacha20_svp64_real,.-xchacha_hchacha20_svp64_real
126 .ident "GCC: (Debian 8.3.0-6) 8.3.0"
127 .section .note.GNU-stack,"",@progbits