From: Konstantinos Margaritis
Date: Fri, 17 Mar 2023 09:36:53 +0000 (+0000)
Subject: Refactor code, add quarterround macros
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=ebe7e4f4f3f02c128d52ec1d3e97535b08fb3f5a;p=openpower-isa.git

Refactor code, add quarterround macros
---

diff --git a/crypto/chacha20/src/xchacha20_svp64.s b/crypto/chacha20/src/xchacha20_svp64.s
deleted file mode 100644
index 4b5ce6e3..00000000
--- a/crypto/chacha20/src/xchacha20_svp64.s
+++ /dev/null
@@ -1,127 +0,0 @@
-.set out_ptr, 3
-.set in_ptr, 4
-.set k_ptr, 5
-.set ctr, 7
-.set SHAPE0, 8
-.set SHAPE1, 12
-.set SHAPE2, 16
-.set SHIFTS, 20
-.set x, 24
-
-.macro lwi rD, const
-.if (\const >= -0x8000) && (\const <= 0x7fff)
-    li \rD, \const
-.else
-    lis \rD, \const@ha
-    ori \rD, \rD, \const@l
-.endif
-.endm
-
-.macro ldi rD, const
-.if (\const >= -0x80000000) && (\const <= 0x7fffffff)
-    lwi \rD, \const
-.else
-    # load high word into the high word of rD
-    lis  \rD,\const@highest      # load msg bits 48-63 into rD bits 16-31
-    ori  \rD,\rD,\const@higher   # load msg bits 32-47 into rD bits 0-15
-
-    rldicr \rD,\rD,32,31         # rotate r4's low word into rD's high word
-
-    # load low word into the low word of rD
-    oris \rD,\rD,\const@h        # load msg bits 16-31 into rD bits 16-31
-    ori  \rD,\rD,\const@l        # load msg bits 0-15 into rD bits 0-15
-.endif
-.endm
-
-    .machine libresoc
-    .file "xchacha20_svp64.s"
-    .abiversion 2
-    .section ".text"
-    .align 2
-    .globl xchacha_hchacha20_svp64_real
-    .type xchacha_hchacha20_svp64_real, @function
-xchacha_hchacha20_svp64_real:
-.LFB0:
-    .cfi_startproc
-    # load x[0] = 0x61707865, x[1] = 0x3320646e
-    ldi x+0, 0x3320646e61707865
-    # load x[2] = 0x79622d32, x[3] = 0x6b206574
-    ldi x+1, 0x6b20657479622d32
-    # load SHAPE0 indices
-    ldi SHAPE0+0, 0x901090108000800
-    ldi SHAPE0+1, 0xb030b030a020a02
-    ldi SHAPE0+2, 0xb010b010a000a00
-    ldi SHAPE0+3, 0x903090308020802
-    # load SHAPE1 indices
-    ldi SHAPE1+0, 0xd050d050c040c04
-    ldi SHAPE1+1, 0xf070f070e060e06
-    ldi SHAPE1+2, 0xc060c060f050f05
-    ldi SHAPE1+3, 0xe040e040d070d07
-    # load SHAPE2 indices
-    ldi SHAPE2+0, 0x50d050d040c040c
-    ldi SHAPE2+1, 0x70f070f060e060e
-    ldi SHAPE2+2, 0x60c060c050f050f
-    ldi SHAPE2+3, 0x40e040e070d070d
-    #shift values
-    ldi SHIFTS+0, 0x0000000c00000010
-    ldi SHIFTS+1, 0x0000000700000008
-
-    # Load 8 values from k_ptr
-    setvl 0,0,4,0,1,1            # Set VL to 8 elements
-    sv.ld *x+2, 0(k_ptr)
-
-    # Load 4 values from in_ptr
-    setvl 0,0,2,0,1,1            # Set VL to 4 elements
-    sv.ld *x+6, 0(in_ptr)
-
-    # set up VL=32 vertical-first, and SVSHAPEs 0-2
-    # set vertical firstMAXVL (and r22)a
-    setvl 0, 0, 32, 0, 1, 1      # MAXVL=VL=32
-    setvl 22, 0, 32, 1, 0, 1     # vertical-first mode
-    # SHAPE0, used by sv.add starts at GPR #8
-    svindex 4, 0, 1, 3, 0, 1, 0  # SVSHAPE0, a
-    # SHAPE1, used by sv.xor starts at GPR #12
-    svindex 6, 1, 1, 3, 0, 1, 0  # SVSHAPE1, b
-    # SHAPE2, used by sv.rldcl starts at GPR #16
-    svindex 8, 2, 1, 3, 0, 1, 0  # SVSHAPE2, c
-    # SHAPE3, used also by sv.rldcl to hold the shift values starts at GPR #20
-    # The inner loop will do 32 iterations, but there are only 4 shift values, so we mod 4
-    svshape2 0, 0, 3, 4, 0, 1    # SVSHAPE3, shift amount, mod 4
-
-    # establish CTR for outer round count
-    li ctr, 10
-    mtctr ctr                    # Set up counter
-
-.outer:
-    # outer loop begins here (standard CTR loop)
-    setvl 22, 22, 32, 1, 1, 0    # vertical-first, set VL from r22
-    # inner loop begins here. add-xor-rotl32 with remap, step, branch
-.inner:
-    svremap 31, 1, 0, 0, 0, 0, 0 # RA=1, RB=0, RT=0 (0b01011)
-    sv.add/w=32 *x, *x, *x
-    svremap 31, 2, 0, 2, 2, 0, 0 # RA=2, RB=0, RS=2 (0b00111)
-    sv.xor/w=32 *x, *x, *x
-    svremap 31, 0, 3, 2, 2, 0, 0 # RA=2, RB=3, RS=2 (0b01110)
-    sv.rldcl/w=32 *x, *x, *SHIFTS, 0
-    # 16 is the destination containing the result of svstep.
-    # it overlaps with SHAPE2 which is also 16. the first 8 indices
-    # will get corrupted.
-    svstep. ctr, 1, 0            # step to next in-regs element
-    bc 6, 3, .inner              # svstep. Rc=1 loop-end-condition?
-    # inner-loop done: outer loop standard CTR-decrement to setvl again
-    bdnz .outer                  # Loop until CTR is zero
-
-    # store x0-x3 directly to *out_ptr
-    setvl 0,0,2,0,1,1            # Set VL to 4 elements
-    sv.std *x, 0(out_ptr)
-    # store x12-x15 to *out_ptr + 16
-    sv.std *x+6, 16(out_ptr)
-    blr
-    .long 0
-    .byte 0,0,0,0,0,3,0,0
-    .cfi_endproc
-
-.LFE0:
-    .size xchacha_hchacha20_svp64_real,.-xchacha_hchacha20_svp64_real
-    .ident "GCC: (Debian 8.3.0-6) 8.3.0"
-    .section .note.GNU-stack,"",@progbits
diff --git a/crypto/chacha20/src/xchacha_hchacha20_svp64.s b/crypto/chacha20/src/xchacha_hchacha20_svp64.s
new file mode 100644
index 00000000..86866cc2
--- /dev/null
+++ b/crypto/chacha20/src/xchacha_hchacha20_svp64.s
@@ -0,0 +1,55 @@
+    .machine libresoc
+    .file "xchacha_hchacha20_svp64.s"
+    .abiversion 2
+    .section ".text"
+    .align 2
+
+    .include "xchacha_svp64_macros.s"
+
+    .set out_ptr, 3
+    .set in_ptr, 4
+    .set k_ptr, 5
+    .set ctr, 7
+    .set x, 24
+    .set SHAPE0, 8
+    .set SHAPE1, 12
+    .set SHAPE2, 16
+    .set SHIFTS, 20
+    .set VL, 22
+
+    .globl xchacha_hchacha20_svp64_real
+    .type xchacha_hchacha20_svp64_real, @function
+xchacha_hchacha20_svp64_real:
+    .cfi_startproc
+    # load x[0] = 0x61707865, x[1] = 0x3320646e
+    ldi x+0, 0x3320646e61707865
+    # load x[2] = 0x79622d32, x[3] = 0x6b206574
+    ldi x+1, 0x6b20657479622d32
+    # Load 8 values from k_ptr
+    setvl 0,0,4,0,1,1            # Set VL to 8 elements
+    sv.ld *x+2, 0(k_ptr)
+    # Load 4 values from in_ptr
+    setvl 0,0,2,0,1,1            # Set VL to 4 elements
+    sv.ld *x+6, 0(in_ptr)
+
+    # Set up quarterround constants, SHAPE0, SHAPE1, SHAPE2, SHIFTS
+    quarterround_const SHAPE0, SHAPE1, SHAPE2, SHIFTS
+
+    # establish CTR for outer round count and call quarterround macro
+    li ctr, 10
+    quarterround x, ctr, VL, SHAPE0, SHAPE1, SHAPE2, SHIFTS
+
+    # store x0-x3 directly to *out_ptr
+    setvl 0,0,2,0,1,1            # Set VL to 4 elements
+    sv.std *x, 0(out_ptr)
+    # store x12-x15 to *out_ptr + 16
+    sv.std *x+6, 16(out_ptr)
+    blr
+    .long 0
+    .byte 0,0,0,0,0,3,0,0
+    .cfi_endproc
+
+.LFE0:
+    .size xchacha_hchacha20_svp64_real,.-xchacha_hchacha20_svp64_real
+    .ident "GCC: (Debian 8.3.0-6) 8.3.0"
+    .section .note.GNU-stack,"",@progbits
diff --git a/crypto/chacha20/src/xchacha_svp64_macros.s b/crypto/chacha20/src/xchacha_svp64_macros.s
new file mode 100644
index 00000000..c22fe8af
--- /dev/null
+++ b/crypto/chacha20/src/xchacha_svp64_macros.s
@@ -0,0 +1,89 @@
+# Helper macros for assembly
+
+# load word immediate for 32-bit constants
+.macro lwi rD, const
+.if (\const >= -0x8000) && (\const <= 0x7fff)
+    li \rD, \const
+.else
+    lis \rD, \const@ha
+    ori \rD, \rD, \const@l
+.endif
+.endm
+
+# load double word immediate for 64-bit constants
+.macro ldi rD, const
+.if (\const >= -0x80000000) && (\const <= 0x7fffffff)
+    lwi \rD, \const
+.else
+    # load high word into the high word of rD
+    lis  \rD,\const@highest      # load msg bits 48-63 into rD bits 16-31
+    ori  \rD,\rD,\const@higher   # load msg bits 32-47 into rD bits 0-15
+
+    rldicr \rD,\rD,32,31         # rotate rD's low word into rD's high word
+
+    # load low word into the low word of rD
+    oris \rD,\rD,\const@h        # load msg bits 16-31 into rD bits 16-31
+    ori  \rD,\rD,\const@l        # load msg bits 0-15 into rD bits 0-15
+.endif
+.endm
+
+# This macro uses registers 8-21
+.macro quarterround_const _SHAPE0, _SHAPE1, _SHAPE2, _SHIFTS
+    # load SHAPE0 indices
+    ldi \_SHAPE0+0, 0x901090108000800
+    ldi \_SHAPE0+1, 0xb030b030a020a02
+    ldi \_SHAPE0+2, 0xb010b010a000a00
+    ldi \_SHAPE0+3, 0x903090308020802
+    # load SHAPE1 indices
+    ldi \_SHAPE1+0, 0xd050d050c040c04
+    ldi \_SHAPE1+1, 0xf070f070e060e06
+    ldi \_SHAPE1+2, 0xc060c060f050f05
+    ldi \_SHAPE1+3, 0xe040e040d070d07
+    # load SHAPE2 indices
+    ldi \_SHAPE2+0, 0x50d050d040c040c
+    ldi \_SHAPE2+1, 0x70f070f060e060e
+    ldi \_SHAPE2+2, 0x60c060c050f050f
+    ldi \_SHAPE2+3, 0x40e040e070d070d
+    # shift values
+    ldi \_SHIFTS+0, 0x0000000c00000010
+    ldi \_SHIFTS+1, 0x0000000700000008
+.endm
+
+# This macro uses registers 8-21
+.macro quarterround _x, _ctr, _VL, _SHAPE0, _SHAPE1, _SHAPE2, _SHIFTS
+    mtctr \_ctr                  # Set up counter
+
+    # set up VL=32 vertical-first, and SVSHAPEs 0-2
+    # set VL/MAXVL first
+    setvl 0, 0, 32, 0, 1, 1      # MAXVL=VL=32
+    # set r22 from VL, set vertical-first
+    setvl \_VL, 0, 32, 1, 0, 1   # vertical-first mode
+    # SHAPE0, used by sv.add starts at GPR #8
+    svindex \_SHAPE0/2, 0, 1, 3, 0, 1, 0  # SVSHAPE0, a
+    # SHAPE1, used by sv.xor starts at GPR #12
+    svindex \_SHAPE1/2, 1, 1, 3, 0, 1, 0  # SVSHAPE1, b
+    # SHAPE2, used by sv.rldcl starts at GPR #16
+    svindex \_SHAPE2/2, 2, 1, 3, 0, 1, 0  # SVSHAPE2, c
+    # SHAPE3, used also by sv.rldcl to hold the shift values starts at GPR #20
+    # The inner loop will do 32 iterations, but there are only 4 shift values, so we mod 4
+    svshape2 0, 0, 3, 4, 0, 1    # SVSHAPE3, shift amount, mod 4
+
+.outer:
+    # outer loop begins here (standard CTR loop)
+    setvl \_VL, \_VL, 32, 1, 1, 0  # vertical-first, set VL from r22
+    # inner loop begins here. add-xor-rotl32 with remap, step, branch
+.inner:
+    svremap 31, 1, 0, 0, 0, 0, 0 # RA=1, RB=0, RT=0 (0b01011)
+    sv.add/w=32 *\_x, *\_x, *\_x
+    svremap 31, 2, 0, 2, 2, 0, 0 # RA=2, RB=0, RS=2 (0b00111)
+    sv.xor/w=32 *\_x, *\_x, *\_x
+    svremap 31, 0, 3, 2, 2, 0, 0 # RA=2, RB=3, RS=2 (0b01110)
+    sv.rldcl/w=32 *\_x, *\_x, *\_SHIFTS, 0
+    # 16 is the destination containing the result of svstep.
+    # it overlaps with SHAPE2 which is also 16. the first 8 indices
+    # will get corrupted.
+    svstep. \_ctr, 1, 0          # step to next in-regs element
+    bc 6, 3, .inner              # svstep. Rc=1 loop-end-condition?
+    # inner-loop done: outer loop standard CTR-decrement to setvl again
+    bdnz .outer                  # Loop until CTR is zero
+.endm
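
For reference, the quarterround macro vectorises the standard ChaCha20 quarter-round
over the whole 4x4 state: the SHIFTS constants 0x0000000c00000010 and
0x0000000700000008 hold the rotation amounts 16, 12, 8 and 7, and "li ctr, 10"
gives the usual ten double rounds. Below is a minimal scalar C sketch of the same
computation, illustrative only; the helper names are hypothetical and it is not
code taken from this commit or repository.

    #include <stdint.h>

    /* 32-bit left rotation, as performed per element by sv.rldcl/w=32 */
    static inline uint32_t rotl32(uint32_t v, int n)
    {
        return (v << n) | (v >> (32 - n));
    }

    /* One ChaCha20 quarter-round on state words a, b, c, d.
     * Rotation amounts 16, 12, 8, 7 match the SHIFTS table. */
    static void quarterround(uint32_t x[16], int a, int b, int c, int d)
    {
        x[a] += x[b]; x[d] ^= x[a]; x[d] = rotl32(x[d], 16);
        x[c] += x[d]; x[b] ^= x[c]; x[b] = rotl32(x[b], 12);
        x[a] += x[b]; x[d] ^= x[a]; x[d] = rotl32(x[d], 8);
        x[c] += x[d]; x[b] ^= x[c]; x[b] = rotl32(x[b], 7);
    }

    /* Ten double rounds (cf. "li ctr, 10"): four column quarter-rounds
     * followed by four diagonal quarter-rounds per iteration. */
    static void chacha_rounds(uint32_t x[16])
    {
        for (int i = 0; i < 10; i++) {
            quarterround(x, 0, 4,  8, 12);
            quarterround(x, 1, 5,  9, 13);
            quarterround(x, 2, 6, 10, 14);
            quarterround(x, 3, 7, 11, 15);
            quarterround(x, 0, 5, 10, 15);
            quarterround(x, 1, 6, 11, 12);
            quarterround(x, 2, 7,  8, 13);
            quarterround(x, 3, 4,  9, 14);
        }
    }

In the SVP64 version, the eight quarter-round calls of each double round collapse
into a single 32-step vertical-first inner loop: the svindex tables loaded by
quarterround_const supply the a/b/c/d element indices for sv.add, sv.xor and
sv.rldcl, and SVSHAPE3 cycles through the four shift amounts mod 4.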