From f02002b2f0ea4afe4ac2996c1928886f16b07dbc Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Wed, 26 Apr 2023 21:30:40 +0000 Subject: [PATCH] more formatting fixes in chacha20 doc --- openpower/sv/cookbook/chacha20.mdwn | 86 ++++++++++++++--------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/openpower/sv/cookbook/chacha20.mdwn b/openpower/sv/cookbook/chacha20.mdwn index 771492587..98c8cfe4f 100644 --- a/openpower/sv/cookbook/chacha20.mdwn +++ b/openpower/sv/cookbook/chacha20.mdwn @@ -132,28 +132,28 @@ Let's assume the values x in the registers 24-36 So for the addition in Vertical-First mode, RT (and RA as they are the same) indices are (in terms of x): - | 0 | 8 | 0 | 8 | 1 | 9 | 1 | 9 | - | 2 | 10 | 2 | 10 | 3 | 11 | 3 | 11 | - | 0 | 10 | 0 | 10 | 1 | 11 | 1 | 11 | - | 2 | 8 | 2 | 8 | 3 | 9 | 3 | 9 | + | 0 | 8 | 0 | 8 | 1 | 9 | 1 | 9 | + | 2 | 10 | 2 | 10 | 3 | 11 | 3 | 11 | + | 0 | 10 | 0 | 10 | 1 | 11 | 1 | 11 | + | 2 | 8 | 2 | 8 | 3 | 9 | 3 | 9 | However, since the indices are small values, using a single 64-bit register for a single index value is a waste so we will compress them, 8 indices in a 64-bit register: So, RT indices will fit inside these 4 registers (in Little Endian format): - SVSHAPE0: | 0x901090108000800 | 0xb030b030a020a02 | 0xb010b010a000a00 | 0x903090308020802 | + SVSHAPE0: | 0x901090108000800 | 0xb030b030a020a02 | 0xb010b010a000a00 | 0x903090308020802 | Similarly we find the RB indices: - | 4 | 12 | 4 | 12 | 5 | 13 | 5 | 13 | - | 6 | 14 | 6 | 14 | 7 | 15 | 7 | 15 | - | 5 | 15 | 5 | 15 | 6 | 12 | 6 | 12 | - | 7 | 13 | 7 | 13 | 4 | 14 | 7 | 14 | + | 4 | 12 | 4 | 12 | 5 | 13 | 5 | 13 | + | 6 | 14 | 6 | 14 | 7 | 15 | 7 | 15 | + | 5 | 15 | 5 | 15 | 6 | 12 | 6 | 12 | + | 7 | 13 | 7 | 13 | 4 | 14 | 4 | 14 | Using a similar method, we find the final 4 registers with the RB indices: - SVSHAPE1: | 0xd050d050c040c04 | 0xf070f070e060e06 | 0xc060c060f050f05 | 0xe040e040d070d07 | + SVSHAPE1: | 0xd050d050c040c04 | 
0xf070f070e060e06 | 0xc060c060f050f05 | 0xe040e040d070d07 | Now, we can construct the Vertical First loop: @@ -305,37 +305,37 @@ indices for `SVSHAPE3` will have to be in 32-bit elements: The complete algorithm for a loop with 10 iterations is as follows: - li 7, 10 # Load value 10 into GPR #7 - mtctr 7 # Set up counter on GPR #7 + li 7, 10 # Load value 10 into GPR #7 + mtctr 7 # Set up counter on GPR #7 - # set up VL=32 vertical-first, and SVSHAPEs 0-2 - setvl 0, 0, 32, 1, 1, 1 - # SHAPE0, used by sv.add starts at GPR #8 - svindex 8/2, 0, 1, 3, 0, 1, 0 # SVSHAPE0, a - # SHAPE1, used by sv.xor starts at GPR #12 - svindex 12/2, 1, 1, 3, 0, 1, 0 # SVSHAPE1, b - # SHAPE2, used by sv.rldcl starts at GPR #16 - svindex 16/2, 2, 1, 3, 0, 1, 0 # SVSHAPE2, c - # SHAPE3, used also by sv.rldcl to hold the shift values starts at GPR #20 - # The inner loop will do 32 iterations, but there are only - # 4 shift values, so we mod by 4, and can cycle through them - svshape2 0, 0, 3, 4, 0, 1 # SVSHAPE3, shift amount, mod4 - -.outer: - # outer loop begins here (standard CTR loop) - setvl 0, 0, 32, 1, 1, 1 # MAXVL=VL=32, VF=1 - # inner loop begins here. add-xor-rotl32 with remap, step, branch -.inner: - svremap 31, 1, 0, 0, 0, 0, 0 # RA=1, RB=0, RT=0 (0b01011) - sv.add/w=32 *x, *x, *x - svremap 31, 2, 0, 2, 2, 0, 0 # RA=2, RB=0, RS=2 (0b00111) - sv.xor/w=32 *x, *x, *x - svremap 31, 0, 3, 2, 2, 0, 0 # RA=2, RB=3, RS=2 (0b01110) - sv.rldcl/w=32 *x, *x, *SHIFTS, 0 - # 16 is the destination containing the result of svstep. - # it overlaps with SHAPE2 which is also 16. the first 8 indices - # will get corrupted. - svstep. 7, 1, 0 # step to next in-regs element - bc 6, 3, .inner # svstep. Rc=1 loop-end-condition? 
- # inner-loop done: outer loop standard CTR-decrement to setvl again - bdnz .outer # Loop until CTR is zero + # set up VL=32 vertical-first, and SVSHAPEs 0-2 + setvl 0, 0, 32, 1, 1, 1 + # SHAPE0, used by sv.add starts at GPR #8 + svindex 8/2, 0, 1, 3, 0, 1, 0 # SVSHAPE0, a + # SHAPE1, used by sv.xor starts at GPR #12 + svindex 12/2, 1, 1, 3, 0, 1, 0 # SVSHAPE1, b + # SHAPE2, used by sv.rldcl starts at GPR #16 + svindex 16/2, 2, 1, 3, 0, 1, 0 # SVSHAPE2, c + # SHAPE3, used also by sv.rldcl to hold the shift values starts at GPR #20 + # The inner loop will do 32 iterations, but there are only + # 4 shift values, so we mod by 4, and can cycle through them + svshape2 0, 0, 3, 4, 0, 1 # SVSHAPE3, shift amount, mod4 + + .outer: + # outer loop begins here (standard CTR loop) + setvl 0, 0, 32, 1, 1, 1 # MAXVL=VL=32, VF=1 + # inner loop begins here. add-xor-rotl32 with remap, step, branch + .inner: + svremap 31, 1, 0, 0, 0, 0, 0 # RA=1, RB=0, RT=0 (0b01011) + sv.add/w=32 *x, *x, *x + svremap 31, 2, 0, 2, 2, 0, 0 # RA=2, RB=0, RS=2 (0b00111) + sv.xor/w=32 *x, *x, *x + svremap 31, 0, 3, 2, 2, 0, 0 # RA=2, RB=3, RS=2 (0b01110) + sv.rldcl/w=32 *x, *x, *SHIFTS, 0 + # 16 is the destination containing the result of svstep. + # it overlaps with SHAPE2 which is also 16. the first 8 indices + # will get corrupted. + svstep. 7, 1, 0 # step to next in-regs element + bc 6, 3, .inner # svstep. Rc=1 loop-end-condition? + # inner-loop done: outer loop standard CTR-decrement to setvl again + bdnz .outer # Loop until CTR is zero -- 2.30.2