Main loop for `xchacha_hchacha20`:
```
for (i = 0; i < 10; i++){
    QUARTERROUND(x0, x4, x8, x12);
    QUARTERROUND(x1, x5, x9, x13);
    QUARTERROUND(x2, x6, x10, x14);
    QUARTERROUND(x3, x7, x11, x15);
    QUARTERROUND(x0, x5, x10, x15);
    QUARTERROUND(x1, x6, x11, x12);
    QUARTERROUND(x2, x7, x8, x13);
    QUARTERROUND(x3, x4, x9, x14);
}

#define QUARTERROUND(a,b,c,d) \
    a = PLUS(a,b); d = ROTATE(XOR(d,a),16); \
    c = PLUS(c,d); b = ROTATE(XOR(b,c),12); \
    a = PLUS(a,b); d = ROTATE(XOR(d,a), 8); \
    c = PLUS(c,d); b = ROTATE(XOR(b,c), 7);
```
We see that the loop is split into two groups of `QUARTERROUND` calls,
one with `step=4`:
```
QUARTERROUND(x0, x4, x8, x12);
QUARTERROUND(x1, x5, x9, x13);
QUARTERROUND(x2, x6, x10, x14);
QUARTERROUND(x3, x7, x11, x15);
```
and another with `step=5`:
```
QUARTERROUND(x0, x5, x10, x15);
QUARTERROUND(x1, x6, x11, x12);
QUARTERROUND(x2, x7, x8, x13);
QUARTERROUND(x3, x4, x9, x14);
```
Let's start with the first group of `QUARTERROUND`s. Unrolling it
essentially results in the following instructions (the quarter-rounds
on `x1` and `x2` expand analogously and are elided here):
```
x0 = x0 + x4;    x12 = ROTATE(x12 ^ x0, 16);
x8 = x8 + x12;   x4  = ROTATE(x4 ^ x8, 12);
x0 = x0 + x4;    x12 = ROTATE(x12 ^ x0, 8);
x8 = x8 + x12;   x4  = ROTATE(x4 ^ x8, 7);
...
x3 = x3 + x7;    x15 = ROTATE(x15 ^ x3, 16);
x11 = x11 + x15; x7  = ROTATE(x7 ^ x11, 12);
x3 = x3 + x7;    x15 = ROTATE(x15 ^ x3, 8);
x11 = x11 + x15; x7  = ROTATE(x7 ^ x11, 7);
```
Second group of `QUARTERROUND`s, unrolled (again with the two middle
quarter-rounds elided):
```
x0 = x0 + x5;    x15 = ROTATE(x15 ^ x0, 16);
x10 = x10 + x15; x5  = ROTATE(x5 ^ x10, 12);
x0 = x0 + x5;    x15 = ROTATE(x15 ^ x0, 8);
x10 = x10 + x15; x5  = ROTATE(x5 ^ x10, 7);
...
x3 = x3 + x4;    x14 = ROTATE(x14 ^ x3, 16);
x9 = x9 + x14;   x4  = ROTATE(x4 ^ x9, 12);
x3 = x3 + x4;    x14 = ROTATE(x14 ^ x3, 8);
x9 = x9 + x14;   x4  = ROTATE(x4 ^ x9, 7);
```
Let's list the additions only:
```
x0 = x0 + x4
x8 = x8 + x12
x0 = x0 + x4
x8 = x8 + x12
...
x3 = x3 + x4
x9 = x9 + x14
x3 = x3 + x4
x9 = x9 + x14
```
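The pattern is regular: within each `QUARTERROUND` the destination (and first
source) indices repeat as `a, c, a, c`, and the second-source indices as
`b, d, b, d`. Expanding all eight `QUARTERROUND` calls of one double-round
gives the following two index sequences. These are derived here by hand from
the macro expansion above; the array names are purely illustrative, chosen
only so that we can refer back to them later:
```c
#include <stdint.h>

/* Destination/first-source (a, c) element indices for the 32 additions
   of one double-round, in execution order. */
static const uint8_t add_rt_idx[32] = {
    0,  8, 0,  8,   1,  9, 1,  9,   2, 10, 2, 10,   3, 11, 3, 11,
    0, 10, 0, 10,   1, 11, 1, 11,   2,  8, 2,  8,   3,  9, 3,  9,
};

/* Second-source (b, d) element indices for the same 32 additions. */
static const uint8_t add_rb_idx[32] = {
    4, 12, 4, 12,   5, 13, 5, 13,   6, 14, 6, 14,   7, 15, 7, 15,
    5, 15, 5, 15,   6, 12, 6, 12,   7, 13, 7, 13,   4, 14, 4, 14,
};
```
These are the index sequences that the `svindex` setup described below needs
to provide for the add.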
## Introduction to Vertical-First Mode
In the default Horizontal-First mode of SVP64, an instruction is executed
across a (Horizontal) Vector. So, if we have, for example `VL=8`, then
the instruction
```
sv.add *RT, *RA, *RB # RT = RA + RB
```
will be executed on all 8 elements of the vector, **before** moving to
the next assembly instruction. This behaviour changes in Vertical-First
mode: there, an instruction operates on a *single* element of the vector
before execution moves on to the next instruction, and advancing to the
next element is done explicitly, with the `svstep` instruction.
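To make the difference concrete, here is a minimal C sketch of the two
execution orders, assuming a two-instruction program (an add followed by an
xor); the function and variable names are illustrative only:
```c
#include <stdint.h>

#define VL 8

/* Horizontal-First: each instruction runs over all VL elements
   before the next instruction starts. */
void horizontal_first(uint32_t rt[VL], const uint32_t ra[VL],
                      const uint32_t rb[VL], uint32_t rs[VL]) {
    for (int i = 0; i < VL; i++) rt[i] = ra[i] + rb[i]; /* sv.add */
    for (int i = 0; i < VL; i++) rs[i] = rs[i] ^ rt[i]; /* sv.xor */
}

/* Vertical-First: a single element is pushed through the whole
   instruction sequence, then svstep advances to the next element. */
void vertical_first(uint32_t rt[VL], const uint32_t ra[VL],
                    const uint32_t rb[VL], uint32_t rs[VL]) {
    for (int i = 0; i < VL; i++) {      /* svstep advances i        */
        rt[i] = ra[i] + rb[i];          /* sv.add on element i only */
        rs[i] = rs[i] ^ rt[i];          /* sv.xor on element i only */
    }
}
```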
Now, we can construct the Vertical-First loop:
```
svindex 4, 0, 1, 3, 0, 1, 0 # SVSHAPE0, add RA/RT indices
svindex 6, 1, 1, 3, 0, 1, 0 # SVSHAPE1, add RB indices
setvl 0, 0, 32, 1, 1, 1 # MAXVL=VL=32, VF=1
svremap 31, 1, 0, 0, 0, 0, 0 # RA=1, RB=0, RT=0 (0b01011)
sv.add/w=32 *x, *x, *x # RT, RB: SHAPE0. RA: SHAPE1
svstep. 16, 1, 0 # step to next in-regs element
```
What this code snippet does is the following:
The first instruction
```
svindex 4, 0, 1, 3, 0, 1, 0
```
loads the add `RT` indices into `SVSHAPE0`, from register 8. You will note
that 4 is listed, but that's because `svindex` only works on even register
numbers, so the actual starting register is double that, i.e. GPR #8. The
`3` in the operand list specifies 8-bit index elements.
The next instruction
```
svindex 6, 1, 1, 3, 0, 1, 0
```
loads the add `RB` indices into `SVSHAPE1`. Again, even though we list 6,
the indices will actually be taken starting from GPR #12 (`6*2`), and again
8-bit elements are used.
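Since the indices are 8-bit elements, the 32 indices of one shape fit into
four 64-bit GPRs (8-11 for `SVSHAPE0`, 12-15 for `SVSHAPE1`). As a sketch,
the register images could be prepared like this, assuming the first index
element sits in the least-significant byte of each GPR (that byte ordering
is my assumption here, not something stated in this text):
```c
#include <stdint.h>

/* Pack 8-bit index elements into 64-bit GPR images, first element in
   the least-significant byte (assumed ordering; verify against the
   SVP64 spec or the simulator before relying on it). */
void pack_indices(const uint8_t idx[32], uint64_t gpr[4]) {
    for (int g = 0; g < 4; g++) {
        uint64_t v = 0;
        for (int b = 0; b < 8; b++)
            v |= (uint64_t)idx[g * 8 + b] << (8 * b);
        gpr[g] = v;
    }
}
```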
Next, the `setvl` instruction:
```
setvl 0, 0, 32, 1, 1, 1
```
We have to call `setvl` to set `MAXVL` and `VL` to 32 and also to enable
Vertical-First mode. Afterwards, we have to specify how we intend to use
the indices, and we do this using `svremap`.
```
svremap 31, 1, 0, 0, 0, 0, 0
```
`svremap` basically instructs the scheduler to use `SVSHAPE0` for `RT` and `RB`,
and `SVSHAPE1` for `RA`. (Addition is commutative, so it makes no difference
that the `RA` and `RB` operands end up swapped relative to the index sets we
loaded above.) The next instruction performs the **actual** addition:
```
sv.add/w=32 *x, *x, *x
```
Note the `/w=32` suffix. This instructs the adder to perform the operation
in elements of `w=32` bits. Since the Power CPU is a 64-bit CPU, this means
two 32-bit elements are packed in each 64-bit register; the element numbers
below therefore refer to 32-bit elements, not whole GPRs. If `*x` starts in
GPR 24, for example, in essence this instruction will issue the following
sequence of additions, one per pass of the Vertical-First loop:
```
add/w=32 24 + 0, 24 + 4, 24 + 0
add/w=32 24 + 8, 24 + 12, 24 + 8
add/w=32 24 + 0, 24 + 4, 24 + 0
add/w=32 24 + 8, 24 + 12, 24 + 8
add/w=32 24 + 1, 24 + 5, 24 + 1
add/w=32 24 + 9, 24 + 13, 24 + 9
...
```
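Put differently, treating the state as an array `x[16]` of 32-bit words and
using the illustrative index arrays from earlier, each pass of the
Vertical-First loop performs one remapped addition. A sketch of the
semantics, not of the hardware:
```c
#include <stdint.h>

/* Scalar model of the remapped sv.add/w=32 on Vertical-First step i:
   SVSHAPE0 supplies the destination index, SVSHAPE1 the other source. */
static inline void vf_add_step(uint32_t x[16], int i,
                               const uint8_t add_rt_idx[32],
                               const uint8_t add_rb_idx[32]) {
    x[add_rt_idx[i]] = x[add_rt_idx[i]] + x[add_rb_idx[i]];
}
```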
Finally, the `svstep.` instruction steps to the next element, and with it
to the next set of indices.

So far we have only dealt with the additions. Let's now list the `XOR`
operations of both `QUARTERROUND` groups, treating the `XOR` separately
from the rotate, i.e. as `d = XOR(d, a)` with the `ROTATE` applied
afterwards:
```
x12 = x12 ^ x0
x4  = x4  ^ x8
x12 = x12 ^ x0
x4  = x4  ^ x8
...
x14 = x14 ^ x3
x4  = x4  ^ x9
x14 = x14 ^ x3
x4  = x4  ^ x9
```
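Within each `QUARTERROUND` the destination indices of these `XOR`s follow the
pattern `d, b, d, b`, mirroring the `a, c, a, c` pattern of the additions.
Expanded over all eight quarter-rounds (again derived by hand, with an
illustrative array name):
```c
#include <stdint.h>

/* Destination element indices (the d/b elements) for the 32 XORs of
   one double-round, in execution order. */
static const uint8_t xor_rs_idx[32] = {
    12, 4, 12, 4,   13, 5, 13, 5,   14, 6, 14, 6,   15, 7, 15, 7,
    15, 5, 15, 5,   12, 6, 12, 6,   13, 7, 13, 7,   14, 4, 14, 4,
};
```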
We will need to create another set of indices for the `XOR` instructions. We
will only need one set, as the other operand's indices are the same as the
`RT` indices of the add, already held in `SVSHAPE0`. The indices for the
`ROTATE` instructions are in this case the same as the `XOR` ones. However,
the shift values cycle every 4: 16, 12, 8, 7. For the indices we can again
use `svindex`, like this:
```
svindex 8, 2, 1, 3, 0, 1, 0
```
This again creates an index shape, this time `SVSHAPE2`, operating on 8-bit
elements, starting from GPR #16 (`8*2`). For the shift values, which cycle
every 4 elements, the `svshape2` instruction will be used:
```
svshape2 0, 0, 3, 4, 0, 1
```
This will create `SVSHAPE3`, which applies a modulo-4 index to all of its
elements, so that only the four shift amounts 16, 12, 8, 7 need to be held
in the `SHIFTS` registers and are cycled through for every quarter-round.
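Putting the pieces together, each Vertical-First step then performs an `XOR`
followed by a rotate, which in scalar C terms looks roughly like this (a
sketch using the illustrative arrays from above, with `shifts[] = {16, 12,
8, 7}`):
```c
#include <stdint.h>

/* 32-bit left rotate, the ROTATE of the reference code. */
static inline uint32_t rotl32(uint32_t v, unsigned n) {
    return (v << n) | (v >> (32 - n));
}

/* Scalar model of the remapped sv.xor / sv.rldcl pair on step i:
   SVSHAPE2 supplies the destination index, SVSHAPE0 the xor source,
   and SVSHAPE3 indexes the shift amounts modulo 4. */
static inline void vf_xor_rotate_step(uint32_t x[16], int i,
                                      const uint8_t xor_rs_idx[32],
                                      const uint8_t add_rt_idx[32],
                                      const uint8_t shifts[4]) {
    x[xor_rs_idx[i]] = x[xor_rs_idx[i]] ^ x[add_rt_idx[i]];       /* sv.xor   */
    x[xor_rs_idx[i]] = rotl32(x[xor_rs_idx[i]], shifts[i % 4]);   /* sv.rldcl */
}
```
Now we can list both `XOR` and `ROTATE` instructions in assembly,
together with the respective `svremap` instructions: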
```
svremap 31, 2, 0, 2, 2, 0, 0 # RA=2, RB=0, RS=2 (0b00111)
sv.xor/w=32 *x, *x, *x
svremap 31, 0, 3, 2, 2, 0, 0 # RA=2, RB=3, RS=2 (0b01110)
sv.rldcl/w=32 *x, *x, *SHIFTS, 0
```
So, in a similar fashion, we instruct `XOR` (`sv.xor`) to use `SVSHAPE2` for
`RA` and `RS` and `SVSHAPE0` for `RB`, again for 32-bit elements, while `ROTATE`