From 9caa3bcf6897582da7e22180c0ea1345e3df64c0 Mon Sep 17 00:00:00 2001
From: lkcl <lkcl@web>
Date: Fri, 5 Jan 2024 15:30:07 +0000
Subject: [PATCH]

---
 openpower/sv/cookbook/fortran_maxloc.mdwn | 50 ++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/openpower/sv/cookbook/fortran_maxloc.mdwn b/openpower/sv/cookbook/fortran_maxloc.mdwn
index 089cfaaa1..147341a73 100644
--- a/openpower/sv/cookbook/fortran_maxloc.mdwn
+++ b/openpower/sv/cookbook/fortran_maxloc.mdwn
@@ -25,12 +25,60 @@ int m2(int * const restrict a, int n)
 }
 ```
 
-Read:
+**AVX-512**
+
+An informative article by Vamsi Sripathi of Intel shows the extent of the problem
+faced by SIMD ISAs (in the case below, AVX-512). Significant loop unrolling is performed,
+which leaves blocks that need to be merged; this is carried out with "blending"
+instructions.
+
+Article:
 <https://www.intel.com/content/www/us/en/developer/articles/technical/optimizing-maxloc-operation-using-avx-512-vector-instructions.html#gs.12t5y0>
 
 <img src="https://www.intel.com/content/dam/developer/articles/technical/optimizing-maxloc-operation-using-avx-512-vector-instructions/optimizing-maxloc-operation-using-avx-512-vector-instructions-code2.png
 " alt="NLnet foundation logo" width="100%" />
 
+**ARM NEON**
+
+On Stack Overflow, one developer (Pavel P) wrote the subroutine below using
+ARM NEON intrinsics, explaining that it finds the index of the minimum value
+within a group of eight unsigned bytes. A second, outer loop is then needed
+to perform many of these searches in parallel, followed by conditionally
+offsetting each of the block results.
+
+<https://stackoverflow.com/questions/49683866/find-min-and-position-of-the-min-element-in-uint8x8-t-neon-register>
+
+```
+#define VMIN8(x, index, value)                               \
+do {                                                         \
+    uint8x8_t m = vpmin_u8(x, x);                            \
+    m = vpmin_u8(m, m);                                      \
+    m = vpmin_u8(m, m);                                      \
+    uint8x8_t r = vceq_u8(x, m);                             \
+                                                             \
+    uint8x8_t z = vand_u8(vmask, r);                         \
+                                                             \
+    z = vpadd_u8(z, z);                                      \
+    z = vpadd_u8(z, z);                                      \
+    z = vpadd_u8(z, z);                                      \
+                                                             \
+    unsigned u32 = vget_lane_u32(vreinterpret_u32_u8(z), 0); \
+    index = __lzcnt(u32);                                    \
+    value = vget_lane_u8(m, 0);                              \
+} while (0)
+
+
+uint8_t v[8] = { ... };
+
+static const uint8_t mask[] = { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
+uint8x8_t vmask = vld1_u8(mask);
+
+uint8x8_t v8 = vld1_u8(v);
+int ret;
+int ret_pos;
+VMIN8(v8, ret_pos, ret);
+```
+
 [[!tag svp64_cookbook ]]
 
 
-- 
2.30.2