From 9caa3bcf6897582da7e22180c0ea1345e3df64c0 Mon Sep 17 00:00:00 2001 From: lkcl Date: Fri, 5 Jan 2024 15:30:07 +0000 Subject: [PATCH] --- openpower/sv/cookbook/fortran_maxloc.mdwn | 50 ++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/openpower/sv/cookbook/fortran_maxloc.mdwn b/openpower/sv/cookbook/fortran_maxloc.mdwn index 089cfaaa1..147341a73 100644 --- a/openpower/sv/cookbook/fortran_maxloc.mdwn +++ b/openpower/sv/cookbook/fortran_maxloc.mdwn @@ -25,12 +25,60 @@ int m2(int * const restrict a, int n) } ``` -Read: +**AVX-512** + +An informative article by Vamsi Sripathi of Intel shows the extent of the problem +faced by SIMD ISAs (in the case below, AVX-512). Significant loop-unrolling is performed, +which leaves blocks that need to be merged: this is carried out with "blending" +instructions. + +Article: NLnet foundation logo +**ARM NEON** + +In a stackexchange post on ARM NEON intrinsics, one developer (Pavel P) wrote the +subroutine below, explaining that it finds the index of a minimum value within +a group of eight unsigned bytes. It is necessary to use a second outer loop +to perform many of these searches in parallel, followed by conditionally +offsetting each of the block-results. + + + +``` +#define VMIN8(x, index, value) \ +do { \ + uint8x8_t m = vpmin_u8(x, x); \ + m = vpmin_u8(m, m); \ + m = vpmin_u8(m, m); \ + uint8x8_t r = vceq_u8(x, m); \ + \ + uint8x8_t z = vand_u8(vmask, r); \ + \ + z = vpadd_u8(z, z); \ + z = vpadd_u8(z, z); \ + z = vpadd_u8(z, z); \ + \ + unsigned u32 = vget_lane_u32(vreinterpret_u32_u8(z), 0); \ + index = __lzcnt(u32); \ + value = vget_lane_u8(m, 0); \ +} while (0) + + +uint8_t v[8] = { ... }; + +static const uint8_t mask[] = { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 }; +uint8x8_t vmask = vld1_u8(mask); + +uint8x8_t v8 = vld1_u8(v); +int ret; +int ret_pos; +VMIN8(v8, ret_pos, ret); +``` + [[!tag svp64_cookbook ]] -- 2.30.2