From: Shriya Sharma <shriya@redsemiconductor.com>
Date: Mon, 8 Jan 2024 12:19:57 +0000 (+0000)
Subject: Bug1244:added code for pospopcount
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=1e46f31c338a297db695fd447723727f6973e330;p=libreriscv.git

Bug1244:added code for pospopcount
---

diff --git a/conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex b/conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex
index c699eed90..fcdccdbc5 100644
--- a/conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex
+++ b/conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex
@@ -180,9 +180,34 @@ function op\_cmpi(BA, RA, SI) # cmpi not vector-cmpi!
   \end{itemize}
 }
 \frame{\frametitle{Pospopcount}
-  \begin{itemize}
-	\item "TODO
-  \end{itemize}	
+	\begin{semiverbatim}
+	// Copyright (c) 2020 Robert Clausecker <fuz@fuz.su>
+	// count8 reference implementation for tests.  Do not alter.
+	func count8safe(counts *[8]int, buf []uint8) \{
+		for i := range buf \{
+			for j := 0; j < 8; j++ \{
+				counts[j] += int(buf[i] >> j & 1)
+			\}
+		\}
+	\}
+
+   A simple but still hardware-parallelisable SVP64 assembler for 8-bit input values (count8safe) is as follows:
+
+	mtspr 9, 3                 # move r3 to CTR
+	setvl 3,0,8,0,1,1          # set MVL=8, VL=r3=MIN(MVL,CTR)
+	# load VL bytes (update r4 addr) but compressed (dw=8)
+	addi 6, 0, 0               # initialise all 64-bits of r6 to zero
+	sv.lbzu/pi/dw=8 *6, 1(4)   # should be /lf here as well
+	# gather performs the transpose (which gets us to positional..)
+	gbbd 8,6
+	# now those bits have been turned around, popcount and sum them
+	setvl 0,0,8,0,1,1          # set MVL=VL=8
+	sv.popcntd/sw=8 *24,*8     # do the (now transposed) popcount
+	sv.add *16,*16,*24         # and accumulate in results
+	# branch back if CTR still non-zero. works even though VL=8
+	sv.bc/all 16, *0, -0x28   # reduce CTR by VL and stop if -ve
+	\end{semiverbatim}
+
 }
 \frame{\frametitle{strncpy}
   \begin{itemize}