From a6da490bb899a68ecce686e36d53a185d4d170ed Mon Sep 17 00:00:00 2001
From: Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Date: Wed, 17 May 2023 19:01:55 +0000
Subject: [PATCH] update to OpenSearch2023 paper

---
 conferences/opensearch2023/opensearch2023.tex | 112 ++++++++++++++++--
 1 file changed, 102 insertions(+), 10 deletions(-)

diff --git a/conferences/opensearch2023/opensearch2023.tex b/conferences/opensearch2023/opensearch2023.tex
index 2486b6b0e..da25dfc7e 100644
--- a/conferences/opensearch2023/opensearch2023.tex
+++ b/conferences/opensearch2023/opensearch2023.tex
@@ -102,7 +102,7 @@ of Scalar Loop Construct.  This is what SIMD and normal Vector ISAs
 look like:
 
 \begin{verbatim}
-	for i in range(SIMDlength):
+    for i in range(SIMDlength):
         VR(RT)[i] = VR(RA)[i] + VR(RB)[i]
 \end{verbatim}
 
@@ -119,11 +119,11 @@ is a 50-year invention dating back to Zilog Z80 CPIR and LDIR.
 
 \begin{verbatim}
     for i in range(VL):
-        if predicate.bit[i] clear:
+        if predicate.bit[i] clear: # skip?
            continue
         GPR(RT+i) = GPR(RA+i) + GPR(RB+i)
-        if CCTest(GPR(RT+i)) is failure:
-            VL = i
+        if CCTest(GPR(RT+i)) fails: # end?
+            VL = i # truncate the Vector
             break
 \end{verbatim}
 
@@ -159,11 +159,57 @@ usual hassle with SIMD - often compensated for with hard-coded
 dedicated "Memory copy" or "String copy" instructions that cannot be
 leveraged for any other purpose, goes away.
 
+\section{strncpy}
+
+strncpy presents some unique challenges for an ISA and hardware,
+the primary being that in a SIMD (parallel) context, strncpy
+operates in bytes where SIMD operates in power-of-two multiples
+only.  PackedSIMD is the worst offender: PredicatedSIMD is better.
+If SIMD Load and Store has to start on an Aligned Memory location
+things get even worse.  The operations that were supposed to speed
+up algorithms have to have "preamble" and "postamble" to take care
+of the corner-cases.
+
+Worse, a naive SIMD ISA cannot have Conditional inter-relationships.
+64-byte or 128-byte-wide LOADs either succeed in full or they fail
+in full.  If the strncpy subroutine happens to copy from the last
+few bytes in memory, SIMD LOADs are the worst thing to use.
+We need a way to Conditionally terminate the LOAD and inform the
+Programmer, and this is where Load-Fault-First comes into play.
+
+However, even this is not enough: once LOADed, it is necessary
+first to spot the NUL character and, once it is identified, to begin
+copying NUL characters from that point onwards.
+
+\begin{verbatim}
+     for (i = 0; i < n && src[i] != '\0'; i++)
+         dest[i] = src[i];
+     for ( ; i < n; i++)
+        dest[i] = '\0';
+\end{verbatim}
+
+Performing such a conditional NUL-character search in a SIMD ISA
+is typically extremely convoluted.  A usual approach would be
+to perform a Parallel compare against NUL (easy enough) followed
+by an instruction that then searches sequentially for the first
+fail, followed by another instruction that explicitly truncates
+the Vector Length, followed finally by the actual STORE.
+
+\textit{All of this sequential search-and-truncate behaviour} is part
+of the Data-Dependent Fail-First Mode that is a first-order construct
+in SVP64.  When applied to the \textbf{sv.cmpi} instruction,
+which produces a Vector of Condition Codes (as opposed to just
+one for the Scalar \textbf{cmpi} instruction),
+the search for the NUL character truncates the Vector Length
+at the required point, such that the next instruction (STORE)
+is already set up to copy up to and including the NUL
+(if one was indeed found).
+
 \begin{verbatim}
      mtspr 9, 3   # move r3 to CTR
      addi 0,0,0   # initialise r0 to zero
      # chr-copy loop starts here:
-     # for (i = 0; i < n && src[i] != '\0'; i++)
+     # for (i=0; i<n && src[i] != '\0'; i++)
      #    dest[i] = src[i];
      # VL (and r1) = MIN(CTR,MAXVL=4)
      setvl 1,0,MAXVL,0,1,1
@@ -186,11 +232,57 @@ leveraged for any other purpose, goes away.
      sv.bc 16, *0, -0xc
 \end{verbatim}
 
+The next most important addition to SVP64 is a Vector-aware
+Branch-Conditional instruction.  Where \textbf{sv.cmpi} had
+created a Vector of Condition Codes, \textbf{sv.bc/all}
+will Branch back to continue loading/copying of bytes if and
+only if no NUL was found and there are more characters to copy.
+
+A normal ISA would not have such parallel Condition Code
+Branch instructions. It would perhaps have a way to reduce
+a batch of parallel Condition Codes down to a \textit{single}
+Condition Code, and then use a \textit{Scalar} Branch-Conditional.
+Additionally, in SVP64 the opportunity is taken to reduce the \textbf{CTR}
+Special Purpose Register by the (run-time truncated) Vector Length,
+saving the Programmer from having to explicitly copy the Vector
+Length into a GPR, explicitly subtract that from a copy of CTR, then
+explicitly copy the subtraction result back into CTR.
+
+The end-result of these enhancements is an overwhelmingly-compact
+\textit{general-purpose} Vector ISA that effectively did nothing
+more complex than bring back 50-year-old concepts from 8-bit
+micro-processors.  With the high reduction in program size comes
+a high "bang-per-buck" ratio that often allows the core inner
+loop (in this case the entire strncpy subroutine) to fit into a
+single L1 Cache Line, avoiding TLB misses and thus significantly
+saving on power consumption as well as potential operational delays.
+
 \flushcolsend
 
 
-\section{APPENDIX}
-TODO
+\section{Conclusion}
+Our goal as part of NGI Search is to validate that the approach
+taken above works across multiple algorithms.  VectorScan~\cite{vectorscan} was
+chosen as a high-value library due to the sheer overwhelming
+complexity needed for other ISAs.  libc6 was also chosen as it
+is such a low-level library that any Search algorithm utilising
+it would benefit from increased compactness and efficiency.
+
+SVP64 very deliberately chose a design paradigm in which only
+general-purpose constructs are added. There are no hard-coded
+dedicated specialist "memory copy" instructions, with the
+crucial side-effect that a \textbf{strncpyW} subroutine
+(a UCS-2 variant of strncpy) is simply a matter of using
+general-purpose 16-bit cmp and
+16-bit LOAD/STORE instead of general-purpose
+8-bit cmp and 8-bit LOAD/STORE.
+
+Thus it is anticipated that future programmers may be freed
+from many of the limitations inherent in other ISAs, by being
+able to express high-level language constructs much more directly,
+cleanly and clearly in SVP64 Assembler, all whilst retaining
+an all-important \textit{general-purpose} Sequential Programming
+paradigm.
 
 %
 % only for "biblatex"
@@ -203,9 +295,9 @@ TODO
 	%\begin{thebibliography}{99}   % Use for  10-99  references
 	\begin{thebibliography}{9} % Use for 1-9 references
 	
-	\bibitem{jacow-help}
-		JACoW,
-		\url{http://www.jacow.org}
+	\bibitem{vectorscan}
+		VectorScan, \\
+		\url{https://github.com/VectorCamp/vectorscan}
 	
 	%%\bibitem{IEEE}
 	%%	\textit{IEEE Editorial Style Manual},
-- 
2.30.2