\usepackage{beamerthemesplit}
\usepackage{graphics}
\usepackage{pstricks}
+\usepackage{pgffor}
+\usepackage{listings}
\graphicspath{{./}}
-%%\frame{\frametitle{Simple SBC-style SoC}
-%%
-%%\begin{center}
-%%\includegraphics[width=0.9\textwidth]{shakti_libre_soc.jpg}
-%%\end{center}
-
-%%}
-
-
-
-
\begin{frame}[fragile]
-\frametitle{Simple-V ADD in a nutshell}
+\frametitle{Simple-V CMPI in a nutshell}
\begin{semiverbatim}
-function op\_add(rd, rs1, rs2, predr) # add not VADD!
- int i, id=0, irs1=0, irs2=0;
+function op\_cmpi(BA, RA, SI) # cmpi not vector-cmpi!
+ (assuming you know power-isa)
+ int i, id=0, ira=0;
for (i = 0; i < VL; i++)
- if (ireg[predr] & 1<<i) # predication uses intregs
- ireg[rd+id] <= ireg[rs1+irs1] + ireg[rs2+irs2];
- if (reg\_is\_vectorised[rd] ) \{ id += 1; \}
- if (reg\_is\_vectorised[rs1]) \{ irs1 += 1; \}
- if (reg\_is\_vectorised[rs2]) \{ irs2 += 1; \}
+ CR[BA+id] <= compare(ireg[RA+ira], SI);
+ if (reg\_is\_vectorised[BA] ) \{ id += 1; \}
+ if (reg\_is\_vectorised[RA]) \{ ira += 1; \}
\end{semiverbatim}
\begin{itemize}
- \item Above is oversimplified: Reg. indirection left out (for clarity).
- \item SIMD slightly more complex (case above is elwidth = default)
+ \item Above is oversimplified: predication etc. left out
\item Scalar-scalar and scalar-vector and vector-vector now all in one
- \item OoO may choose to push ADDs into instr. queue (v. busy!)
+ \item OoO may choose to push CMPIs into instr. queue (v. busy!)
\end{itemize}
\end{frame}
-\frame{\frametitle{Additional Simple-V features}
- \begin{itemize}
- \item "fail-on-first" (POWER9 VSX strncpy segfaults on boundary!)
- \item "Twin Predication" (covers VSPLAT, VGATHER, VSCATTER, VINDEX etc.)
- \item SVP64: extensive "tag" (Vector context) augmentation
- \item "Context propagation": a VLIW-like context. Allows contexts
- to be repeatedly applied.
- Effectively a "hardware compression algorithm" for ISAs.
- \item Ultimate goal: cut down I-Cache usage, cuts down on power
- \item Typical GPU has its own I-Cache and small shaders.\\
- \textit{We are a Hybrid CPU/GPU: I-Cache is not separate!}
- \item Needs to go through OpenPOWER Foundation `approval'
+\frame{\frametitle{Load/Store Fault-First}
+
+ \begin{itemize}
+ \item Problem: vector load and store can cause a page fault
+ \item Solution: a protocol that allows optional load/store
+ \item instruction \textit{requests} a number of elements
+ \item instruction \textit{informs} the number actually loaded
+ \item first element load/store is not optional (cannot fail)
+ \item ARM SVE: \url{https://arxiv.org/pdf/1803.06185.pdf}
+ \item more: wikipedia Vector processor page: Fault/Fail First
+ \vspace{10pt}
+ \item Load/Store is Memory to/from Register, what about
+ Register to Register?
+ \item Register-to-register: ``Data-Dependent Fail-First.''
+ \item Z80 LDIR: Mem-Register, CPIR: Register-Register
+ \end{itemize}
+}
+
+\begin{frame}[fragile]
+ \frametitle{Data-Dependent-Fail-First in a nutshell}
+
+ \begin{semiverbatim}
+function op\_cmpi(BA, RA, SI) # cmpi not vector-cmpi!
+int i, id=0, ira=0;
+for (i = 0; i < VL; i++)
+ CR[BA+id] <= compare(ireg[RA+ira], SI);
+ if test (CR[BA+id]) == FAIL: \{ VL = i + 1; break \}
+ if (reg\_is\_vectorised[BA] ) \{ id += 1; \}
+ if (reg\_is\_vectorised[RA]) \{ ira += 1; \}
+ \end{semiverbatim}
+
+ \begin{itemize}
+ \item Parallelism still perfectly possible
+ (``hold'' writing results until sequential post-analysis
+ carried out. Best done with OoO)
+ \item VL truncation can be inclusive or exclusive
+ (include or exclude a NULL pointer or a
+ string-end character, or overflow result)
+ \item \textit{Truncation can be to zero Vector Length}
+ \end{itemize}
+\end{frame}
+
+\frame{\frametitle{Power ISA v3.1 vstribr}
+
+ \lstinputlisting[language={}]{vstribr.txt}
+
+ \begin{itemize}
+ \item ironically this hard-coded instruction is
+ identical to general-purpose Simple-V DD-FFirst...
+ \end{itemize}
+
+}
+
+\frame{\frametitle{maxloc}
+ \begin{itemize}
+ \item TODO
\end{itemize}
}
+\frame{\frametitle{Pospopcount}
+
+ \begin{itemize}
+ \item Positional popcount counts, for each bit-position, how many values in an array of inputs have that bit set to 1.
+ \item Notoriously difficult to do in SIMD assembler: typically 550 lines
+
+ \end{itemize}
+
+ \lstinputlisting[language={}]{pospopcount.c}
+
+}
+
+\frame{\frametitle{Pospopcount}
+
+ \begin{center}
+ \includegraphics[width=0.6\textwidth]{pospopcount.png}
+ \end{center}
+
+}
+
+\frame{\frametitle{Pospopcount}
+
+ \begin{center}
+ \includegraphics[width=0.6\textwidth]{array_popcnt.png}
+ \end{center}
+ \begin{itemize}
+ \item The challenge is to perform an appropriate transpose of the data,
+ in blocks that suit the processor and the ISA capacity.
+ \item The draft gbbd instruction implements the transpose,
+ preparing the data for using the standard popcount instruction.
+
+
+ \end{itemize}
+
+}
+
+\frame{\frametitle{Pospopcount.s}
+
+
+\lstinputlisting[language={}]{pospopcount.s}
+
+}
+
+
+\frame{\frametitle{strncpy}
+
+ \lstinputlisting[language={}]{strncpy.c}
+ \begin{itemize}
+ \item TODO
+ \end{itemize}
+}
+
+
+
+\frame{\frametitle{strncpy assembler}
+
+\lstinputlisting[language={}]{strncpy.s}
+
+}
+
+\frame{\frametitle{linked-list walking}
+ \begin{itemize}
+ \item TODO
+ \end{itemize}
+}
\frame{\frametitle{Summary}
\begin{itemize}