From 9849da14b4d5d86d4a34a416a68d393425646084 Mon Sep 17 00:00:00 2001 From: Luke Kenneth Casson Leighton Date: Wed, 3 May 2023 11:51:46 +0100 Subject: [PATCH] add siliconsalon2023 latex slides --- .../siliconsalon2023/siliconsalon2023.tex | 258 ++++++++++++++++++ 1 file changed, 258 insertions(+) create mode 100644 conferences/siliconsalon2023/siliconsalon2023.tex diff --git a/conferences/siliconsalon2023/siliconsalon2023.tex b/conferences/siliconsalon2023/siliconsalon2023.tex new file mode 100644 index 000000000..2a9956bcb --- /dev/null +++ b/conferences/siliconsalon2023/siliconsalon2023.tex @@ -0,0 +1,258 @@ +\documentclass[slidestop]{beamer} +\usepackage{beamerthemesplit} +\usepackage{graphics} +\usepackage{pstricks} + +\graphicspath{{./}} + +\title{Big Integer Arithmetic Instruction design} +\author{Luke Kenneth Casson Leighton} + + +\begin{document} + +\frame{ + \begin{center} + \huge{Big Integer Arithmetic Instruction design}\\ + \vspace{32pt} + \Large{An analysis of big-integer arithmetic instructions}\\ + \Large{(why not to put all eggs in Custom Silicon basket)}\\ + \vspace{24pt} + \Large{Silicon Salon 2023}\\ + \vspace{16pt} + \large{Sponsored by NLnet's Assure Programme}\\ + \vspace{6pt} + \large{\today} + \end{center} +} + + +\frame{\frametitle{Who are we?} + + \begin{itemize} + \item Libre-SOC: a fully Libre Project with the goal of creating + a Hybrid 3D CPU-VPU-GPU including designing a powerful Vector + Extension (for the Power ISA). https://libre-soc.org + \vspace{6pt} + \item RED Semiconductor Ltd: a commercial realisation of Libre-SOC + designs. https://redsemiconductor.com + \vspace{6pt} + \item Libre-SOC researches and designs instructions that are then + proposed to the OpenPOWER Foundation ISA Technical Workgroup; + RED Semiconductor (as an OPF ISA WG Voting Member) then keeps + an eye on the RFC. + \vspace{6pt} + \item RED Semiconductor Ltd seeks VC funding and commercial business + propositions, Libre-SOC covers Research. 
+ \vspace{6pt} + + \end{itemize} +} + + +\frame{\frametitle{What are the challenges faced by Big-Integer arithmetic?} + + \begin{itemize} + \item Algorithms, especially post-quantum, are now fast-moving. This + does not go down well! It typically takes 5--10 years for an + algorithm to become ``trustable''. + \vspace{6pt} + \item Custom Cryptographic Hardware will typically take 3 years from + design concept to first production silicon; Certification takes even longer. + If a fault is found in the algorithm, the entire investment is wasted. + \vspace{6pt} + \item Performance on 32-bit and 64-bit Embedded Hardware sucks. Algorithms + are roughly $O(N^2)$, which wreaks havoc. The + temptation therefore is to add SIMD instructions or dedicated + ``custom'' instructions, which makes the problem worse. + \vspace{6pt} + \item So how can these polar opposites be reconciled? + \vspace{6pt} + \end{itemize} +} + + +\begin{frame}[fragile]\frametitle{Go back to the algorithms.} + + \begin{itemize} + \item https://libre-soc.org/openpower/sv/biginteger/analysis/ + \item Starting with Knuth's Algorithms D and M: if a True-Scalable + Vector ISA can cope with those, chances are good it'll cope + with more (Karatsuba, and so on). + \item SVP64 has ``looping'' as a primary construct: \\ + loop i 0..VL-1: GPR(RT+i) = ADD(GPR(RA+i), GPR(RB+i))\\ + \vspace{1pt} + \item If, however, Carry-in and Carry-out are included in that, we + have arbitrary-length Big-Integer Vector Add! + \item For all other operations, as long as Scalar-Vector is ok, + it turns out to be possible to do 64-bit carry-in and + 64-bit carry-out, without significant hardware disruption. + \item Irony: all relevant Scalar instructions (shift, mul, div) + usually drop half the result on the floor! + \end{itemize} + +\end{frame} + +\begin{frame}[fragile]\frametitle{Turning add-with-carry into Vector-Add} + + \begin{itemize} + \item Add-with-Carry is the building-block of larger operations + \item Let's simply chain them together. 
+ \item sv.adde (Add-Carry with Vector loop) creates chains + \end{itemize} + + \begin{verbatim} + R0,CA = A0+B0+CA adde r0,a0,b0 + | + +----------+ + | + R1,CA = A1+B1+CA adde r1,a1,b1 + | + +----------+ + | + R2,CA = A2+B2+CA adde r2,a2,b2 + \end{verbatim} + +\end{frame} + +\begin{frame}[fragile]\frametitle{Vector-Scalar Shift} + + \begin{itemize} + \item Shift by 64 bits is just ``pick a register'' + \item Add a 2nd input register with what needs to be shifted IN\\ + (64-bit carry-in) + \item Add a 2nd output register saving what normally gets thrown away\\ + (64-bit carry-out) + \item Again: a chain of these performs Vector-by-Scalar shift + \end{itemize} + + \begin{verbatim} + brs(uint64_t s, uint64_t r[], uint64_t un[], int n) { + for (int i = 0; i < n - 1; i++) + r[i] = (un[i] >> s) | (un[i + 1] << (64 - s)); + r[n - 1] = un[n - 1] >> s; + } + \end{verbatim} + +\end{frame} + +\begin{frame}[fragile]\frametitle{Vector-Scalar Multiply} + + \begin{itemize} + \item Normally in FMAC the top 64 bits are thrown away. + \item What if we stored those 64 bits in a 2nd register?\\ + (64-bit carry-out) + \item And what if the next FMAC added that ``digit'' on?\\ + (64-bit carry-in) + \item Again: a chain of these performs Vector-by-Scalar Multiply + \end{itemize} + + \begin{verbatim} + RT0, RC0 = RA0 * RB0 + 0 + | + +----------------+ + | + RT1, RC1 = RA1 * RB1 + RC0 + | + +----------------+ + | + RT2, RC2 = RA2 * RB2 + RC1 + \end{verbatim} + +\end{frame} + +\begin{frame}[fragile]\frametitle{Vector-Scalar Divide} + + \begin{itemize} + \item Same story, with a special case for overflow. 
+ \end{itemize} + + \begin{verbatim} + RT0 = (( 0<<64) | RA0) / RB0 + RC0 = (( 0<<64) | RA0) % RB0 + | + +-------+ + | + RT1 = ((RC0<<64) | RA1) / RB1 + RC1 = ((RC0<<64) | RA1) % RB1 + | + +-------+ + | + RT2 = ((RC1<<64) | RA2) / RB2 + RC2 = ((RC1<<64) | RA2) % RB2 + \end{verbatim} + +\end{frame} + +\frame{\frametitle{Summary so far} + + \begin{itemize} + \item Extending the usual 1-bit Carry-in Carry-out to 64-bit and + adding a loop-construct inherently turns Scalar operations + into arbitrary-length Vectorised ones. + \item Irony: 30 years ago Power ISA actually had a ``Carry SPR'', where + the normally-discarded upper half of multiply would be placed in + that SPR (it was deprecated). + \item Hardware is NOT made more complex: in all shift, multiply + and divide operations these bits already exist, but are discarded in other ISAs, + which is why you end up with complex carry workarounds. This + gives ISAs a ``bad rep'' for doing Big-int. + \item The ``complication'' is that you need 3-in 2-out instructions, + but actually in Micro-code you can do operand-forwarding. + 1st op: 3-in 1-out. Chain: 2-in 1-out. Last: 2-in 2-out. + \end{itemize} +} + +\frame{\frametitle{OpenTITAN} + + \begin{itemize} + \item https://opentitan.org/book/hw/ip/otbn/index.html + \item 256b wide data path with 32 256b wide registers + \item Zero-Overhead Loop Control would have been better\\ + https://ieeexplore.ieee.org/abstract/document/1692906/ + \item Formal verification completion time is a function of the operation + bit-width. 256-bit is unlikely to complete in reasonable time. + \item 256-bit is great for EC25519 but for RSA (etc.) you run + into exactly the same problem as a Scalar ISA, made worse. + \item Opportunities to optimise algorithms are not available. + \end{itemize} +} + +\frame{\frametitle{Conclusion} + + \begin{itemize} + \item We went back to the algorithms (Knuth D and M) and examined + what they are trying to achieve. 
+ \item Turns out they need a 64-bit carry-in and carry-out + \item Keeping to 64-bit maximum hardware means Formal Proofs complete + in reasonable time (less than the heat-death of the universe) + \item Reasonably straightforward: creates and uses partial results + that are normally thrown away (discarding them means needing more instructions) + \item Freaks out pure-RISC proponents (3-in 2-out) but look at the + number of instructions (and temporary registers) needed + otherwise, and the overall algorithm efficiency, and the + case for these instructions is clear. + \item They also speed up \textbf{general-purpose} code + \end{itemize} +} + +\frame{ + \begin{center} + {\Huge The end\\ + Thank you\\ + Questions?\\\vspace{5pt} + } + \end{center} + + \begin{itemize} + \item https://redsemiconductor.com + \item Discussion: http://lists.libre-soc.org + \item Libera.Chat IRC \#libre-soc + \item http://libre-soc.org/ + \item http://nlnet.nl/assure + \item https://libre-soc.org/nlnet/\#faq + \end{itemize} +} + + +\end{document} -- 2.30.2