X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=conferences%2Ffosdem2024%2Ffosdem2024_ddffirst%2Ffosdem2024_ddffirst.tex;h=06301e8998b12505898bd0677ce316c5830e362e;hb=946f05168d76c8040ad982ec649ba8bcd9499c2f;hp=96e032a60fe20f4599a5bb88ee3820a408e7fcb8;hpb=bd67eb0035989caeccfb238de630db2c9373e533;p=libreriscv.git diff --git a/conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex b/conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex index 96e032a60..06301e899 100644 --- a/conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex +++ b/conferences/fosdem2024/fosdem2024_ddffirst/fosdem2024_ddffirst.tex @@ -2,11 +2,13 @@ \usepackage{beamerthemesplit} \usepackage{graphics} \usepackage{pstricks} +\usepackage{pgffor} +\usepackage{listings} \graphicspath{{./}} -\title{The Libre-SOC Hybrid 3D CPU} -\author{Luke Kenneth Casson Leighton} +\title{Data-Dependent-Fail-First} +\author{Luke Kenneth Casson Leighton and Shriya Sharma} \begin{document} @@ -15,11 +17,10 @@ \begin{center} \huge{The Libre-SOC Hybrid 3D CPU}\\ \vspace{32pt} - \Large{Augmenting the OpenPOWER ISA}\\ - \Large{to provide 3D and Video instructions}\\ - \Large{(properly and officially) and make a GPU}\\ + \Large{Data-Dependent-Fail-First}\\ + \vspace{24pt} - \Large{FOSDEM2021}\\ + \Large{FOSDEM2024}\\ \vspace{16pt} \large{Sponsored by NLnet's PET Programme}\\ \vspace{6pt} @@ -49,23 +50,6 @@ } -\frame{\frametitle{Why OpenPOWER?} - -\vspace{15pt} - - \begin{itemize} - \item Good ecosystem essential\\ - linux kernel, u-boot, compilers, OSes,\\ - Reference Implementation(s)\vspace{10pt} - \item Supportive Foundation and Members\\ - need to be able to submit ISA augmentations\\ - (for proper peer review)\vspace{10pt} - \item No NDAs, full transparency must be acceptable\\ - due to being funded under NLnet's PET Programme\vspace{10pt} - \item OpenPOWER: established for decades, excellent Foundation,\\ - Microwatt as Reference, approachable and friendly. - \end{itemize} -} \frame{\frametitle{How can you help?} @@ -100,7 +84,7 @@ heat sink normally not required (simplifies overall design) \vspace{3pt} \item Fully-integrated peripherals (not Northbridge/Southbridge)\\ - USB, HDMI, RGB/TTL, SD/MMC, I2C, UART, SPI, GPIO etc. etc. + USB, HDMI, RGB/TTL, SD/MMC, I2C, UART, SPI, GPIO etc. etc. \vspace{3pt} \item Built-in GPU (shared memory bus, 3rd party licensed) \vspace{3pt} \item Built-in VPU (likewise, proprietary)\vspace{3pt} @@ -114,214 +98,158 @@ -%%\frame{\frametitle{Simple SBC-style SoC} -%% -%%\begin{center} -%%\includegraphics[width=0.9\textwidth]{shakti_libre_soc.jpg} -%%\end{center} - -} +\begin{frame}[fragile] +\frametitle{Simple-V CMPI in a nutshell} -\frame{\frametitle{What's different about Libre-SOC?} +\begin{semiverbatim} +function op\_cmpi(BA, RA, SI) # cmpi not vector-cmpi! + (assuming you know power-isa) +  int i, id=0, ira=0; +  for (i = 0; i < VL; i++) +   CR[BA+id] <= compare(ireg[RA+ira], SI); + if (reg\_is\_vectorised[BA] ) \{ id += 1; \} + if (reg\_is\_vectorised[RA])  \{ ira += 1; \} +\end{semiverbatim} - \begin{itemize} - \item Hybrid - integrated. The CPU \textit{is} the GPU.\\ - The GPU \textit{is} the CPU. The VPU \textit{is} the CPU.\\ - \textit{There is No Separate VPU/GPU Pipeline or Processor}\\ - \vspace{9pt} - \item written in nmigen (a python-based HDL). Not VHDL\\ - not Verilog (definitely not Chisel3/Scala)\\ - This is an extremely important strategic decision.\\ - \vspace{9pt} - \item Simple-V Vector Extension. See `SIMD Considered harmful'.\\ - https://tinyurl.com/simd-considered-harmful\\ - SV effectively a "hardware for-loop" on standard scalar ISA\\ - (conceptually similar to Zero-Overhead Loops in DSPs) - \vspace{6pt} - \item Yes great, but what's different compared to Intel, AMD, NVIDIA, - ARM and IBM? + \begin{itemize} + \item Above is oversimplified: predication etc. left out + \item Scalar-scalar and scalar-vector and vector-vector now all in one + \item OoO may choose to push CMPIs into instr. queue (v. busy!) \end{itemize} -} +\end{frame} -\frame{\frametitle{OpenPOWER Cell Processor and upwards} - \begin{itemize} - \item OpenPOWER ISA developed from PowerPC, with the RS6000 in the 90s. - \vspace{6pt} - \item Sony, IBM and Toshiba began the Cell Processor in 2001 \\ - (Sony Playstation 3) - NUMA approach - \vspace{6pt} - \item Raw brute-force performance pissed all over the competition - at the time - \vspace{6pt} - \item VSX later evolved out of this initiative. - \vspace{6pt} - \item VSX, a SIMD extension, now showing its age. \\ - Fixed-width, no predication, limited pixel formats (15 bit) - \vspace{6pt} - \item (Vulkan requires dozens of pixel formats) - \end{itemize} +\frame{\frametitle{Load/Store Fault-First} + + \begin{itemize} + \item Problem: vector load and store can cause a page fault + \item Solution: a protocol that allows optional load/store + \item instruction \textit{requests} a number of elements + \item instruction \textit{informs} the number actually loaded + \item first element load/store is not optional (cannot fail) + \item ARM SVE: https://arxiv.org/pdf/1803.06185.pdf + \item more: wikipedia Vector processor page: Fault/Fail First + \vspace{10pt} + \item Load/Store is Memory to/from Register, what about + Register to Register? + \item Register-to-register: "Data-Dependent Fail-First." + \item Z80 LDIR: Mem-Register, CPIR: Register-Register + \end{itemize} } -\frame{\frametitle{Apple M1 (ARM) vs Intel / AMD (x86)} +\begin{frame}[fragile] + \frametitle{Data-Dependent-Fail-First in a nutshell} + + \begin{semiverbatim} +function op\_cmpi(BA, RA, SI) # cmpi not vector-cmpi! +int i, id=0, ira=0; +for (i = 0; i < VL; i++) + CR[BA+id] <= compare(ireg[RA+ira], SI); + if (reg\_is\_vectorised[BA] ) \{ id += 1; \} + if (reg\_is\_vectorised[RA])  \{ ira += 1; \} + if test (CR[BA+id]) == FAIL: \{ VL = i + 1; break \} + \end{semiverbatim} + + \begin{itemize} + \item Parallelism still perfectly possible + ("hold" writing results until sequential post-analysis + carried out. Best done with OoO) + \item VL truncation can be inclusive or exclusive + (include or exclude a NULL pointer or a + string-end character, or overflow result) + \item \textit{Truncation can be to zero Vector Length} + \end{itemize} +\end{frame} - \begin{itemize} - \item Very interesting article: tinyurl.com/apple-m1-review - \item Apple M1: uses ARM. Intel: implements x86 - \item Apple M1: RISC multi-issue. Intel: CISC multi-issue. - \item Apple M1: uniform (easy) instruction decode \\ - Intel: \textit{Cannot easily identify start of instruction} - \item Result: multi-issue x86 decoder is so complex, it misses - opportunities to keep back-end execution engines 100 percent - occupied - \item OpenPOWER happens to be RISC (easy decode), which is why POWER10 - has 8-way multi-issue. - \item Libre-SOC can do the same tricks that IBM POWER10 and Apple M1 - can. Intel (x86) literally cannot keep up. +\frame{\frametitle{Power ISA v3.1 vstribr} + + \lstinputlisting[language={}]{vstribr.txt} + + \begin{itemize} + \item ironically this hard-coded instruction is + identical to general-purpose Simple-V DD-FFirst... + \end{itemize} + +}Po + +\frame{\frametitle{maxloc} + \begin{itemize} + \item "TODO \end{itemize} } +\frame{\frametitle{Pospopcount} + + \begin{itemize} + \item Positional popcount adds up the totals of each bit set to 1 in each bit-position, of an array of input values. + \item Notoriously difficult to do in SIMD assembler: typically 550 lines + \item https://github.com/clausecker/pospop -\frame{\frametitle{Hybrid Architecture: Augmented 6600} + \end{itemize} + + \lstinputlisting[language={}]{pospopcount.c} - \begin{itemize} - \item CDC 6600 is a design from 1965. The \textit{augmentations} are not.\\ - Help from Mitch Alsup includes \textit{precise exceptions}, \\ - multi-issue and more. Academic literature on 6600 utterly misleading. - 6600 Scoreboards completely underestimated (Seymour Cray and - James Thornton - solved problems they didn't realise existed elsewhere!) - \item Front-end Vector ISA, back-end "Predicated (masked) SIMD"\\ - nmigen (python OO) strategically critical to achieving this. - \item Out-of-order combined with Simple-V allows scalar operations\\ - at the developer end to be turned into SIMD at the back-end\\ - \textit{without the developer needing to do SIMD} - \item IEEE754 sin / cos / atan2, Texturisation opcodes, YUV2RGB\\ - all automatically vectorised. - \end{itemize} + } -\frame{\frametitle{Learning from these and putting it together} - - \begin{itemize} - \item Apple M1 and IBM POWER10 show that RISC plus superscalar - multi-issue produces insane performance - \item Intel AVX 512 and CISC in general is getting out of hand (what's - next: 256-bit length instructions, AVX 1024?) - \item RISC-V RVV shows Cray-style Vectors can save power. Simple-V - has the same benefits with far less instructions (188 for RVV, - 3 to 5 new instructions for Simple-V). - \item CDC 6600 shows that intelligently-implemented designs can do the - job, with far less resources. - \item Libre-SOC combines the best of historical processor designs, - co-opting and innovating on them (pissing in the back yard of - every incumbent CPU and GPU company in the process). - \item It's a Libre project: you get to help - \end{itemize} +\frame{\frametitle{Pospopcount} + + \begin{center} + \includegraphics[width=0.5\textwidth]{pospopcount.png} + \end{center} + \begin{itemize} + \item The challenge is to perform an appropriate transpose of the data (the CPU can only work on registers, horizontally), + in blocks that suit the processor and the ISA capacity. + + + \end{itemize} } +\frame{\frametitle{Pospopcount} + + \begin{center} + \includegraphics[width=0.6\textwidth]{array_popcnt.png} + \end{center} -\frame{\frametitle{Why nmigen?} + \begin{itemize} - \begin{itemize} - \item Uses python to build an AST (Abstract Syntax Tree). - Actually hands that over to yosys (to create ILANG file) - after which verilog can (if necessary) be created - \item Deterministic synthesiseable behaviour (Signals are declared - with their reset pattern: no more forgetting "if rst" block). - \item python OO programming techniques can be deployed. classes - and functions created which pass in parameters which change - what HDL is created (IEEE754 FP16 / 32 / 64 for example) - \item python-based for-loops can e.g. read CSV files then generate - a hierarchical nested suite of HDL Switch / Case statements - (this is how the Libre-soc PowerISA decoder is implemented) - \item extreme OO abstraction can even be used to create "dynamic - partitioned Signals" that have the same operator-overloaded - "add", "subtract", "greater-than" operators - - \end{itemize} + \item The draft gbbd instruction implements the transpose (shown above), + preparing the data to use standard popcount. + (gbbd is based on Power ISA vgbbd, v3.1 p445) + + \end{itemize} + } -\frame{\frametitle{Why another Vector ISA? (or: not-exactly another)} +\frame{\frametitle{Pospopcount.s} - \begin{itemize} - \item Simple-V is a 'register tag' system. \textit{There are no opcodes}\\ - SV 'tags' scalar operations (scalar regfiles) as 'vectorised' - \item (PowerISA SIMD is around 700 opcodes, making it unlikely to be - able to fit a PowerISA decoder in only one clock cycle) - \item Effectively a 'hardware sub-counter for-loop': pauses the PC\\ - then rolls incrementally through the operand register numbers\\ - issuing \textit{multiple} scalar instructions into the pipelines\\ - (hence the reason for a multi-issue OoO microarchitecture) - \item Current \textit{and future} PowerISA scalar opcodes inherently - \textit{and automatically} become 'vectorised' by SV without - needing an explicit new Vector opcode. - \item Predication and element width polymorphism are also 'tags'. - elwidth polymorphism allows for BF16 / FP16 / 80 / 128 to be added to - the ISA \textit{without modifying the ISA} - - \end{itemize} -} -\frame{\frametitle{Quick refresher on SIMD} +\lstinputlisting[language={}]{pospopcount.s} - \begin{itemize} - \item SIMD very easy to implement (and very seductive) - \item Parallelism is in the ALU - \item Zero-to-Negligeable impact for rest of core - \end{itemize} - Where SIMD Goes Wrong:\vspace{6pt} - \begin{itemize} - \item See "SIMD instructions considered harmful" - https://sigarch.org/simd-instructions-considered-harmful - \item Setup and corner-cases alone are extremely complex.\\ - Hardware is easy, but software is hell.\\ - strncpy VSX patch for POWER9: 250 hand-written asm lines!\\ - (RVV / SimpleV strncpy is 14 instructions) - \item O($N^{6}$) ISA opcode proliferation (1000s of instructions)\\ - opcode, elwidth, veclen, src1-src2-dest hi/lo - \end{itemize} } -\begin{frame}[fragile] -\frametitle{Simple-V ADD in a nutshell} -\begin{semiverbatim} -function op\_add(rd, rs1, rs2, predr) # add not VADD! -  int i, id=0, irs1=0, irs2=0; -  for (i = 0; i < VL; i++) -   if (ireg[predr] & 1<