X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=simple_v_extension%2Fsimple_v_chennai_2018.tex;h=33b2a3fb85e74b7edc9d280f5a9e54cb7937b4df;hb=9186251687c57fbc16a4f4d355e98b2f9020447f;hp=fb01060f30a538eba45a19fb2cd7422b40994d8f;hpb=521b44ac746f96f83dfad32b1e7c303ce8a7093b;p=libreriscv.git diff --git a/simple_v_extension/simple_v_chennai_2018.tex b/simple_v_extension/simple_v_chennai_2018.tex index fb01060f3..33b2a3fb8 100644 --- a/simple_v_extension/simple_v_chennai_2018.tex +++ b/simple_v_extension/simple_v_chennai_2018.tex @@ -11,13 +11,14 @@ \frame{ \begin{center} - \huge{Simple-V RISC-V Extension for Vectors and SIMD}\\ + \huge{Simple-V RISC-V Parallelism Abstraction Extension}\\ \vspace{32pt} \Large{Flexible Vectorisation}\\ \Large{(aka not so Simple-V?)}\\ + \Large{(aka A Parallelism API for the RISC-V ISA)}\\ \vspace{24pt} \Large{[proposed for] Chennai 9th RISC-V Workshop}\\ - \vspace{24pt} + \vspace{16pt} \large{\today} \end{center} } @@ -28,174 +29,380 @@ \begin{itemize} \item The Designers of RISC-V\vspace{15pt} \item The RVV Working Group and contributors\vspace{15pt} - \item Jacob Bachmeyer, Xan Phung, Chuanhua Chang and others\vspace{15pt} + \item Allen Baum, Jacob Bachmeyer, Xan Phung, Chuanhua Chang,\\ + Guy Lemurieux, Jonathan Neuschafer, Roger Brussee, + and others\vspace{15pt} \item ISA-Dev Group Members\vspace{10pt} \end{itemize} } -\frame{\frametitle{The Simon Sinek lowdown (Why, How, What)} +\frame{\frametitle{Quick refresher on SIMD} \begin{itemize} - \item Vectorisation needs to fit an implementor's scope:\\ - RV32E, Embedded/Mobile, DSP, Servers and more.\vspace{15pt} - \item By implicitly marking INT/FP regs as "Vectorised",\\ - everything else follows from there.\vspace{15pt} - \item A Standard Vector "API" with flexibility for implementors:\\ - choice to optimise for area or performance as desired\vspace{10pt} + \item SIMD very easy to implement (and very seductive)\vspace{8pt} + \item Parallelism is in the ALU\vspace{8pt} + \item 
Zero-to-Negligible impact for rest of core\vspace{8pt} + \end{itemize} + Where SIMD Goes Wrong:\vspace{10pt} + \begin{itemize} + \item See "SIMD instructions considered harmful" + https://sigarch.org/simd-instructions-considered-harmful + \item Setup and corner-cases alone are extremely complex.\\ + Hardware is easy, but software is hell. + \item O($N^{6}$) ISA opcode proliferation!\\ + opcode, elwidth, veclen, src1-src2-dest hi/lo \end{itemize} } - -\frame{\frametitle{Why another Vector Extension?} +\frame{\frametitle{Quick refresher on RVV} \begin{itemize} - \item RVV very heavy-duty (excellent for supercomputing)\vspace{10pt} - \item Simple-V abstracts parallelism (based on best of RVV)\vspace{10pt} - \item Graded levels: hardware, hybrid or traps\vspace{10pt} - \item Even Compressed instructions become vectorised\vspace{10pt} + \item Effectively a variant of SIMD / SIMT (arbitrary length)\vspace{4pt} + \item Extremely powerful (extensible to 256 registers)\vspace{4pt} + \item Supports polymorphism, several datatypes (inc. FP16)\vspace{4pt} + \item Requires a separate Register File (32 w/ext to 256)\vspace{4pt} + \item Implemented as a separate pipeline (no impact on scalar) \end{itemize} - What Simple-V is not:\vspace{10pt} + However... \begin{itemize} - \item A full supercomputer-level Vector Proposal\vspace{10pt} - \item A replacement for RVV (designed to be augmented)\vspace{10pt} + \item 98 percent opcode duplication with rest of RV (CLIP) + \item Extending RVV requires customisation not just of h/w:\\ + gcc, binutils also need customisation (and maintenance) + \item Fascinatingly, despite being a SIMD-variant, RVV only has + O(1) opcode proliferation! 
(extremely well designed) \end{itemize} } -\frame{\frametitle{Quick refresher on SIMD} +\frame{\frametitle{The Simon Sinek lowdown (Why, How, What)} \begin{itemize} - \item SIMD very easy to implement (and very seductive)\vspace{10pt} - \item Parallelism is in the ALU\vspace{10pt} - \item Zero-to-Negligeable impact for rest of core\vspace{10pt} + \item Why? + Implementors need flexibility in vectorisation to optimise for + area or performance depending on the scope: + embedded DSP, Mobile GPUs, Server CPUs and more.\\ + Compilers also need flexibility in vectorisation to optimise for cost + of pipeline setup, amount of state to context switch + and software portability. + \item How? + By marking INT/FP regs as "Vectorised" and + adding a level of indirection, + SV expresses how existing instructions should act + on [contiguous] blocks of registers, in parallel, WITHOUT + needing any new extra arithmetic opcodes. + \item What? + Simple-V is an "API" that implicitly extends + existing (scalar) instructions with explicit parallelisation\\ + i.e. SV is actually about parallelism NOT vectors per se.\\ + Has a lot in common with VLIW (without the actual VLIW). \end{itemize} - Where SIMD Goes Wrong:\vspace{10pt} +} + + +\frame{\frametitle{What's the value of SV? Why adopt it even in non-V?} + + \begin{itemize} + \item memcpy becomes much smaller (higher bang-per-buck) + \item context-switch (LOAD/STORE multiple): 1-2 instructions + \item Compressed instrs further reduce I-cache (etc.) + \item Reduced I-cache load (and fewer I-reads) + \item Amazingly, SIMD becomes tolerable (no corner-cases) + \item Modularity/Abstraction in both the h/w and the toolchain. + \item "Reach" of registers accessible by Compressed is enhanced + \item Future: double the standard INT/FP register file sizes. 
+ \end{itemize} + Note: \begin{itemize} - \item See "SIMD instructions considered harmful" - https://www.sigarch.org/simd-instructions-considered-harmful - \item Corner-cases alone are extremely complex.\\ - Hardware is easy, but software is hell. - \item O($N^{6}$) ISA opcode proliferation!\\ - opcode, elwidth, veclen, src1-src2-dest hi/lo + \item It's not just about Vectors: it's about instruction effectiveness + \item Anything an implementor is not interested in HW-optimising,\\ + let it fall through to exceptions (implement as a trap). \end{itemize} } -\frame{\frametitle{Quick refresher on RVV} + +\frame{\frametitle{How does Simple-V relate to RVV? What's different?} \begin{itemize} - \item Extremely powerful (extensible to 256 registers)\vspace{10pt} - \item Supports polymorphism, several datatypes (inc. FP16)\vspace{10pt} - \item Requires a separate Register File\vspace{10pt} - \item Can be implemented as a separate pipeline\vspace{10pt} + \item RVV very heavy-duty (excellent for supercomputing)\vspace{8pt} + \item Simple-V abstracts parallelism (based on best of RVV)\vspace{8pt} + \item Graded levels: hardware, hybrid or traps (fit impl. 
need)\vspace{8pt} + \item Even Compressed become vectorised (RVV can't)\vspace{8pt} + \item No polymorphism in SV (too complex)\vspace{8pt} \end{itemize} - However...\vspace{10pt} + What Simple-V is not:\vspace{4pt} \begin{itemize} - \item 98 percent opcode duplication with rest of RV (CLIP) - \item Extending RVV requires customisation not just of h/w:\\ - gcc and s/w also need customisation (and maintenance) + \item A full supercomputer-level Vector Proposal + \item A replacement for RVV (SV is designed to be over-ridden\\ + by - or augmented to become - RVV) \end{itemize} } -\frame{\frametitle{How is Parallelism abstracted?} +\frame{\frametitle{How is Parallelism abstracted in Simple-V?} \begin{itemize} - \item Almost all opcodes removed in favour of implicit "typing"\vspace{10pt} - \item Primarily at the Instruction issue phase (except SIMD)\vspace{10pt} - \item Standard (and future, and custom) opcodes now parallel\vspace{10pt} - \end{itemize} - Notes:\vspace{10pt} + \item Register "typing" turns any op into an implicit Vector op:\\ + registers are reinterpreted through a level of indirection + \item Primarily at the Instruction issue phase (except SIMD)\\ + Note: it's ok to pass predication through to ALU (like SIMD) + \item Standard and future and custom opcodes now parallel\\ + (crucially: with NO extra instructions needing to be added) + \end{itemize} + Note: EVERYTHING is parallelised: \begin{itemize} - \item LOAD/STORE (inc. C.LD and C.ST, LDX: everything) - \item All ALU ops (soft / hybrid / full HW, on per-op basis) - \item All branches become predication targets (C.FNE added) + \item All LOAD/STORE (inc. Compressed, Int/FP versions) + \item All ALU ops (Int, FP, SIMD, DSP, everything) + \item All branches become predication targets (C.FNE added?) \item C.MV of particular interest (s/v, v/v, v/s) + \item FCVT, FMV, FSGNJ etc. 
very similar to C.MV \end{itemize} } -\frame{\frametitle{Implementation Options} +\frame{\frametitle{What's the deal / juice / score?} \begin{itemize} - \item Absolute minimum: Exceptions (if CSRs indicate "V", trap)\vspace{10pt} - \item Hardware loop, single-instruction issue\vspace{10pt} - \item Hardware loop, parallel (multi-instruction) issue\vspace{10pt} - \item Hardware loop, full parallel ALU (not recommended)\vspace{10pt} + \item Standard Register File(s) overloaded with CSR "reg is vector"\\ + (see pseudocode slides for examples) + \item "2nd FP\&INT register bank" possibility, reserved for future\\ + (would allow standard regfiles to remain unmodified) + \item Element width concept remain same as RVV\\ + (CSRs give new size: overrides opcode-defined meaning) + \item CSRs are key-value tables (overlaps allowed: v. important) + \end{itemize} + Key differences from RVV: + \begin{itemize} + \item Predication in INT reg as a BIT field (max VL=XLEN) + \item Minimum VL must be Num Regs - 1 (all regs single LD/ST) + \item SV may condense sparse Vecs: RVV cannot (SIMD-like):\\ + SV gives choice to Zero or skip non-predicated elements\\ + (no such choice in RVV: zeroing-only) \end{itemize} - Notes:\vspace{10pt} +} + + +\begin{frame}[fragile] +\frametitle{ADD pseudocode (or trap, or actual hardware loop)} + +\begin{semiverbatim} +function op\_add(rd, rs1, rs2, predr) # add not VADD! +  int i, id=0, irs1=0, irs2=0; +  for (i = 0; i < VL; i++) +   if (ireg[predr] & 1< 1; -s2 = vectorlen[src2] > 1; -for (int i = 0; i < VL; ++i) - preg[rs3] |= 1 << cmp(s1 ? reg[src1+i] : reg[src1], - s2 ? reg[src2+i] : reg[src2]); +function op\_mv(rd, rs) # MV not VMV! +  rd = int\_vec[rd].isvector ? int\_vec[rd].regidx : rd; +  rs = int\_vec[rs].isvector ? int\_vec[rs].regidx : rs; +  ps = get\_pred\_val(FALSE, rs); # predication on src +  pd = get\_pred\_val(FALSE, rd); # ... AND on dest +  for (int i = 0, int j = 0; i < VL && j < VL;): + if (int\_vec[rs].isvec) while (!(ps \& 1<