X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=simple_v_extension%2Fsimple_v_chennai_2018.tex;h=180519bea82f022c0d1b3f2dd5ffc91326358497;hb=27ab55401456175de16e2f0700a10beeeeb2a52c;hp=b0561901007c781f29b81b0c6216ef09d8a82d6d;hpb=c0580a67e43d4bdaa0dd7c8ee85da4033674866e;p=libreriscv.git

diff --git a/simple_v_extension/simple_v_chennai_2018.tex b/simple_v_extension/simple_v_chennai_2018.tex
index b05619010..180519bea 100644
--- a/simple_v_extension/simple_v_chennai_2018.tex
+++ b/simple_v_extension/simple_v_chennai_2018.tex
@@ -15,9 +15,10 @@
 \vspace{32pt}
 \Large{Flexible Vectorisation}\\
 \Large{(aka not so Simple-V?)}\\
+ \Large{(aka How to Parallelise the RISC-V ISA)}\\
 \vspace{24pt}
 \Large{[proposed for] Chennai 9th RISC-V Workshop}\\
- \vspace{24pt}
+ \vspace{16pt}
 \large{\today}
 \end{center}
 }
@@ -28,8 +29,8 @@
 \begin{itemize}
 \item The Designers of RISC-V\vspace{15pt}
 \item The RVV Working Group and contributors\vspace{15pt}
- \item Jacob Bachmeyer, Xan Phung, Chuanhua Chang,\\
- Guy Lemurieux, Jonathan Neuschafer, Roger Bruisse,
+ \item Allen Baum, Jacob Bachmeyer, Xan Phung, Chuanhua Chang,\\
+ Guy Lemurieux, Jonathan Neuschafer, Roger Brussee,
 and others\vspace{15pt}
 \item ISA-Dev Group Members\vspace{10pt}
 \end{itemize}
@@ -39,15 +40,15 @@
 \frame{\frametitle{Quick refresher on SIMD}
 \begin{itemize}
- \item SIMD very easy to implement (and very seductive)\vspace{10pt}
- \item Parallelism is in the ALU\vspace{10pt}
- \item Zero-to-Negligeable impact for rest of core\vspace{10pt}
+ \item SIMD very easy to implement (and very seductive)\vspace{8pt}
+ \item Parallelism is in the ALU\vspace{8pt}
+ \item Zero-to-Negligible impact on rest of core\vspace{8pt}
 \end{itemize}
 Where SIMD Goes Wrong:\vspace{10pt}
 \begin{itemize}
 \item See "SIMD instructions considered harmful"
- https://www.sigarch.org/simd-instructions-considered-harmful
- \item Corner-cases alone are extremely complex.\\
+ https://sigarch.org/simd-instructions-considered-harmful
+ \item Setup and corner-cases alone are extremely complex.\\
 Hardware is easy, but software is hell.
 \item O($N^{6}$) ISA opcode proliferation!\\
 opcode, elwidth, veclen, src1-src2-dest hi/lo
@@ -59,14 +60,14 @@
 \begin{itemize}
 \item Extremely powerful (extensible to 256 registers)\vspace{10pt}
 \item Supports polymorphism, several datatypes (inc. FP16)\vspace{10pt}
- \item Requires a separate Register File\vspace{10pt}
- \item Can be implemented as a separate pipeline\vspace{10pt}
+ \item Requires a separate Register File (16 w/ext to 256)\vspace{10pt}
+ \item Implemented as a separate pipeline (no impact on scalar)\vspace{10pt}
 \end{itemize}
 However...\vspace{10pt}
 \begin{itemize}
 \item 98 percent opcode duplication with rest of RV (CLIP)
 \item Extending RVV requires customisation not just of h/w:\\
- gcc and s/w also need customisation (and maintenance)
+ gcc, binutils also need customisation (and maintenance)
 \end{itemize}
 }
@@ -82,29 +83,52 @@
 of pipeline setup, amount of state to context switch
 and software portability\vspace{4pt}
 \item How?
- By implicitly marking INT/FP regs as "Vectorised",\\
+ By marking INT/FP regs as "Vectorised" and
+ adding a level of indirection,
 SV expresses how existing instructions should act
 on [contiguous] blocks of registers, in parallel.\vspace{4pt}
 \item What?
 Simple-V is an "API" that implicitly extends
- existing (scalar) instructions with explicit parallelisation.
+ existing (scalar) instructions with explicit parallelisation\\
+ (i.e. SV is actually about parallelism NOT vectors per se)
 \end{itemize}
 }
+
+
+\frame{\frametitle{What's the value of SV? Why adopt it even in non-V?}
+
+ \begin{itemize}
+ \item memcpy becomes much smaller (higher bang-per-buck)
+ \item context-switch (LOAD/STORE multiple): 1-2 instructions
+ \item Compressed instrs further reduce I-cache usage (etc.)
+ \item Greatly-reduced I-cache load (and fewer reads)
+ \item Amazingly, SIMD becomes (more) tolerable\\
+ (corner-cases for setup and teardown are gone)
+ \item Modularity/Abstraction in both the h/w and the toolchain.
+ \end{itemize}
+ Note:
+ \begin{itemize}
+ \item It's not just about Vectors: it's about instruction effectiveness
+ \item Anything that makes SIMD tolerable has to be a good thing
+ \item Anything an implementor is not interested in HW-optimising,\\
+ let it fall through to exceptions (implement as a trap).
+ \end{itemize}
+}
 
 
-\frame{\frametitle{How does Simple-V relate to RVV?}
+\frame{\frametitle{How does Simple-V relate to RVV? What's different?}
 \begin{itemize}
 \item RVV very heavy-duty (excellent for supercomputing)\vspace{10pt}
 \item Simple-V abstracts parallelism (based on best of RVV)\vspace{10pt}
 \item Graded levels: hardware, hybrid or traps (fit impl. need)\vspace{10pt}
- \item Even Compressed instructions become vectorised\vspace{10pt}
+ \item Even Compressed instructions become vectorised (RVV can't)\vspace{10pt}
 \end{itemize}
 What Simple-V is not:\vspace{10pt}
 \begin{itemize}
 \item A full supercomputer-level Vector Proposal
 \item A replacement for RVV (SV is designed to be over-ridden\\
- by - or augmented to become, or just be replaced by - RVV)
+ by - or augmented to become - RVV)
 \end{itemize}
 }
@@ -112,17 +136,19 @@
 \frame{\frametitle{How is Parallelism abstracted in Simple-V?}
 \begin{itemize}
- \item Register "typing" turns any op into an implicit Vector op\vspace{10pt}
+ \item Register "typing" turns any op into an implicit Vector op:\\
+ registers are reinterpreted through a level of indirection
 \item Primarily at the Instruction issue phase (except SIMD)\\
 Note: it's ok to pass predication through to ALU (like SIMD)
 \item Standard (and future, and custom) opcodes now parallel\vspace{10pt}
 \end{itemize}
- Notes:\vspace{6pt}
+ Note: EVERYTHING is parallelised:
 \begin{itemize}
- \item LOAD/STORE (inc. C.LD and C.ST, LD.X: everything)
+ \item All LOAD/STORE (inc. Compressed, Int/FP versions)
 \item All ALU ops (soft / hybrid / full HW, on per-op basis)
- \item All branches become predication targets (C.FNE added)
+ \item All branches become predication targets (C.FNE added?)
 \item C.MV of particular interest (s/v, v/v, v/s)
+ \item FCVT, FMV, FSGNJ etc. very similar to C.MV
 \end{itemize}
 }
@@ -152,18 +178,44 @@
 % but MODIFYING the remaining "vectorised" op, subtracting the now
 % scalar ops from it.
+\frame{\frametitle{Predicated 8-parallel ADD: 1-wide ALU}
+ \begin{center}
+ \includegraphics[height=2.5in]{padd9_alu1.png}\\
+ {\bf \red Predicated adds are shuffled down: 6 cycles in total}
+ \end{center}
+}
+
+
+\frame{\frametitle{Predicated 8-parallel ADD: 4-wide ALU}
+ \begin{center}
+ \includegraphics[height=2.5in]{padd9_alu4.png}\\
+ {\bf \red Predicated adds are shuffled down: 4 in 1st cycle, 2 in 2nd}
+ \end{center}
+}
+
+
+\frame{\frametitle{Predicated 8-parallel ADD: 3 phase FIFO expansion}
+ \begin{center}
+ \includegraphics[height=2.5in]{padd9_fifo.png}\\
+ {\bf \red First cycle takes first four 1s; second takes the rest}
+ \end{center}
+}
+
+
 \frame{\frametitle{How are SIMD Instructions Vectorised?}
 \begin{itemize}
- \item SIMD ALU(s) primarily unchanged\vspace{10pt}
- \item Predication is added to each SIMD element (NO ZEROING!)\vspace{10pt}
- \item End of Vector enables predication (NO ZEROING!)\vspace{10pt}
+ \item SIMD ALU(s) primarily unchanged\vspace{6pt}
+ \item Predication is added to each SIMD element\vspace{6pt}
+ \item Predication bits sent in groups to the ALU\vspace{6pt}
+ \item End of Vector enables (additional) predication\vspace{10pt}
 \end{itemize}
- Considerations:\vspace{10pt}
+ Considerations:\vspace{4pt}
 \begin{itemize}
- \item Many SIMD ALUs possible (parallel execution)\vspace{10pt}
- \item Very long SIMD ALUs could waste die area (short vectors)\vspace{10pt}
- \item Implementor free to choose (API remains the same)\vspace{10pt}
+ \item Many SIMD ALUs possible (parallel execution)
+ \item Implementor free to choose (API remains the same)
+ \item Unused ALU units wasted, but s/w DRASTICALLY simpler
+ \item Very long SIMD ALUs could waste significant die area
 \end{itemize}
 }
 % With multiple SIMD ALUs at for example 32-bit wide they can be used
@@ -171,11 +223,21 @@
 % or they can be used to cover several operations on totally different
 % vectors / registers.
+\frame{\frametitle{Predicated 9-parallel SIMD ADD}
+ \begin{center}
+ \includegraphics[height=2.5in]{padd9_simd.png}\\
+ {\bf \red 4-wide 8-bit SIMD, 4 bits of predicate passed to ALU}
+ \end{center}
+}
+
+
 \frame{\frametitle{What's the deal / juice / score?}
 \begin{itemize}
- \item Standard Register File(s) overloaded with "vector span"\vspace{10pt}
- \item Element width and type concepts remain same as RVV\vspace{10pt}
+ \item Standard Register File(s) overloaded with CSR "reg is vector"\\
+ (see pseudocode slides for examples)
+ \item Element width (and type?) concepts remain same as RVV\\
+ (CSRs give new size (and meaning?) to elements in registers)
 \item CSRs are key-value tables (overlaps allowed)\vspace{10pt}
 \end{itemize}
 Key differences from RVV:\vspace{10pt}
 \begin{itemize}
@@ -183,40 +245,116 @@
 \item Predication in INT regs as a BIT field (max VL=XLEN)
 \item Minimum VL must be Num Regs - 1 (all regs single LD/ST)
 \item SV may condense sparse Vecs: RVV lets ALU do predication
- \item NO ZEROING: non-predicated elements are skipped
+ \item Choice to Zero or skip non-predicated elements
 \end{itemize}
 }
+\begin{frame}[fragile]
+\frametitle{ADD pseudocode (or trap, or actual hardware loop)}
+
+\begin{semiverbatim}
+function op\_add(rd, rs1, rs2, predr) # add not VADD!
+  int i, id=0, irs1=0, irs2=0;
+  for (i = 0; i < VL; i++)
+   if (ireg[predr] & 1<<i)
-s1 = vectorlen[src1] > 1;
-s2 = vectorlen[src2] > 1;
-for (int i = 0; i < VL; ++i)
- preg[rs3] |= 1 << cmp(s1 ? reg[src1+i] : reg[src1],
-        s2 ? reg[src2+i] : reg[src2]);
+struct vectorised fp\_vec[32];
+struct vectorised int\_vec[32];
+
+for (i = 0; i < 16; i++) // 16 CSRs?
+   tb = int\_vec if CSRvectortb[i].type == 0 else fp\_vec
+   idx = CSRvectortb[i].regidx
+   tb[idx].elwidth = CSRpred[i].elwidth
+   tb[idx].regidx = CSRpred[i].regidx
+   tb[idx].isvector = true
 \end{semiverbatim}
 \begin{itemize}
- \item SIMD slightly more complex (case above is elwidth = default)
- \item If s1 and s2 both scalars, Standard branch occurs
- \item Predication stored in integer regfile as a bitfield
- \item Scalar-vector and vector-vector supported
+ \item All 64 (int and FP) Entries zero'd before setting
+ \item Might be a bit complex to set up (TBD)
 \end{itemize}
 \end{frame}
 
 \begin{frame}[fragile]
-\frametitle{LD/LD.S/LD.X (or trap, or actual hardware loop)}
+\frametitle{ADD pseudocode with redirection, this time}
 
 \begin{semiverbatim}
-if (unit-strided) stride = elsize;
-else stride = areg[as2]; // constant-strided
-for (int i = 0; i < VL; ++i)
- if (preg_enabled[rd] && ([!]preg[rd] & 1<<i))
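
To make the behaviour that the ADD-pseudocode slides describe concrete, here is a minimal C sketch of the same hardware-style loop: one scalar "add" opcode, a per-register "vectorised" tag, and a predicate held in an ordinary integer register. It is illustrative only and not part of the patch or of the Simple-V specification: the register numbers, VL, predicate mask and helper names (reg_is_vectorised, the op_add signature) are assumptions chosen for the example, and the CSR element-width/regidx redirection level shown on the "with redirection" slide is deliberately left out.

/* Illustrative sketch only (not from the patch): a software model of the
 * "ADD pseudocode (or trap, or actual hardware loop)" slide.  Register
 * numbers, VL and the predicate mask are arbitrary example values.       */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NREGS 32

static uint64_t ireg[NREGS];               /* standard scalar integer regfile      */
static bool     reg_is_vectorised[NREGS];  /* CSR tag: "this reg starts a vector"  */
static int      VL = 4;                    /* vector length, set via CSR           */

/* A scalar ADD reinterpreted as a loop: vectorised operands step through
 * contiguous registers, scalar operands stay put, and the predicate (an
 * ordinary integer register) masks individual elements.  Non-predicated
 * elements are skipped, not zeroed.                                       */
static void op_add(int rd, int rs1, int rs2, int predr)
{
    int id = 0, irs1 = 0, irs2 = 0;
    for (int i = 0; i < VL; i++) {
        if (ireg[predr] & (1ULL << i))
            ireg[rd + id] = ireg[rs1 + irs1] + ireg[rs2 + irs2];
        if (reg_is_vectorised[rd])  id   += 1;
        if (reg_is_vectorised[rs1]) irs1 += 1;
        if (reg_is_vectorised[rs2]) irs2 += 1;
    }
}

int main(void)
{
    /* mark x8 and x16 as 4-long vectors; x24 stays scalar */
    reg_is_vectorised[8]  = true;
    reg_is_vectorised[16] = true;
    for (int i = 0; i < VL; i++) {
        ireg[8 + i]  = i;          /* destination's old contents */
        ireg[16 + i] = 10 * i;     /* vector source              */
    }
    ireg[24] = 100;                /* scalar source                        */
    ireg[3]  = 0xB;                /* predicate 0b1011: element 2 skipped  */

    op_add(8, 16, 24, 3);          /* one "add" opcode, four element ops   */
    for (int i = 0; i < VL; i++)
        printf("x%d = %llu\n", 8 + i, (unsigned long long)ireg[8 + i]);
    return 0;
}

With the predicate 0b1011 the loop writes x8=100, x9=110 and x11=130, while x10 keeps its old value: skipped rather than zeroed, i.e. the "skip" option from the "Choice to Zero or skip non-predicated elements" bullet above.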