simple_v_extension/simple_v_chennai_2018.tex

   1 \documentclass[slidestop]{beamer}
   2 \usepackage{beamerthemesplit}
   3 \usepackage{graphics}
   4 \usepackage{pstricks}
   5
   6 \title{Simple-V RISC-V Extension for Vectorisation and SIMD}
   7 \author{Luke Kenneth Casson Leighton}
   8
   9
  10 \begin{document}
  11
  12 \frame{
  13    \begin{center}
  14     \huge{Simple-V RISC-V Extension for Vectors and SIMD}\\
  15     \vspace{32pt}
  16     \Large{Flexible Vectorisation}\\
  17     \Large{(aka not so Simple-V?)}\\
  18     \vspace{24pt}
  19     \Large{[proposed for] Chennai 9th RISC-V Workshop}\\
  20     \vspace{24pt}
  21     \large{\today}
  22   \end{center}
  23 }
  24
  25 \frame{\frametitle{Why another Vector Extension?}
  26
  27  \begin{itemize}
  28    \item RVV very heavy-duty (excellent for supercomputing)\vspace{10pt}
  29    \item Simple-V abstracts parallelism (based on best of RVV)\vspace{10pt}
  30    \item Graded levels: hardware, hybrid or traps\vspace{10pt}
  31    \item Even Compressed instructions become vectorised\vspace{10pt}
  32   \end{itemize}
  33   What Simple-V is not:\vspace{10pt}
  34    \begin{itemize}
  35    \item A full supercomputer-level Vector Proposal\vspace{10pt}
  36    \item A replacement for RVV (designed to be augmented)\vspace{10pt}
  37   \end{itemize}
  38 }
  39
  40 \frame{\frametitle{Quick refresher on SIMD}
  41
  42  \begin{itemize}
  43    \item SIMD very easy to implement (and very seductive)\vspace{10pt}
  44    \item Parallelism is in the ALU\vspace{10pt}
  45    \item Zero-to-Negligible impact for rest of core\vspace{10pt}
  46   \end{itemize}
  47   Where SIMD Goes Wrong:\vspace{10pt}
  48    \begin{itemize}
  49    \item See ``SIMD instructions considered harmful''
  50    \url{https://www.sigarch.org/simd-instructions-considered-harmful}
  51    \item (Corner-cases alone are extremely complex)\vspace{10pt}
  52    \item O($N^{6}$) ISA opcode proliferation!\vspace{10pt}
  53   \end{itemize}
  54 }
  55
  56 \frame{\frametitle{Quick refresher on RVV}
  57
  58  \begin{itemize}
  59    \item Extremely powerful (extensible to 256 registers)\vspace{10pt}
  60    \item Supports polymorphism, several datatypes (inc. FP16)\vspace{10pt}
  61    \item Requires a separate Register File\vspace{10pt}
  62    \item Can be implemented as a separate pipeline\vspace{10pt}
  63   \end{itemize}
  64   However...\vspace{10pt}
  65    \begin{itemize}
  66    \item 98 percent opcode duplication with rest of RV (CLIP)\vspace{10pt}
  67    \item Extending RVV requires customisation\vspace{10pt}
  68   \end{itemize}
  69 }
  70
  71
  72 \frame{\frametitle{How is Parallelism abstracted?}
  73
  74  \begin{itemize}
  75    \item Almost all opcodes removed in favour of implicit ``typing''\vspace{10pt}
  76    \item Primarily at the Instruction issue phase (except SIMD)\vspace{10pt}
  77    \item Standard (and future, and custom) opcodes now parallel\vspace{10pt}
  78   \end{itemize}
  79   Notes:\vspace{10pt}
  80    \begin{itemize}
  81    \item LOAD/STORE (inc. C.LD and C.ST, LDX: everything)\vspace{10pt}
  82    \item All ALU ops (soft / hybrid / full HW, on per-op basis)\vspace{10pt}
  83    \item All branches become predication targets (C.FNE added)\vspace{10pt}
  84   \end{itemize}
  85 }
  86
  87
  88 \frame{\frametitle{Implementation Options}
  89
  90  \begin{itemize}
  91    \item Absolute minimum: Exceptions (if CSRs indicate ``V'', trap)\vspace{10pt}
  92    \item Hardware loop, single-instruction issue\vspace{10pt}
  93    \item Hardware loop, parallel (multi-instruction) issue\vspace{10pt}
  94    \item Hardware loop, full parallel ALU (not recommended)\vspace{10pt}
  95   \end{itemize}
  96   Notes:\vspace{10pt}
  97   \begin{itemize}
  98    \item 4 (or more?) options above may be deployed on per-op basis
  99    \item Minimum MVL MUST be sufficient to cover regfile LD/ST
 100    \item OoO may split off 4+ single-instructions at a time
 101   \end{itemize}
 102 }
 103
 104
 105 \frame{\frametitle{How are SIMD Instructions Vectorised?}
 106
 107  \begin{itemize}
 108    \item SIMD ALU(s) primarily unchanged\vspace{10pt}
 109    \item Predication is added to each SIMD element (NO ZEROING!)\vspace{10pt}
 110    \item End of Vector enables predication (NO ZEROING!)\vspace{10pt}
 111   \end{itemize}
 112   Considerations:\vspace{10pt}
 113    \begin{itemize}
 114    \item Many SIMD ALUs possible (parallel execution)\vspace{10pt}
 115    \item Very long SIMD ALUs could waste die area (short vectors)\vspace{10pt}
 116    \item Implementor free to choose (API remains the same)\vspace{10pt}
 117   \end{itemize}
 118 }
 119 % With multiple SIMD ALUs, each for example 32 bits wide, they can be
 120 % used either to issue 64-bit, 128-bit or 256-bit wide SIMD operations,
 121 % or to cover several operations on totally different
 122 % vectors / registers.
 123
 124 \frame{\frametitle{What's the deal / juice / score?}
 125
 126  \begin{itemize}
 127    \item Standard Register File(s) overloaded with ``vector span''\vspace{10pt}
 128    \item Element width and type concepts remain same as RVV\vspace{10pt}
 129    \item CSRs are key-value tables (overlaps allowed)\vspace{10pt}
 130   \end{itemize}
 131   Key differences from RVV:\vspace{10pt}
 132    \begin{itemize}
 133    \item Predication in INT regs as a BIT field (max VL=XLEN)\vspace{10pt}
 134    \item Minimum VL must be Num Regs - 1 (all regs single LD/ST)\vspace{10pt}
 135    \item NO ZEROING: non-predicated elements are skipped\vspace{10pt}
 136   \end{itemize}
 137 }
 138
 139
 140 \frame{\frametitle{Why are overlaps allowed in Regfiles?}
 141
 142  \begin{itemize}
 143    \item Same register(s) can have multiple ``interpretations''\vspace{10pt}
 144    \item xBitManip plus SIMD plus xBitManip = Hi/Lo bitops\vspace{10pt}
 145    \item (32-bit GREV plus 4-wide 32-bit SIMD plus 32-bit GREVI)\vspace{10pt}
 146    \item 32-bit op followed by 16-bit op w/ 2x VL, 1/2 predicated\vspace{10pt}
 147   \end{itemize}
 148   Note:\vspace{10pt}
 149    \begin{itemize}
 150    \item xBitManip reduces O($N^{6}$) SIMD down to O($N^{3}$) \vspace{10pt}
 151    \item Hi-Performance: Macro-op fusion (more pipeline stages?)\vspace{10pt}
 152   \end{itemize}
 153 }
 154
 155
 156 \frame{\frametitle{Why no Zeroing (place zeros in non-predicated elements)?}
 157
 158  \begin{itemize}
 159    \item Zeroing is an implementation optimisation favouring OoO\vspace{10pt}
 160    \item Simple implementations may skip non-predicated operations\vspace{10pt}
 161    \item Simple implementations explicitly have to destroy data\vspace{10pt}
 162    \item Complex implementations may use reg-renames to save power\vspace{10pt}
 163   \end{itemize}
 164   Considerations:\vspace{10pt}
 165   \begin{itemize}
 166    \item Complex not really impacted, Simple impacted a LOT
 167    \item Overlapping ``Vectors'' may issue overlapping ops
 168    \item Please don't use Vectors for ``security'' (use Sec-Ext)
 169   \end{itemize}
 170 }
 171 % with overlapping "vectors" - bearing in mind that "vectors" are
 172 % just a remap onto the standard register file, if the top bits of
 173 % predication are zero, and there happens to be a second vector
 174 % that uses some of the same register file that happens to be
 175 % predicated out, the second vector op may be issued *at the same time*
 176 % if there are available parallel ALUs to do so.
 177
 178 \begin{frame}[fragile]
 179 \frametitle{ADD pseudocode (or trap, or actual hardware loop)}
 180
 181 \begin{semiverbatim}
 182 function op_add(rd, rs1, rs2, predr) # add not VADD!
 183   int i, id=0, irs1=0, irs2=0;
 184   for (i=0; i < MIN(VL, vectorlen[rd]); i++)
 185     if (ireg[predr] & 1<<i) # predication uses intregs
 186        ireg[rd+id] <= ireg[rs1+irs1] + ireg[rs2+irs2];
 187     if (reg_is_vectorised[rd]) \{ id += 1; \}
 188     if (reg_is_vectorised[rs1]) \{ irs1 += 1; \}
 189     if (reg_is_vectorised[rs2]) \{ irs2 += 1; \}
 190 \end{semiverbatim}
 191
 192   \begin{itemize}
 193    \item SIMD slightly more complex (case above is elwidth = default)
 194    \item Scalar-scalar and scalar-vector and vector-vector now all in one
 195    \item OoO may choose to push ADDs into instr. queue (v. busy!)
 196   \end{itemize}
 197 \end{frame}
 198
 199 \begin{frame}[fragile]
 200 \frametitle{Predication-Branch (or trap, or actual hardware loop)}
 201
 202 \begin{semiverbatim}
 203 s1 = vectorlen[src1] > 1;
 204 s2 = vectorlen[src2] > 1;
 205 for (int i = 0; i < VL; ++i)
 206    preg[rs3] |= cmp(s1 ? reg[src1+i] : reg[src1],
 207                     s2 ? reg[src2+i] : reg[src2]) << i;
 208 \end{semiverbatim}
 209
 210   \begin{itemize}
 211    \item SIMD slightly more complex (case above is elwidth = default)
 212    \item If s1 and s2 both scalars, Standard branch occurs
 213    \item Predication stored in integer regfile as a bitfield
 214    \item Scalar-vector and vector-vector supported
 215   \end{itemize}
 216 \end{frame}
 217
 218 \begin{frame}[fragile]
 219 \frametitle{LD/LD.S/LD.X (or trap, or actual hardware loop)}
 220
 221 \begin{semiverbatim}
 222 if (unit-strided) stride = elsize;
 223 else stride = areg[as2]; // constant-strided
 224 for (int i = 0; i < VL; ++i)
 225   if (preg_enabled[rd] && ([!]preg[rd] & 1<<i))
 226     for (int j = 0; j < seglen+1; j++)
 227       if (vectorised[rs2]) offs = vreg[rs2][i]
 228       else offs = i*(seglen+1)*stride;
 229       vreg[rd+j][i] = mem[sreg[base] + offs + j*stride]
 230 \end{semiverbatim}
 231
 232   \begin{itemize}
 233    \item Again: SIMD slightly more complex
 234    \item rs2 vectorised taken to implicitly indicate LD.X
 235   \end{itemize}
 236 \end{frame}
 237
 238
 239 \frame{\frametitle{Opcodes, compared to RVV}
 240
 241  \begin{itemize}
 242    \item All integer and FP opcodes removed (no CLIP!)\vspace{10pt}
 243    \item VMPOP, VFIRST etc. all removed (use xBitManip)\vspace{10pt}
 244    \item VSLIDE, VEXTRACT, VINSERT removed (using regfile)\vspace{10pt}
 245    \item VSETVL, VGETVL, VSELECT stay\vspace{10pt}
 246    \item Issue: VCLIP is not in RV* (add with custom ext?)\vspace{10pt}
 247    \item Vector (or scalar-vector) use C.MV (MV is a pseudo-op)\vspace{10pt}
 248    \item VMERGE: twin predicated C.MVs (one inverted, macro-op'd)\vspace{10pt}
 249   \end{itemize}
 250 }
 251
 252
 253 \frame{\frametitle{Under consideration}
 254
 255  \begin{itemize}
 256    \item Can VSELECT be removed (or overloaded onto xBitManip)?\vspace{10pt}
 257    \item Can CLIP be done as a CSR (mode, like elwidth)?\vspace{10pt}
 258    \item SIMD saturation (etc.) also set as a mode?\vspace{10pt}
 259   \end{itemize}
 260 }
 261
 262
 263 \frame{\frametitle{slide}
 264
 265  \begin{itemize}
 266    \item \vspace{10pt}
 267   \end{itemize}
 268   Considerations:\vspace{10pt}
 269   \begin{itemize}
 270    \item \vspace{10pt}
 271   \end{itemize}
 272 }
 273
 274
 275 \frame{\frametitle{slide}
 276
 277  \begin{itemize}
 278    \item \vspace{10pt}
 279   \end{itemize}
 280   Considerations:\vspace{10pt}
 281   \begin{itemize}
 282    \item \vspace{10pt}
 283   \end{itemize}
 284 }
 285
 286
 287 \frame{\frametitle{Including a plot}
 288  \begin{center}
 289 %  \includegraphics[height=2in]{dental.ps}\\
 290   {\bf \red Dental trajectories for 27 children:}
 291  \end{center}
 292 }
 293
 294 \frame{\frametitle{Creating .pdf slides in WinEdt}
 295
 296  \begin{itemize}
 297    \item LaTeX [Shift-Control-L]\vspace{10pt}
 298    \item dvi2pdf [click the button]\vspace{24pt}
 299   \end{itemize}
 300   To print 4 slides per page in acrobat click\vspace{10pt}
 301    \begin{itemize}
 302    \item File/print/properties\vspace{10pt}
 303    \item Change ``pages per sheet'' to 4\vspace{10pt}
 304   \end{itemize}
 305 }
 306
 307 \frame{
 308   \begin{center}
 309     {\Huge \red The end}
 310   \end{center}
 311 }
 312
 313
 314 \end{document}