a56088a69a2e1d139c65c12f07fb4299f9b863bc
[libreriscv.git] / simple_v_extension / simple_v_chennai_2018.tex
1 \documentclass[slidestop]{beamer}
2 \usepackage{beamerthemesplit}
3 \usepackage{graphics}
4 \usepackage{pstricks}
5
6 \title{Simple-V RISC-V Extension for Vectorisation and SIMD}
7 \author{Luke Kenneth Casson Leighton}
8
9
10 \begin{document}
11
12 \frame{
13 \begin{center}
14 \huge{Simple-V RISC-V Extension for Vectors and SIMD}\\
15 \vspace{32pt}
16 \Large{Flexible Vectorisation}\\
17 \Large{(aka not so Simple-V?)}\\
18 \vspace{24pt}
19 \Large{[proposed for] Chennai 9th RISC-V Workshop}\\
20 \vspace{24pt}
21 \large{\today}
22 \end{center}
23 }
24
25
26 \frame{\frametitle{Credits and Acknowledgements}
27
28 \begin{itemize}
29 \item The Designers of RISC-V\vspace{15pt}
30 \item The RVV Working Group and contributors\vspace{15pt}
31 \item Jacob Bachmeyer, Xan Phung, Chuanhua Chang and others\vspace{15pt}
32 \item ISA-Dev Group Members\vspace{10pt}
33 \end{itemize}
34 }
35
36
37 \frame{\frametitle{The Simon Sinek lowdown (Why, How, What)}
38
39 \begin{itemize}
40 \item Vectorisation needs to fit an implementor's scope:\\
41 RV32E, Embedded/Mobile, DSP, Servers and more.\vspace{15pt}
42 \item By implicitly marking INT/FP regs as ``Vectorised'',\\
43 everything else follows from there.\vspace{15pt}
44 \item A Standard Vector ``API'' with flexibility for implementors:\\
45 choice to optimise for area or performance as desired\vspace{10pt}
46 \end{itemize}
47 }
48
49
50 \frame{\frametitle{Why another Vector Extension?}
51
52 \begin{itemize}
53 \item RVV very heavy-duty (excellent for supercomputing)\vspace{10pt}
54 \item Simple-V abstracts parallelism (based on best of RVV)\vspace{10pt}
55 \item Graded levels: hardware, hybrid or traps\vspace{10pt}
56 \item Even Compressed instructions become vectorised\vspace{10pt}
57 \end{itemize}
58 What Simple-V is not:\vspace{10pt}
59 \begin{itemize}
60 \item A full supercomputer-level Vector Proposal\vspace{10pt}
61 \item A replacement for RVV (designed to be augmented)\vspace{10pt}
62 \end{itemize}
63 }
64
65
66 \frame{\frametitle{Quick refresher on SIMD}
67
68 \begin{itemize}
69 \item SIMD very easy to implement (and very seductive)\vspace{10pt}
70 \item Parallelism is in the ALU\vspace{10pt}
71 \item Zero-to-Negligible impact for rest of core\vspace{10pt}
72 \end{itemize}
73 Where SIMD Goes Wrong:\vspace{10pt}
74 \begin{itemize}
75 \item See ``SIMD instructions considered harmful''
76 \url{https://www.sigarch.org/simd-instructions-considered-harmful}
77 \item Corner-cases alone are extremely complex.\\
78 Hardware is easy, but software is hell.
79 \item O($N^{6}$) ISA opcode proliferation!\\
80 opcode, elwidth, veclen, src1-src2-dest hi/lo
81 \end{itemize}
82 }
83
84 \frame{\frametitle{Quick refresher on RVV}
85
86 \begin{itemize}
87 \item Extremely powerful (extensible to 256 registers)\vspace{10pt}
88 \item Supports polymorphism, several datatypes (inc. FP16)\vspace{10pt}
89 \item Requires a separate Register File\vspace{10pt}
90 \item Can be implemented as a separate pipeline\vspace{10pt}
91 \end{itemize}
92 However...\vspace{10pt}
93 \begin{itemize}
94 \item 98 percent opcode duplication with rest of RV (CLIP)
95 \item Extending RVV requires customisation not just of h/w:\\
96 gcc and s/w also need customisation (and maintenance)
97 \end{itemize}
98 }
99
100
101 \frame{\frametitle{How is Parallelism abstracted?}
102
103 \begin{itemize}
104 \item Register ``typing'' turns any op into an implicit Vector op\vspace{10pt}
105 \item Primarily at the Instruction issue phase (except SIMD)\vspace{10pt}
106 \item Standard (and future, and custom) opcodes now parallel\vspace{10pt}
107 \end{itemize}
108 Notes:\vspace{10pt}
109 \begin{itemize}
110 \item LOAD/STORE (inc. C.LD and C.ST, LD.X: everything)
111 \item All ALU ops (soft / hybrid / full HW, on per-op basis)
112 \item All branches become predication targets (C.FNE added)
113 \item C.MV of particular interest (s/v, v/v, v/s)
114 \end{itemize}
115 }
116
117
118 \frame{\frametitle{Implementation Options}
119
120 \begin{itemize}
121 \item Absolute minimum: Exceptions (if CSRs indicate ``V'', trap)\vspace{10pt}
122 \item Hardware loop, single-instruction issue\vspace{10pt}
123 \item Hardware loop, parallel (multi-instruction) issue\vspace{10pt}
124 \item Hardware loop, full parallel ALU (not recommended)\vspace{10pt}
125 \end{itemize}
126 Notes:\vspace{10pt}
127 \begin{itemize}
128 \item 4 (or more?) options above may be deployed on per-op basis
129 \item Minimum MVL MUST be sufficient to cover regfile LD/ST
130 \item OoO may repeatedly split off 4+ ops at a time into FIFO
131 \end{itemize}
132 }
133
134
135 \frame{\frametitle{How are SIMD Instructions Vectorised?}
136
137 \begin{itemize}
138 \item SIMD ALU(s) primarily unchanged\vspace{10pt}
139 \item Predication is added to each SIMD element (NO ZEROING!)\vspace{10pt}
140 \item End of Vector enables predication (NO ZEROING!)\vspace{10pt}
141 \end{itemize}
142 Considerations:\vspace{10pt}
143 \begin{itemize}
144 \item Many SIMD ALUs possible (parallel execution)\vspace{10pt}
145 \item Very long SIMD ALUs could waste die area (short vectors)\vspace{10pt}
146 \item Implementor free to choose (API remains the same)\vspace{10pt}
147 \end{itemize}
148 }
149 % With multiple SIMD ALUs at for example 32-bit wide they can be used
150 % to either issue 64-bit or 128-bit or 256-bit wide SIMD operations
151 % or they can be used to cover several operations on totally different
152 % vectors / registers.
153
154 \frame{\frametitle{What's the deal / juice / score?}
155
156 \begin{itemize}
157 \item Standard Register File(s) overloaded with ``vector span''\vspace{10pt}
158 \item Element width and type concepts remain same as RVV\vspace{10pt}
159 \item CSRs are key-value tables (overlaps allowed)\vspace{10pt}
160 \end{itemize}
161 Key differences from RVV:\vspace{10pt}
162 \begin{itemize}
163 \item Predication in INT regs as a BIT field (max VL=XLEN)\vspace{10pt}
164 \item Minimum VL must be Num Regs - 1 (all regs single LD/ST)\vspace{10pt}
165 \item NO ZEROING: non-predicated elements are skipped\vspace{10pt}
166 \end{itemize}
167 }
168
169
170 \frame{\frametitle{Why are overlaps allowed in Regfiles?}
171
172 \begin{itemize}
173 \item Same register(s) can have multiple ``interpretations''\vspace{10pt}
174 \item xBitManip plus SIMD plus xBitManip = Hi/Lo bitops\vspace{10pt}
175 \item (32-bit GREV plus 4x8-bit SIMD plus 32-bit GREV)\vspace{10pt}
176 \item Same register(s) can be offset (no need for VSLIDE)\vspace{10pt}
177 \end{itemize}
178 Note:\vspace{10pt}
179 \begin{itemize}
180 \item xBitManip reduces O($N^{6}$) SIMD down to O($N^{3}$) \vspace{10pt}
181 \item Hi-Performance: Macro-op fusion (more pipeline stages?)\vspace{10pt}
182 \end{itemize}
183 }
184
185
186 \frame{\frametitle{Why no Zeroing (place zeros in non-predicated elements)?}
187
188 \begin{itemize}
189 \item Zeroing is an implementation optimisation favouring OoO\vspace{8pt}
190 \item Simple implementations may skip non-predicated operations\vspace{8pt}
191 \item Simple implementations explicitly have to destroy data\vspace{8pt}
192 \item Complex implementations may use reg-renames to save power\\
193 Zeroing on predication chains makes optimisation harder
194 \end{itemize}
195 Considerations:\vspace{10pt}
196 \begin{itemize}
197 \item Complex not really impacted, Simple impacted a LOT
198 \item Overlapping ``Vectors'' may issue overlapping ops
199 \item Please don't use Vectors for ``security'' (use Sec-Ext)
200 \end{itemize}
201 }
202 % with overlapping "vectors" - bearing in mind that "vectors" are
203 % just a remap onto the standard register file, if the top bits of
204 % predication are zero, and there happens to be a second vector
205 % that uses some of the same register file that happens to be
206 % predicated out, the second vector op may be issued *at the same time*
207 % if there are available parallel ALUs to do so.
208
209
210 \frame{\frametitle{Predication key-value CSR store}
211
212 \begin{itemize}
213 \item key is int regfile number or FP regfile number (1 bit)\vspace{10pt}
214 \item register to be predicated if referred to (5 bits, key)\vspace{10pt}
215 \item register to store actual predication in (5 bits, value)\vspace{10pt}
216 \item predication is inverted (1 bit)\vspace{10pt}
217 \end{itemize}
218 Notes:\vspace{10pt}
219 \begin{itemize}
220 \item Table should be expanded out for high-speed implementations
221 \item Multiple ``keys'' (and values) theoretically permitted
222 \item RVV rules about deleting higher-indexed CSRs followed
223 \end{itemize}
224 }
225
226
227 \frame{\frametitle{Register key-value CSR store}
228
229 \begin{itemize}
230 \item key is int regfile number or FP regfile number (1 bit)\vspace{10pt}
231 \item register to be predicated if referred to (5 bits, key)\vspace{10pt}
232 \item register to store actual predication in (5 bits, value)\vspace{10pt}
233 \item TODO\vspace{10pt}
234 \end{itemize}
235 Notes:\vspace{10pt}
236 \begin{itemize}
237 \item Table should be expanded out for high-speed implementations
238 \item Multiple ``keys'' (and values) theoretically permitted
239 \item RVV rules about deleting higher-indexed CSRs followed
240 \end{itemize}
241 }
242
243
244 \begin{frame}[fragile]
245 \frametitle{ADD pseudocode (or trap, or actual hardware loop)}
246
247 \begin{semiverbatim}
248 function op_add(rd, rs1, rs2, predr) # add not VADD!
249  int i, id=0, irs1=0, irs2=0;
250  for (i=0; i < MIN(VL, vectorlen[rd]); i++)
251   if (ireg[predr] & 1<<i) # predication uses intregs
252    ireg[rd+id] <= ireg[rs1+irs1] + ireg[rs2+irs2];
253   if (reg_is_vectorised[rd]) \{ id += 1; \}
254   if (reg_is_vectorised[rs1]) \{ irs1 += 1; \}
255   if (reg_is_vectorised[rs2]) \{ irs2 += 1; \}
256 \end{semiverbatim}
257
258 \begin{itemize}
259 \item SIMD slightly more complex (case above is elwidth = default)
260 \item Scalar-scalar and scalar-vector and vector-vector now all in one
261 \item OoO may choose to push ADDs into instr. queue (v. busy!)
262 \end{itemize}
263 \end{frame}
264
265 \begin{frame}[fragile]
266 \frametitle{Predication-Branch (or trap, or actual hardware loop)}
267
268 \begin{semiverbatim}
269 s1 = vectorlen[src1] > 1;
270 s2 = vectorlen[src2] > 1;
271 for (int i = 0; i < VL; ++i)
272 preg[rs3] |= 1 << cmp(s1 ? reg[src1+i] : reg[src1],
273 s2 ? reg[src2+i] : reg[src2]);
274 \end{semiverbatim}
275
276 \begin{itemize}
277 \item SIMD slightly more complex (case above is elwidth = default)
278 \item If s1 and s2 both scalars, Standard branch occurs
279 \item Predication stored in integer regfile as a bitfield
280 \item Scalar-vector and vector-vector supported
281 \end{itemize}
282 \end{frame}
283
284 \begin{frame}[fragile]
285 \frametitle{LD/LD.S/LD.X (or trap, or actual hardware loop)}
286
287 \begin{semiverbatim}
288 if (unit-strided) stride = elsize;
289 else stride = areg[as2]; // constant-strided
290 for (int i = 0; i < VL; ++i)
291 if (preg_enabled[rd] && ([!]preg[rd] & 1<<i))
292 for (int j = 0; j < seglen+1; j++)
293 if (vectorised[rs2]) offs = vreg[rs2][i]
294 else offs = i*(seglen+1)*stride;
295 vreg[rd+j][i] = mem[sreg[base] + offs + j*stride]
296 \end{semiverbatim}
297
298 \begin{itemize}
299 \item Again: SIMD slightly more complex
300 \item rs2 vectorised taken to implicitly indicate LD.X
301 \end{itemize}
302 \end{frame}
303
304
305 \frame{\frametitle{C.MV extremely flexible!}
306
307 \begin{itemize}
308 \item scalar-to-vector (w/no pred): VSPLAT
309 \item scalar-to-vector (w/dest-pred): Sparse VSPLAT
310 \item scalar-to-vector (w/single dest-pred): VINSERT
311 \item vector-to-scalar (w/src-pred): VEXTRACT
312 \item vector-to-vector (w/no pred): Vector Copy
313 \item vector-to-vector (w/src xor dest pred): Sparse Vector Copy
314 \item vector-to-vector (w/src and dest pred): Vector Shuffle
315 \end{itemize}
316 \vspace{8pt}
317 Notes:\vspace{10pt}
318 \begin{itemize}
319 \item Really powerful!
320 \item Any other options?
321 \end{itemize}
322 }
323
324
325 \frame{\frametitle{Opcodes, compared to RVV}
326
327 \begin{itemize}
328 \item Integer and FP opcodes all removed (no CLIP!)\vspace{8pt}
329 \item VMPOP, VFIRST etc. all removed (use xBitManip)\vspace{8pt}
330 \item VSLIDE removed (use regfile overlaps)\vspace{8pt}
331 \item C.MV covers VEXTRACT VINSERT and VSPLAT (and more)\vspace{8pt}
332 \item VSETVL, VGETVL, VSELECT stay\vspace{8pt}
333 \item Issue: VCLIP is not in RV* (add with custom ext?)\vspace{8pt}
334 \item Vector (or scalar-vector) use C.MV (MV is a pseudo-op)\vspace{8pt}
335 \item VMERGE: twin predicated C.MVs (one inverted, macro-op'd)\vspace{8pt}
336 \end{itemize}
337 }
338
339
340 \frame{\frametitle{Under consideration}
341
342 \begin{itemize}
343 \item Can VSELECT be removed? (it's really complex)\vspace{10pt}
344 \item Can CLIP be done as a CSR (mode, like elwidth)?\vspace{10pt}
345 \item SIMD saturation (etc.) also set as a mode?\vspace{10pt}
346 \item C.MV src predication no different from dest predication\\
347 What to do? Make one have different meaning?\vspace{10pt}
348 \item 8/16-bit ops: is it worthwhile adding a ``start offset''? \\
349 (a bit like misaligned addressing... for registers)\\
350 or just use predication to skip start?\vspace{10pt}
351 \end{itemize}
352 }
353
354
355 \frame{\frametitle{Summary}
356
357 \begin{itemize}
358 \item Designed for simplicity (graded levels of complexity)\vspace{10pt}
359 \item Fits RISC-V ethos: do more with less\vspace{10pt}
360 \item Reduces SIMD ISA proliferation by 3--4 orders of magnitude \\
361 (without SIMD downsides or sacrificing speed trade-off)\vspace{10pt}
362 \item Covers 98\% of RVV, allows RVV to fit ``on top''\vspace{10pt}
363 \item Huge range of implementor freedom and flexibility\vspace{10pt}
364 \item Not designed for supercomputing (that's RVV), designed for
365 in between: DSPs, RV32E, Embedded 3D GPUs etc.\vspace{10pt}
366 \end{itemize}
367 }
368
369
370 \frame{\frametitle{slide}
371
372 \begin{itemize}
373 \item \vspace{10pt}
374 \end{itemize}
375 Considerations:\vspace{10pt}
376 \begin{itemize}
377 \item \vspace{10pt}
378 \end{itemize}
379 }
380
381
382 \frame{\frametitle{Including a plot}
383 \begin{center}
384 % \includegraphics[height=2in]{dental.ps}\\
385 {\bf \red Dental trajectories for 27 children:}
386 \end{center}
387 }
388
389 \frame{\frametitle{Creating .pdf slides in WinEdt}
390
391 \begin{itemize}
392 \item LaTeX [Shift-Control-L]\vspace{10pt}
393 \item dvi2pdf [click the button]\vspace{24pt}
394 \end{itemize}
395 To print 4 slides per page in acrobat click\vspace{10pt}
396 \begin{itemize}
397 \item File/print/properties\vspace{10pt}
398 \item Change ``pages per sheet'' to 4\vspace{10pt}
399 \end{itemize}
400 }
401
402 \frame{
403 \begin{center}
404 {\Huge \red The end}
405 \end{center}
406 }
407
408
409 \end{document}