update slides
[libreriscv.git] / simple_v_extension / simple_v_chennai_2018.tex
1 \documentclass[slidestop]{beamer}
2 \usepackage{beamerthemesplit}
3 \usepackage{graphics}
4 \usepackage{pstricks}
5
6 \title{Simple-V RISC-V Extension for Vectorisation and SIMD}
7 \author{Luke Kenneth Casson Leighton}
8
9
10 \begin{document}
11
12 \frame{
13 \begin{center}
14 \huge{Simple-V RISC-V Extension for Vectors and SIMD}\\
15 \vspace{32pt}
16 \Large{Flexible Vectorisation}\\
17 \Large{(aka not so Simple-V?)}\\
18 \vspace{24pt}
19 \Large{[proposed for] Chennai 9th RISC-V Workshop}\\
20 \vspace{24pt}
21 \large{\today}
22 \end{center}
23 }
24
25 \frame{\frametitle{Why another Vector Extension?}
26
27 \begin{itemize}
28 \item RVV very heavy-duty (excellent for supercomputing)\vspace{10pt}
29 \item Simple-V abstracts parallelism (based on best of RVV)\vspace{10pt}
30 \item Graded levels: hardware, hybrid or traps\vspace{10pt}
31 \item Even Compressed instructions become vectorised\vspace{10pt}
32 \end{itemize}
33 What Simple-V is not:\vspace{10pt}
34 \begin{itemize}
35 \item A full supercomputer-level Vector Proposal\vspace{10pt}
36 \item A replacement for RVV (designed to be augmented)\vspace{10pt}
37 \end{itemize}
38 }
39
40 \frame{\frametitle{Quick refresher on SIMD}
41
42 \begin{itemize}
43 \item SIMD very easy to implement (and very seductive)\vspace{10pt}
44 \item Parallelism is in the ALU\vspace{10pt}
45 \item Zero-to-Negligible impact for rest of core\vspace{10pt}
46 \end{itemize}
47 Where SIMD Goes Wrong:\vspace{10pt}
48 \begin{itemize}
49 \item See ``SIMD instructions considered harmful''
50 \url{https://www.sigarch.org/simd-instructions-considered-harmful}
51 \item (Corner-cases alone are extremely complex)\vspace{10pt}
52 \item O($N^{6}$) ISA opcode proliferation!\vspace{10pt}
53 \end{itemize}
54 }
55
56 \frame{\frametitle{Quick refresher on RVV}
57
58 \begin{itemize}
59 \item Extremely powerful (extensible to 256 registers)\vspace{10pt}
60 \item Supports polymorphism, several datatypes (inc. FP16)\vspace{10pt}
61 \item Requires a separate Register File\vspace{10pt}
62 \item Can be implemented as a separate pipeline\vspace{10pt}
63 \end{itemize}
64 However...\vspace{10pt}
65 \begin{itemize}
66 \item 98 percent opcode duplication with rest of RV (CLIP)\vspace{10pt}
67 \item Extending RVV requires customisation\vspace{10pt}
68 \end{itemize}
69 }
70
71
72 \frame{\frametitle{How is Parallelism abstracted?}
73
74 \begin{itemize}
75 \item Almost all opcodes removed in favour of implicit ``typing''\vspace{10pt}
76 \item Primarily at the Instruction issue phase (except SIMD)\vspace{10pt}
77 \item Standard (and future, and custom) opcodes now parallel\vspace{10pt}
78 \end{itemize}
79 Notes:\vspace{10pt}
80 \begin{itemize}
81 \item LOAD/STORE (inc. C.LD and C.ST, LDX: everything)\vspace{10pt}
82 \item All ALU ops (soft / hybrid / full HW, on per-op basis)\vspace{10pt}
83 \item All branches become predication targets (C.FNE added)\vspace{10pt}
84 \end{itemize}
85 }
86
87
88 \frame{\frametitle{Implementation Options}
89
90 \begin{itemize}
91 \item Absolute minimum: Exceptions (if CSRs indicate ``V'', trap)\vspace{10pt}
92 \item Hardware loop, single-instruction issue\vspace{10pt}
93 \item Hardware loop, parallel (multi-instruction) issue\vspace{10pt}
94 \item Hardware loop, full parallel ALU (not recommended)\vspace{10pt}
95 \end{itemize}
96 Notes:\vspace{10pt}
97 \begin{itemize}
98 \item 4 (or more?) options above may be deployed on per-op basis
99 \item Minimum MVL MUST be sufficient to cover regfile LD/ST
100 \item OoO may split off 4+ single-instructions at a time
101 \end{itemize}
102 }
103
104
105 \frame{\frametitle{How are SIMD Instructions Vectorised?}
106
107 \begin{itemize}
108 \item SIMD ALU(s) primarily unchanged\vspace{10pt}
109 \item Predication is added to each SIMD element (NO ZEROING!)\vspace{10pt}
110 \item End of Vector enables predication (NO ZEROING!)\vspace{10pt}
111 \end{itemize}
112 Considerations:\vspace{10pt}
113 \begin{itemize}
114 \item Many SIMD ALUs possible (parallel execution)\vspace{10pt}
115 \item Very long SIMD ALUs could waste die area (short vectors)\vspace{10pt}
116 \item Implementor free to choose (API remains the same)\vspace{10pt}
117 \end{itemize}
118 }
119 % With multiple SIMD ALUs at for example 32-bit wide they can be used
120 % to either issue 64-bit or 128-bit or 256-bit wide SIMD operations
121 % or they can be used to cover several operations on totally different
122 % vectors / registers.
123
124 \frame{\frametitle{What's the deal / juice / score?}
125
126 \begin{itemize}
127 \item Standard Register File(s) overloaded with ``vector span''\vspace{10pt}
128 \item Element width and type concepts remain same as RVV\vspace{10pt}
129 \item CSRs are key-value tables (overlaps allowed)\vspace{10pt}
130 \end{itemize}
131 Key differences from RVV:\vspace{10pt}
132 \begin{itemize}
133 \item Predication in INT regs as a BIT field (max VL=XLEN)\vspace{10pt}
134 \item Minimum VL must be Num Regs - 1 (all regs single LD/ST)\vspace{10pt}
135 \item NO ZEROING: non-predicated elements are skipped\vspace{10pt}
136 \end{itemize}
137 }
138
139
140 \frame{\frametitle{Why are overlaps allowed in Regfiles?}
141
142 \begin{itemize}
143 \item Same register(s) can have multiple ``interpretations''\vspace{10pt}
144 \item xBitManip plus SIMD plus xBitManip = Hi/Lo bitops\vspace{10pt}
145 \item (32-bit GREV plus 4-wide 32-bit SIMD plus 32-bit GREVI)\vspace{10pt}
146 \item 32-bit op followed by 16-bit op w/ 2x VL, 1/2 predicated\vspace{10pt}
147 \end{itemize}
148 Note:\vspace{10pt}
149 \begin{itemize}
150 \item xBitManip reduces O($N^{6}$) SIMD down to O($N^{3}$) \vspace{10pt}
151 \item Hi-Performance: Macro-op fusion (more pipeline stages?)\vspace{10pt}
152 \end{itemize}
153 }
154
155
156 \frame{\frametitle{Why no Zeroing (place zeros in non-predicated elements)?}
157
158 \begin{itemize}
159 \item Zeroing is an implementation optimisation favouring OoO\vspace{10pt}
160 \item Simple implementations may skip non-predicated operations\vspace{10pt}
161 \item Simple implementations explicitly have to destroy data\vspace{10pt}
162 \item Complex implementations may use reg-renames to save power\vspace{10pt}
163 \end{itemize}
164 Considerations:\vspace{10pt}
165 \begin{itemize}
166 \item Complex not really impacted, Simple impacted a LOT
167 \item Overlapping ``Vectors'' may issue overlapping ops
168 \item Please don't use Vectors for ``security'' (use Sec-Ext)
169 \end{itemize}
170 }
171 % with overlapping "vectors" - bearing in mind that "vectors" are
172 % just a remap onto the standard register file, if the top bits of
173 % predication are zero, and there happens to be a second vector
174 % that uses some of the same register file that happens to be
175 % predicated out, the second vector op may be issued *at the same time*
176 % if there are available parallel ALUs to do so.
177
178 \begin{frame}[fragile]
179 \frametitle{ADD pseudocode (or trap, or actual hardware loop)}
180
181 \begin{semiverbatim}
182 function op_add(rd, rs1, rs2, predr) # add not VADD!
183  int i, id=0, irs1=0, irs2=0;
184  for (i=0; i < MIN(VL, vectorlen[rd]); i++)
185   if (ireg[predr] & 1<<i) # predication uses intregs
186    ireg[rd+id] <= ireg[rs1+irs1] + ireg[rs2+irs2];
187   if (reg_is_vectorised[rd]) \{ id += 1; \}
188   if (reg_is_vectorised[rs1]) \{ irs1 += 1; \}
189   if (reg_is_vectorised[rs2]) \{ irs2 += 1; \}
190 \end{semiverbatim}
191
192 \begin{itemize}
193 \item SIMD slightly more complex (case above is elwidth = default)
194 \item Scalar-scalar and scalar-vector and vector-vector now all in one
195 \item OoO may choose to push ADDs into instr. queue (v. busy!)
196 \end{itemize}
197 \end{frame}
198
199 \begin{frame}[fragile]
200 \frametitle{Predication-Branch (or trap, or actual hardware loop)}
201
202 \begin{semiverbatim}
203 s1 = vectorlen[src1] > 1;
204 s2 = vectorlen[src2] > 1;
205 for (int i = 0; i < VL; ++i)
206 preg[rs3] |= 1 << cmp(s1 ? reg[src1+i] : reg[src1],
207 s2 ? reg[src2+i] : reg[src2]);
208 \end{semiverbatim}
209
210 \begin{itemize}
211 \item SIMD slightly more complex (case above is elwidth = default)
212 \item If s1 and s2 both scalars, Standard branch occurs
213 \item Predication stored in integer regfile as a bitfield
214 \item Scalar-vector and vector-vector supported
215 \end{itemize}
216 \end{frame}
217
218 \begin{frame}[fragile]
219 \frametitle{LD/LD.S/LD.X (or trap, or actual hardware loop)}
220
221 \begin{semiverbatim}
222 if (unit-strided) stride = elsize;
223 else stride = areg[as2]; // constant-strided
224 for (int i = 0; i < VL; ++i)
225 if (preg_enabled[rd] && ([!]preg[rd] & 1<<i))
226 for (int j = 0; j < seglen+1; j++)
227 if (vectorised[rs2]) offs = vreg[rs2][i]
228 else offs = i*(seglen+1)*stride;
229 vreg[rd+j][i] = mem[sreg[base] + offs + j*stride]
230 \end{semiverbatim}
231
232 \begin{itemize}
233 \item Again: SIMD slightly more complex
234 \item rs2 vectorised taken to implicitly indicate LD.X
235 \end{itemize}
236 \end{frame}
237
238
239 \frame{\frametitle{Opcodes, compared to RVV}
240
241 \begin{itemize}
242 \item All integer and FP opcodes removed (no CLIP!)\vspace{10pt}
243 \item VMPOP, VFIRST etc. all removed (use xBitManip)\vspace{10pt}
244 \item VSLIDE, VEXTRACT, VINSERT removed (using regfile)\vspace{10pt}
245 \item VSETVL, VGETVL, VSELECT stay\vspace{10pt}
246 \item Issue: VCLIP is not in RV* (add with custom ext?)\vspace{10pt}
247 \item Vector (or scalar-vector) use C.MV (MV is a pseudo-op)\vspace{10pt}
248 \item VMERGE: twin predicated C.MVs (one inverted, macro-op'd)\vspace{10pt}
249 \end{itemize}
250 }
251
252
253 \frame{\frametitle{Under consideration}
254
255 \begin{itemize}
256 \item Can VSELECT be removed (or overloaded onto xBitManip)?\vspace{10pt}
257 \item Can CLIP be done as a CSR (mode, like elwidth)?\vspace{10pt}
258 \item SIMD saturation (etc.) also set as a mode?\vspace{10pt}
259 \end{itemize}
260 }
261
262
263 \frame{\frametitle{slide}
264
265 \begin{itemize}
266 \item \vspace{10pt}
267 \end{itemize}
268 Considerations:\vspace{10pt}
269 \begin{itemize}
270 \item \vspace{10pt}
271 \end{itemize}
272 }
273
274
275 \frame{\frametitle{slide}
276
277 \begin{itemize}
278 \item \vspace{10pt}
279 \end{itemize}
280 Considerations:\vspace{10pt}
281 \begin{itemize}
282 \item \vspace{10pt}
283 \end{itemize}
284 }
285
286
287 \frame{\frametitle{Including a plot}
288 \begin{center}
289 % \includegraphics[height=2in]{dental.ps}\\
290 {\bf \red Dental trajectories for 27 children:}
291 \end{center}
292 }
293
294 \frame{\frametitle{Creating .pdf slides in WinEdt}
295
296 \begin{itemize}
297 \item LaTeX [Shift-Control-L]\vspace{10pt}
298 \item dvi2pdf [click the button]\vspace{24pt}
299 \end{itemize}
300 To print 4 slides per page in acrobat click\vspace{10pt}
301 \begin{itemize}
302 \item File/print/properties\vspace{10pt}
303 \item Change ``pages per sheet'' to 4\vspace{10pt}
304 \end{itemize}
305 }
306
307 \frame{
308 \begin{center}
309 {\Huge \red The end}
310 \end{center}
311 }
312
313
314 \end{document}