more slides
[libreriscv.git] / simple_v_extension / simple_v_chennai_2018.tex
1 \documentclass[slidestop]{beamer}
2 \usepackage{beamerthemesplit}
3 \usepackage{graphics}
4 \usepackage{pstricks}
5
6 \title{Simple-V RISC-V Extension for Vectorisation and SIMD}
7 \author{Luke Kenneth Casson Leighton}
8
9
10 \begin{document}
11
12 \frame{
13 \begin{center}
14 \huge{Simple-V RISC-V Extension for Vectors and SIMD}\\
15 \vspace{32pt}
16 \Large{Flexible Vectorisation}\\
17 \Large{(aka not so Simple-V?)}\\
18 \vspace{24pt}
19 \Large{Chennai 9th RISC-V Workshop}\\
20 \vspace{24pt}
21 \large{\today}
22 \end{center}
23 }
24
25 \frame{\frametitle{Why another Vector Extension?}
26
27 \begin{itemize}
28 \item RVV very heavy-duty (excellent for supercomputing)\vspace{10pt}
29 \item Simple-V abstracts parallelism (based on best of RVV)\vspace{10pt}
30 \item Graded levels: hardware or software-emulation\vspace{10pt}
31 \item Even Compressed instructions become vectorised\vspace{10pt}
32 \end{itemize}
33 What Simple-V is not:\vspace{10pt}
34 \begin{itemize}
35 \item A full supercomputer-level Vector Proposal\vspace{10pt}
36 \item A replacement for RVV (designed to be augmented)\vspace{10pt}
37 \end{itemize}
38 }
39
40 \frame{\frametitle{Quick refresher on SIMD}
41
42 \begin{itemize}
43 \item SIMD very easy to implement (and very seductive)\vspace{10pt}
44 \item Parallelism is in the ALU\vspace{10pt}
45 \item Zero-to-Negligible impact for rest of core\vspace{10pt}
46 \end{itemize}
47 Where SIMD Goes Wrong:\vspace{10pt}
48 \begin{itemize}
49 \item See ``Why SIMD considered harmful''\vspace{10pt}
50 \item (Corner-cases alone are extremely complex)\vspace{10pt}
51 \item O($N^{6}$) ISA opcode proliferation!\vspace{10pt}
52 \end{itemize}
53 }
54
55 \frame{\frametitle{Quick refresher on RVV}
56
57 \begin{itemize}
58 \item Extremely powerful (extensible to 256 registers)\vspace{10pt}
59 \item Supports polymorphism, several datatypes (inc. FP16)\vspace{10pt}
60 \item Requires a separate Register File\vspace{10pt}
61 \item Can be implemented as a separate pipeline\vspace{10pt}
62 \end{itemize}
63 However\dots\vspace{10pt}
64 \begin{itemize}
65 \item 98 percent opcode duplication with rest of RV (CLIP)\vspace{10pt}
66 \item Extending RVV requires customisation\vspace{10pt}
67 \end{itemize}
68 }
69
70
71 \frame{\frametitle{How is Parallelism abstracted?}
72
73 \begin{itemize}
74 \item Almost all opcodes removed in favour of implicit ``typing''\vspace{10pt}
75 \item Primarily at the Instruction issue phase (except SIMD)\vspace{10pt}
76 \item Standard (and future, and custom) opcodes now parallel\vspace{10pt}
77 \end{itemize}
78 Notes:\vspace{10pt}
79 \begin{itemize}
80 \item LOAD/STORE (inc. C.LD and C.ST, LDX: everything)\vspace{10pt}
81 \item All ALU ops (soft / hybrid / full HW, on per-op basis)\vspace{10pt}
82 \item All branches become predication targets (C.FNE added)\vspace{10pt}
83 \end{itemize}
84 }
85
86 \begin{frame}[fragile]
87 \frametitle{ADD pseudocode (or trap, or actual hardware loop)}
88
89 \begin{semiverbatim}
90 function op_add(rd, rs1, rs2, predr) \{ # add not PADD!
91  int i, id=0, irs1=0, irs2=0;
92  for (i=0; i < MIN(VL, vectorlen[rd]); i++)
93   if (ireg[predr] & 1<<i) # predication uses intregs
94      ireg[rd+id] <= ireg[rs1+irs1] + ireg[rs2+irs2];
95   # now increment idxs: src/dest all vec/scalar
96   if (reg_is_vectorised[rd]) \{ id += 1; \}
97   if (reg_is_vectorised[rs1]) \{ irs1 += 1; \}
98   if (reg_is_vectorised[rs2]) \{ irs2 += 1; \}
99 \}
100 \end{semiverbatim}
101 \begin{itemize}
102 \item Scalar-scalar and scalar-vector and vector-vector now all in one
103 \item OoO may choose to push ADDs into instr. queue (v. busy!)
104 \end{itemize}
105 \end{frame}
106
107 \frame{\frametitle{How are SIMD Instructions Vectorised?}
108
109 \begin{itemize}
110 \item SIMD ALU(s) primarily unchanged\vspace{10pt}
111 \item Predication is added to each SIMD element (NO ZEROING!)\vspace{10pt}
112 \item End of Vector enables predication (NO ZEROING!)\vspace{10pt}
113 \end{itemize}
114 Considerations:\vspace{10pt}
115 \begin{itemize}
116 \item Many SIMD ALUs possible (parallel execution)\vspace{10pt}
117 \item Very long SIMD ALUs could waste die area (short vectors)\vspace{10pt}
118 \item Implementor free to choose (API remains the same)\vspace{10pt}
119 \end{itemize}
120 }
121
122 \frame{\frametitle{What's the deal / juice / score?}
123
124 \begin{itemize}
125 \item Standard Register File(s) overloaded with ``vector span''\vspace{10pt}
126 \item Element width and type concepts remain same as RVV\vspace{10pt}
127 \item CSRs are key-value tables (overlaps allowed)\vspace{10pt}
128 \end{itemize}
129 Key differences from RVV:\vspace{10pt}
130 \begin{itemize}
131 \item Predication in INT regs as a BIT field (max VL=XLEN)\vspace{10pt}
132 \item Minimum VL must be Num Regs - 1 (all regs single LD/ST)\vspace{10pt}
133 \item NO ZEROING: non-predicated elements are skipped\vspace{10pt}
134 \end{itemize}
135 }
136
137 \frame{\frametitle{Why are overlaps allowed in Regfiles?}
138
139 \begin{itemize}
140 \item Same register(s) can have multiple ``interpretations''\vspace{10pt}
141 \item xBitManip plus SIMD plus xBitManip = Hi/Lo bitops\vspace{10pt}
142 \item (32-bit GREV plus 4-wide 32-bit SIMD plus 32-bit GREVI)\vspace{10pt}
143 \item 32-bit op followed by 16-bit op w/ 2x VL, 1/2 predicated\vspace{10pt}
144 \end{itemize}
145 Note:\vspace{10pt}
146 \begin{itemize}
147 \item xBitManip reduces O($N^{6}$) SIMD down to O($N^{3}$) \vspace{10pt}
148 \item Hi-Performance: Macro-op fusion (more pipeline stages?)\vspace{10pt}
149 \end{itemize}
150 }
151
152
153 \frame{\frametitle{Why no Zeroing (place zeros in non-predicated elements)?}
154
155 \begin{itemize}
156 \item Zeroing is an implementation optimisation favouring OoO\vspace{10pt}
157 \item Simple implementations may skip non-predicated operations\vspace{10pt}
158 \item Simple implementations explicitly have to destroy data\vspace{10pt}
159 \item Complex implementations may use reg-renames to save power\vspace{10pt}
160 \end{itemize}
161 Considerations:\vspace{10pt}
162 \begin{itemize}
163 \item Complex not really impacted, Simple impacted a LOT\vspace{10pt}
164 \item Please don't use Vectors for ``security'' (use Sec-Ext)\vspace{10pt}
165 \end{itemize}
166 }
167
168
169 \frame{\frametitle{slide}
170
171 \begin{itemize}
172 \item \vspace{10pt}
173 \end{itemize}
174 Considerations:\vspace{10pt}
175 \begin{itemize}
176 \item \vspace{10pt}
177 \end{itemize}
178 }
179
180
181 \frame{\frametitle{slide}
182
183 \begin{itemize}
184 \item \vspace{10pt}
185 \end{itemize}
186 Considerations:\vspace{10pt}
187 \begin{itemize}
188 \item \vspace{10pt}
189 \end{itemize}
190 }
191
192
193 \frame{\frametitle{slide}
194
195 \begin{itemize}
196 \item \vspace{10pt}
197 \end{itemize}
198 Considerations:\vspace{10pt}
199 \begin{itemize}
200 \item \vspace{10pt}
201 \end{itemize}
202 }
203
204
205 \frame{\frametitle{Including a plot}
206 \begin{center}
207 % \includegraphics[height=2in]{dental.ps}\\
208 {\bf \red Dental trajectories for 27 children:}
209 \end{center}
210 }
211
212 \frame{\frametitle{Creating .pdf slides in WinEdt}
213
214 \begin{itemize}
215 \item LaTeX [Shift-Control-L]\vspace{10pt}
216 \item dvi2pdf [click the button]\vspace{24pt}
217 \end{itemize}
218 To print 4 slides per page in acrobat click\vspace{10pt}
219 \begin{itemize}
220 \item File/print/properties\vspace{10pt}
221 \item Change ``pages per sheet'' to 4\vspace{10pt}
222 \end{itemize}
223 }
224
225 \frame{
226 \begin{center}
227 {\Huge \red The end}
228 \end{center}
229 }
230
231
232 \end{document}