Replaced shadow.jpg with svg one.
[libreriscv.git] / conferences / fosdem2021_libresoc.tex
1 \documentclass[slidestop]{beamer}
2 \usepackage{beamerthemesplit}
3 \usepackage{graphics}
4 \usepackage{pstricks}
5
6 \graphicspath{{./}}
7
8 \title{The Libre-SOC Hybrid 3D CPU}
9 \author{Luke Kenneth Casson Leighton}
10
11
12 \begin{document}
13
14 \frame{
15 \begin{center}
16 \huge{The Libre-SOC Hybrid 3D CPU}\\
17 \vspace{32pt}
18 \Large{Augmenting the OpenPOWER ISA}\\
19 \Large{to provide 3D and Video instructions}\\
20 \Large{(properly and officially) and make a GPU}\\
21 \vspace{24pt}
22 \Large{FOSDEM2021}\\
23 \vspace{16pt}
24 \large{Sponsored by NLnet's PET Programme}\\
25 \vspace{6pt}
26 \large{\today}
27 \end{center}
28 }
29
30
31 \frame{\frametitle{Why another SoC?}
32
33 \begin{itemize}
34 \item Intel Management Engine, Apple QA issues, Spectre\vspace{6pt}
35 \item Endless proprietary drivers, ``simplest'' solution: \\
36 License proprietary hard macros (with proprietary firmware)\\
37 Adversely affects product development cost\\
38 due to opaque driver bugs (Samsung S3C6410 / S5P100)
39 \vspace{6pt}
40 \item Alternative: Intel and Valve-Steam collaboration\\
41 ``Most productive business meeting ever!''\\
42 https://tinyurl.com/valve-steam-intel
43 \vspace{6pt}
44 \item Because for 30 years I Always Wanted To Design A CPU
45 \vspace{6pt}
46 \item Ultimately it is a strategic \textit{business} objective to
47 develop entirely Libre hardware, firmware and drivers.
48 \end{itemize}
49 }
50
51
52 \frame{\frametitle{Why OpenPOWER?}
53
54 \vspace{15pt}
55
56 \begin{itemize}
57 \item Good ecosystem essential\\
58 linux kernel, u-boot, compilers, OSes,\\
59 Reference Implementation(s)\vspace{10pt}
60 \item Supportive Foundation and Members\\
61 need to be able to submit ISA augmentations\\
62 (for proper peer review)\vspace{10pt}
63 \item No NDAs, full transparency must be acceptable\\
64 due to being funded under NLnet's PET Programme\vspace{10pt}
65 \item OpenPOWER: established for decades, excellent Foundation,\\
66 Microwatt as Reference, approachable and friendly.
67 \end{itemize}
68 }
69
70 \frame{\frametitle{How can you help?}
71
72 \vspace{5pt}
73
74 \begin{itemize}
75 \item Start here! https://libre-soc.org \\
76 Mailing lists https://lists.libre-soc.org \\
77 IRC Freenode \#libre-soc \\
78 etc. etc. (it's a Libre project, go figure) \\
79 \vspace{3pt}
80 \item Can I get paid? Yes! NLnet funded\\
81 See https://libre-soc.org/nlnet/\#faq \\
82 \vspace{3pt}
83 \item Also profit-sharing in any commercial ventures \\
84 \vspace{3pt}
85 \item How many opportunities to develop Libre SoCs exist,\\
86 and actually get paid for it?
87 \vspace{3pt}
88 \item I'm not a developer, how can I help?\\
89 - Plenty of research needed, artwork, website \\
90 - Help find customers and OEMs willing to commit (LOI)
91 \end{itemize}
92 }
93
94
95
96 \frame{\frametitle{What goes into a typical SoC?}
97 \vspace{9pt}
98 \begin{itemize}
99 \item 15 to 20mm BGA package: 2.5 to 5 watt power consumption\\
100 heat sink normally not required (simplifies overall design)
101 \vspace{3pt}
102 \item Fully-integrated peripherals (not Northbridge/Southbridge)\\
103 USB, HDMI, RGB/TTL, SD/MMC, I2C, UART, SPI, GPIO etc. etc.
104 \vspace{3pt}
105 \item Built-in GPU (shared memory bus, 3rd party licensed) \vspace{3pt}
106 \item Built-in VPU (likewise, proprietary)\vspace{3pt}
107 \item Target price between \$2.50 and \$30 depending on market\\
108 Radically different from IBM POWER9 Core (200 Watt)
109 \vspace{3pt}
110 \item We're doing the same, just with a hybrid architecture.\\
111 CPU == GPU == VPU
112 \end{itemize}
113 }
114
115
116
117 \frame{\frametitle{Simple SBC-style SoC}
118
119 \begin{center}
120 \includegraphics[width=0.9\textwidth]{shakti_libre_soc.jpg}
121 \end{center}
122
123 }
124
125 \frame{\frametitle{What's different about Libre-SOC?}
126
127 \begin{itemize}
128 \item Hybrid - integrated. The CPU \textit{is} the GPU.\\
129 The GPU \textit{is} the CPU. The VPU \textit{is} the CPU.\\
130 \textit{There is No Separate VPU/GPU Pipeline or Processor}\\
131 \vspace{9pt}
132 \item written in nmigen (a python-based HDL). Not VHDL\\
133 not Verilog (definitely not Chisel3/Scala)\\
134 This is an extremely important strategic decision.\\
135 \vspace{9pt}
136 \item Simple-V Vector Extension. See `SIMD Considered harmful'.\\
137 https://tinyurl.com/simd-considered-harmful\\
138 SV effectively a ``hardware for-loop'' on standard scalar ISA\\
139 (conceptually similar to Zero-Overhead Loops in DSPs)
140 \vspace{6pt}
141 \item Yes great, but what's different compared to Intel, AMD, NVIDIA,
142 ARM and IBM?
143 \end{itemize}
144 }
145
146 \frame{\frametitle{OpenPOWER Cell Processor and upwards}
147
148 \begin{itemize}
149 \item OpenPOWER ISA developed from PowerPC, with the RS6000 in the 90s.
150 \vspace{6pt}
151 \item Sony, IBM and Toshiba began the Cell Processor in 2001 \\
152 (Sony Playstation 3) - NUMA approach
153 \vspace{6pt}
154 \item Raw brute-force performance pissed all over the competition
155 at the time
156 \vspace{6pt}
157 \item VSX later evolved out of this initiative.
158 \vspace{6pt}
159 \item VSX, a SIMD extension, now showing its age. \\
160 Fixed-width, no predication, limited pixel formats (15 bit)
161 \vspace{6pt}
162 \item (Vulkan requires dozens of pixel formats)
163 \end{itemize}
164 }
165
166 \frame{\frametitle{Apple M1 (ARM) vs Intel / AMD (x86)}
167
168 \begin{itemize}
169 \item Very interesting article: tinyurl.com/apple-m1-review
170 \item Apple M1: uses ARM. Intel: implements x86
171 \item Apple M1: RISC multi-issue. Intel: CISC multi-issue.
172 \item Apple M1: uniform (easy) instruction decode \\
173 Intel: \textit{Cannot easily identify start of instruction}
174 \item Result: multi-issue x86 decoder is so complex, it misses
175 opportunities to keep back-end execution engines 100 percent
176 occupied
177 \item OpenPOWER happens to be RISC (easy decode), which is why POWER10
178 has 8-way multi-issue.
179 \item Libre-SOC can do the same tricks that IBM POWER10 and Apple M1
180 can. Intel (x86) literally cannot keep up.
181 \end{itemize}
182 }
183
184
185 \frame{\frametitle{Hybrid Architecture: Augmented 6600}
186
187 \begin{itemize}
188 \item CDC 6600 is a design from 1965. The \textit{augmentations} are not.\\
189 Help from Mitch Alsup includes \textit{precise exceptions}, \\
190 multi-issue and more. Academic literature on 6600 utterly misleading.
191 6600 Scoreboards completely underestimated (Seymour Cray and
192 James Thornton
193 solved problems they didn't realise existed elsewhere!)
194 \item Front-end Vector ISA, back-end ``Predicated (masked) SIMD''\\
195 nmigen (python OO) strategically critical to achieving this.
196 \item Out-of-order combined with Simple-V allows scalar operations\\
197 at the developer end to be turned into SIMD at the back-end\\
198 \textit{without the developer needing to do SIMD}
199 \item IEEE754 sin / cos / atan2, Texturisation opcodes, YUV2RGB\\
200 all automatically vectorised.
201 \end{itemize}
202 }
203
204 \frame{\frametitle{Learning from these and putting it together}
205
206 \begin{itemize}
207 \item Apple M1 and IBM POWER10 show that RISC plus superscalar
208 multi-issue produces insane performance
209 \item Intel AVX 512 and CISC in general are getting out of hand (what's
210 next: 256-bit length instructions, AVX 1024?)
211 \item RISC-V RVV shows Cray-style Vectors can save power. Simple-V
212 has the same benefits with far fewer instructions (188 for RVV,
213 3 to 5 new instructions for Simple-V).
214 \item CDC 6600 shows that intelligently-implemented designs can do the
215 job, with far fewer resources.
216 \item Libre-SOC combines the best of historical processor designs,
217 co-opting and innovating on them (pissing in the back yard of
218 every incumbent CPU and GPU company in the process).
219 \item It's a Libre project: you get to help
220 \end{itemize}
221 }
222
223
224 \frame{\frametitle{Why nmigen?}
225
226 \begin{itemize}
227 \item Uses python to build an AST (Abstract Syntax Tree).
228 Actually hands that over to yosys (to create ILANG file)
229 after which verilog can (if necessary) be created
230 \item Deterministic synthesisable behaviour (Signals are declared
231 with their reset pattern: no more forgetting ``if rst'' block).
232 \item python OO programming techniques can be deployed. classes
233 and functions created which pass in parameters which change
234 what HDL is created (IEEE754 FP16 / 32 / 64 for example)
235 \item python-based for-loops can e.g. read CSV files then generate
236 a hierarchical nested suite of HDL Switch / Case statements
237 (this is how the Libre-SOC PowerISA decoder is implemented)
238 \item extreme OO abstraction can even be used to create ``dynamic
239 partitioned Signals'' that have the same operator-overloaded
240 ``add'', ``subtract'', ``greater-than'' operators
241
242 \end{itemize}
243 }
244
245 \frame{\frametitle{Why another Vector ISA? (or: not-exactly another)}
246
247 \begin{itemize}
248 \item Simple-V is a `register tag' system. \textit{There are no opcodes}\\
249 SV `tags' scalar operations (scalar regfiles) as `vectorised'
250 \item (PowerISA SIMD is around 700 opcodes, making it unlikely to be
251 able to fit a PowerISA decoder in only one clock cycle)
252 \item Effectively a `hardware sub-counter for-loop': pauses the PC\\
253 then rolls incrementally through the operand register numbers\\
254 issuing \textit{multiple} scalar instructions into the pipelines\\
255 (hence the reason for a multi-issue OoO microarchitecture)
256 \item Current \textit{and future} PowerISA scalar opcodes inherently
257 \textit{and automatically} become `vectorised' by SV without
258 needing an explicit new Vector opcode.
259 \item Predication and element width polymorphism are also `tags'.
260 elwidth polymorphism allows for BF16 / FP16 / 80 / 128 to be added to
261 the ISA \textit{without modifying the ISA}
262
263 \end{itemize}
264 }
265
266 \frame{\frametitle{Quick refresher on SIMD}
267
268 \begin{itemize}
269 \item SIMD very easy to implement (and very seductive)
270 \item Parallelism is in the ALU
271 \item Zero-to-Negligible impact for rest of core
272 \end{itemize}
273 Where SIMD Goes Wrong:\vspace{6pt}
274 \begin{itemize}
275 \item See ``SIMD instructions considered harmful''
276 https://sigarch.org/simd-instructions-considered-harmful
277 \item Setup and corner-cases alone are extremely complex.\\
278 Hardware is easy, but software is hell.\\
279 strncpy VSX patch for POWER9: 250 hand-written asm lines!\\
280 (RVV / SimpleV strncpy is 14 instructions)
281 \item O($N^{6}$) ISA opcode proliferation (1000s of instructions)\\
282 opcode, elwidth, veclen, src1-src2-dest hi/lo
283 \end{itemize}
284 }
285
286 \begin{frame}[fragile]
287 \frametitle{Simple-V ADD in a nutshell}
288
289 \begin{semiverbatim}
290 function op\_add(rd, rs1, rs2, predr) # add not VADD!
291  int i, id=0, irs1=0, irs2=0;
292  for (i = 0; i < VL; i++)
293   if (ireg[predr] & 1<<i) # predication uses intregs
294    ireg[rd+id] <= ireg[rs1+irs1] + ireg[rs2+irs2];
295 if (reg\_is\_vectorised[rd] )  \{ id += 1; \}
296 if (reg\_is\_vectorised[rs1])  \{ irs1 += 1; \}
297 if (reg\_is\_vectorised[rs2])  \{ irs2 += 1; \}
298 \end{semiverbatim}
299
300 \begin{itemize}
301 \item Above is oversimplified: Reg. indirection left out (for clarity).
302 \item SIMD slightly more complex (case above is elwidth = default)
303 \item Scalar-scalar and scalar-vector and vector-vector now all in one
304 \item OoO may choose to push ADDs into instr. queue (v. busy!)
305 \end{itemize}
306 \end{frame}
307
308 \frame{\frametitle{Additional Simple-V features}
309
310 \begin{itemize}
311 \item ``fail-on-first'' (POWER9 VSX strncpy segfaults on boundary!)
312 \item ``Twin Predication'' (covers VSPLAT, VGATHER, VSCATTER, VINDEX etc.)
313 \item SVP64: extensive ``tag'' (Vector context) augmentation
314 \item ``Context propagation'': a VLIW-like context. Allows contexts
315 to be repeatedly applied.
316 Effectively a ``hardware compression algorithm'' for ISAs.
317 \item Ultimate goal: cut down I-Cache usage, which cuts down on power
318 \item Typical GPU has its own I-Cache and small shaders.\\
319 \textit{We are a Hybrid CPU/GPU: I-Cache is not separate!}
320 \item Needs to go through OpenPOWER Foundation `approval'
321 \end{itemize}
322 }
323
324
325 \frame{\frametitle{Summary}
326
327 \begin{itemize}
328 \item Goal is to create a mass-volume low-power embedded SoC suitable
329 for use in netbooks, chromebooks, tablets, smartphones, IoT SBCs.
330 \item No way we could implement a project of this magnitude without
331 nmigen (being able to use python OO to HDL)
332 \item Collaboration with OpenPOWER Foundation and Members absolutely
333 essential. No short-cuts. Standards to be developed and ratified
334 so that everyone benefits.
335 \item Riding the wave of huge stability of OpenPOWER ecosystem
336 \item Greatly simplified open 3D and Video drivers reduces product
337 development costs for customers
338 \item It also happens to be fascinating, deeply rewarding, technically
339 challenging, and funded by NLnet
340
341 \end{itemize}
342 }
343
344
345 \frame{
346 \begin{center}
347 {\Huge The end\vspace{12pt}\\
348 Thank you\vspace{12pt}\\
349 Questions?\vspace{12pt}
350 }
351 \end{center}
352
353 \begin{itemize}
354 \item Discussion: http://lists.libre-soc.org
355 \item Freenode IRC \#libre-soc
356 \item http://libre-soc.org/
357 \item http://nlnet.nl/PET
358 \item https://libre-soc.org/nlnet/\#faq
359 \end{itemize}
360 }
361
362
363 \end{document}