(no commit message)
[libreriscv.git] / conferences / openpower2021 / openpower_2021.tex
1 \documentclass[slidestop]{beamer}
2 \usepackage{beamerthemesplit}
3 \usepackage{graphics}
4 \usepackage{pstricks}
5
6 \graphicspath{{./}}
7
8 \title{The Libre-SOC Hybrid 3D CPU}
9 \author{Luke Kenneth Casson Leighton}
10
11
12 \begin{document}
13
14 \frame{
15 \begin{center}
16 \huge{The Libre-SOC Hybrid 3D CPU}\\
17 \vspace{32pt}
18 \Large{Draft SVP64 in-place Matrix Multiply}\\
19 \Large{and FFT / DCT for the Power ISA}\\
20 \vspace{24pt}
21 \Large{OpenPOWER Summit 2021}\\
22 \vspace{16pt}
23 \large{Sponsored by NLnet Grants}\\
24 \large{and NGI POINTER Grants}\\
25 \vspace{6pt}
26 \large{28th Oct 2021}
27 \end{center}
28 }
29
30
31 \frame{\frametitle{Overview of Libre-SOC goals}
32
33 \vspace{8pt}
34
35 \begin{itemize}
36 \item To create power-efficient mass-volume products\vspace{8pt}
37 \item To leverage the OpenPOWER ecosystem to do so\vspace{8pt}
38 \item To be entirely transparent for Security reasons\vspace{8pt}
39 \item To empower businesses to bring Secure transparent\\
40 mass-volume products to market\vspace{8pt}
41 \item Mass-volume end-user products need 3D, Video, Audio
42 \textbf{therefore we require small-size Matrices (3x3 but not with
43 75\% utilisation, and 4x4) and the core strategic parts
44 of A/V CODECs and that means DCT and FFT.}
45 Anything else is a bonus (NTT with Galois Field bitmanip)
46 \end{itemize}
47 }
48
49 \frame{\frametitle{Overview of SVP64 goals}
50
51 \vspace{15pt}
52
53 \begin{itemize}
54 \item High performance and high performance/watt\vspace{10pt}
55 \item Reduced code density (reduced I-Cache usage)\\
56 https://arxiv.org/abs/2002.10143 - 3.5x power reduction\vspace{8pt}
57 \item Remain accessible for assembler writers and compilers alike\vspace{10pt}
58 \item Introduce true Vectorisation to the Power ISA\\
59 (VSX is Packed SIMD)\vspace{8pt}
60 \item Be adopted via the external OPF ISA WG RFC process\\
61 (not: be a non-official custom extension. proprietary\\
62 custom extensions conflict with mass-volume adoption)\vspace{10pt}
63 \end{itemize}
64 }
65
66
67
68 \begin{frame}[fragile]
69 \frametitle{Reminder of Simple-V}
70
71 \begin{semiverbatim}
72 https://libre-soc.org/openpower/sv/overview/
73 Greatly simplified (like x86 "REP" instruction):
74
75  for (i = 0; i < VL; i++)
76    GPR[RT+i] <= GPR[RA+i] + GPR[RB+i];
77
78 function op\_add(RT, RA, RB, predr) # add not VADD!
79  int i, id=0, irs1=0, irs2=0;
80  for (i = 0; i < VL; i++)
81   if (GPR[predr] & 1<<i) # predication
82    GPR[RT+id] <= GPR[RA+irs1] + GPR[RB+irs2];
83 if (reg\_is\_vectorised[RT])  \{ id += 1; \}
84 if (reg\_is\_vectorised[RA])  \{ irs1 += 1; \}
85 if (reg\_is\_vectorised[RB])  \{ irs2 += 1; \}
86 \end{semiverbatim}
87
88 \end{frame}
89
90
91 \begin{frame}[fragile]
92 \frametitle{SVP64 REMAP system}
93
94 \begin{semiverbatim}
95 Register offsets are "REMAP"ed through a Hardware FSM
96 https://libre-soc.org/openpower/sv/remap/
97 remarkably similar to ZOLC
98 https://www.researchgate.net/publication/224647569
99
100 function op\_add(RT, RA, rs2, predr) # add not VADD!
101  int i, id=0, irs1=0, irs2=0;
102  for (i = 0; i < VL; i++)
103   if (GPR[predr] & 1<<i) # predication
104    GPR[RT+REMAP(id)] <= GPR[RA+REMAP(irs1)] +
105 GPR[rs2+REMAP(irs2)];
106 if (reg\_is\_vectorised[RT])  \{ id += 1; \}
107 if (reg\_is\_vectorised[RA])  \{ irs1 += 1; \}
108 if (reg\_is\_vectorised[s2])  \{ irs2 += 1; \}
109 \end{semiverbatim}
110
111 \end{frame}
112
113 \begin{frame}[fragile]
114 \frametitle{Matrix Multiply: Basics}
115
116 \begin{semiverbatim}
117 (a00 a01 a02 x (b00 b01 = (c00 c01
118 a10 a11 a12) b10 b11 c10 c11) = ...
119 b20 b21)
120
121 (a00*b00 + a01*b10 + a02*b20 a00*b01 + a01*b11 + a02*b21
122 a10*b00 + a11*b10 + a12*b20 a10*b01 + a11*b11 + a12*b21)
123
124 (b00 b01 x (a00 a01 a02 = (c00 c01 c02
125 b10 b11 a10 a11 a12) c10 c11 c12
126 b20 b21) c20 c21 c22) = ...
127
128 (b00*a00 + b01*a10 b00*a01 + b01*a11 b00*a02 + b01*a12
129 b10*a00 + b11*a10 b10*a01 + b11*a11 b10*a02 + b11*a12
130 b20*a00 + b21*a10 b20*a01 + b21*a11 b20*a02 + b21*a12)
131
132 \end{semiverbatim}
133
134 \end{frame}
135
136
137 \begin{frame}[fragile]
138 \frametitle{Matrix Multiply: naive, with python for-loops}
139
140 \begin{semiverbatim}
141 result = [] # final result
142 for i in range(len(A)):
143
144 row = [] # the new row in new matrix
145 for j in range(len(B[0])):
146
147 product = 0 # the new element in the new row
148 for v in range(len(A[i])):
149 product += A[i][v] * B[v][j]
150 row.append(product) # add sum of product to new row
151
152 result.append(row) # add new row into final result
153 \end{semiverbatim}
154
155 \end{frame}
156
157 \begin{frame}[fragile]
158 \frametitle{Matrix Multiply: suitable for Hardware scheduling}
159
160 \begin{semiverbatim}
161 Unsuitable: creates massive Read-After-Write chains
162
163 for i in range(len(A)):
164 for j in range(len(B[0])):
165 for v in range(len(A[i])):
166 product[i][j] += A[i][v] * B[v][j]
167
168 Suitable: can be parallelised / pipelined. RaW avoided
169
170 for i in range(len(A)):
171 for v in range(len(A[i])): # swapped
172 for j in range(len(B[0])): # with this
173 product[i][j] += A[i][v] * B[v][j]
174
175 \end{semiverbatim}
176
177 \end{frame}
178
179
180 \frame{\frametitle{Matrix Multiply: Generalise but Specialise}
181
182 \begin{itemize}
183 \item Why not make a general-purpose nested "Loop" system?\\
184 - Other uses (algorithms) beyond Matrix Multiplication\\
185 - Allow any arbitrary-sized loops\\
186 - Allow any permutation of nesting\\
187 - Allow reversing per-dimension
188 \item Specialise by making Matrix Multiply "setup" quick/easy\\
189 - two 32-bit instructions to set up A, B, C sizes\\
190 - one 64-bit SVP64 FMAC instruction (hot-loop).\\
191 - Nothing else needed. Saves on I-Cache
192 \item Hardware turns out to be near-identical to ZOLC\\
193 https://opencores.org/projects/hwlu\\
194 https://libre-soc.org/openpower/sv/remap/
195 \item Concept is actually borrowed from Aspex Array-String Processor
196 1D/2D/3D Memory DMA "reordering" Engine (except applied to
197 the register file)
198 \end{itemize}
199 }
200
201 \begin{frame}[fragile]
202 \frametitle{Matrix Multiply: unit test / example}
203
204 \begin{semiverbatim}
205 94 def test_sv_remap2(self):
206 95 lst = ["svshape 5, 4, 3, 0, 0",
207 96 "svremap 0b11111, 1, 2, 3, 0, 0, 0, 0",
208 97 "sv.fmadds 0.v, 8.v, 16.v, 0.v"
209 98 ]
210 99 REMAP fmadds FRT, FRA, FRC, FRB
211
212 svshape 5, 4, 3, 0, 0 => A: 3x5 B: 3x4
213 => C: 3x3
214 svremap (enable) (F)RS, (F)RT, (F)RA, (F)RB, (F)RC
215 sv.fmadds: uses fp0 as accumulator
216 product[i][j] += A[i][v] * B[v][j]
217 \end{semiverbatim}
218
219 \end{frame}
220
221 \frame{\frametitle{Matrix Multiply: Ehm that's all Folks}
222
223 \vspace{6pt}
224
225 \begin{itemize}
226 \item Really is that straightforward: no actual Vector ops\\
227 - Does not dictate or limit micro-architectural detail\\
228 - Issues Scalar FMACs into existing back-end hardware\\
229 - Can use any 4-operand instruction (GF, INT, Bitmanip)\\
230 - No Power-2 limits. Any operand width (8/16/32/64)\vspace{8pt}
231 \item Limited to 127 scalar ops and in-place registers. Future?\\
232 - https://arxiv.org/abs/2002.10143 CISC-like load-and-inc\\
233 - Auto-load/store (tagged) registers, keeps RISC ISA\\
234 - Extend to memory-based arbitrary NxN matrix sizes\\
235 - Still power-efficient: no I-cache usage during FMAC issue\vspace{8pt}
236 \item Future can be investigated as part of EUR 22.6m EU Grant\\
237 https://libre-soc.org/SEP-210803722-Libre-SOC-8-core/\vspace{15pt}
238 \end{itemize}
239 }
240
241
242 \frame{\frametitle{DCT / FFT / DFT / NTT: what if we could REMAP?}
243
244 \vspace{6pt}
245
246 \begin{itemize}
247 \item Can we create a REMAP Schedule for FFT (etc)? YES\\
248 - More complicated than Matrix Schedules but same principle\\
249 - Again: issues Scalar instructions into back-end micro-arch\\
250 - Requires 5-operand (3-in, 2-out) new Scalar Instructions\\
251 - Any operand width (8/16/32/64)\vspace{8pt}
252 \item Limited to in-place registers and Power-of-Two. Future?\\
253 - Again: CISC-like auto-load/store-and-increment\\
254 - https://arxiv.org/abs/2002.10143\\
255 - Again: still power-efficient (no I-Cache usage in loops)\vspace{8pt}
256 \item Again: can be investigated as part of EUR 22.6m EU Grant\\
257 https://libre-soc.org/SEP-210803722-Libre-SOC-8-core/\vspace{15pt}
258 \end{itemize}
259 }
260
261
262 \frame{\frametitle{DCT / FFT / DFT / NTT: other implementations?}
263
264 \vspace{6pt}
265
266 \begin{itemize}
267 \item Texas Instruments TMS320 and C6xx DSPs (VLIW) \\
268 - 14 u-Ops per VLIW (including Zero-Overhead Looping)\\
269 - Performs Odd/Even FP32 looping single-instruction FFT\\
270 - Cannot do anything other than FP32\\
271 - Otherwise absolutely brilliant and elegant (20+ years)
272 \item Qualcom Hexagon DSP\\
273 - Again: VLIW (29 RISC-like u-Ops in 1 cycle)\\
274 - Seriously power-efficient and effective\\
275 - Has ZOLC and Complex-number Multiply\\
276 - Only seems to handle the inner loop of FFT though
277 \item SVP64\\
278 - not limited to inner loop (handles entire triple-loop) \\
279 - like Hexagon, not limted to type of operation inside loop \\
280 - Complex-number ops: a bit too heavy-duty for now (later?)
281 \end{itemize}
282 }
283
284
285 \frame{\frametitle{Discrete Cosine Transform (DCT): Basics}
286
287 \begin{itemize}
288 \item Standard DCT Schedule (messy, impossible for SIMD)
289 \item Output is in bit-reversed order\\
290 0b000 = 0b000 (in: 0 out: 0)\\
291 0b001 = 0b100 (in: 1 out: 4) ...\\
292 0b110 = 0b011 (in: 6 out: 3)\\
293 0b111 = 0b111 (in: 7 out: 7)
294 \end{itemize}
295
296 \begin{center}
297 \includegraphics[width=0.75\textwidth]{plonka_dct1.png}
298 \end{center}
299
300 }
301
302 \frame{\frametitle{Fast Fourier Transform (FFT/DFT): Butterfly Basics}
303
304 \begin{itemize}
305 \item Standard Butterfly Schedule (again: messy, but less so)
306 \item Output, again, is in bit-reversed order
307 \end{itemize}
308
309 \begin{center}
310 \includegraphics[width=0.70\textwidth]{fft_butterfly.png}
311 \end{center}
312
313 }
314
315 \frame{\frametitle{FFT: 3-in, 2-out butterfly}
316
317 \begin{itemize}
318 \item One multiply (by coefficient), one add, one subtract
319 \item inputs: X[0] X[1] C(oeff) outputs: X[0] X[1]
320 \end{itemize}
321
322 \begin{center}
323 \includegraphics[width=0.90\textwidth]{2-point-dft.png}
324 \end{center}
325
326 }
327
328
329 \begin{frame}[fragile]
330 \frametitle{DFT: Project Nayuki Radix-2 Cooley-Tukey DIT (1)}
331
332 \begin{semiverbatim}
333 coef = (2 if inverse else -2) * cmath.pi / n
334 exptable = [cmath.rect(1, i*coef) for i in range(n // 2)]
335 vec = [vec[reverse_bits(i, levels)] for i in range(n)]
336 size = 2
337 while size <= n:
338 halfsize, tablestep = size // 2, n // size
339 for i in range(0, n, size):
340 k = 0
341 for j in range(i, i + halfsize):
342 temp = vec[j + halfsize] * exptable[k]
343 vec[j + halfsize] = vec[j] - temp
344 vec[j] += temp
345 k += tablestep
346 size *= 2
347 \end{semiverbatim}
348
349 \end{frame}
350
351
352 \begin{frame}[fragile]
353 \frametitle{DFT: Project Nayuki Radix-2 Cooley-Tukey DIT (2)}
354
355 \begin{semiverbatim}
356 coef = (2 if inverse else -2) * cmath.pi / n
357 exptable = [cmath.rect(1, i*coef) for i in range(n // 2)]
358 vec = [vec[reverse_bits(i, levels)] for i in range(n)]
359 size = 2
360 while size <= n:
361 hs, tablestep = size // 2, n // size
362 for i in range(0, n, size):
363 k = 0
364 for j in range(i, i+hs):
365 # Twin-Butterfly 3-in 2-out: one instruction
366 C = exptable[k]
367 vec[j+hs], vec[j] = 2B(vec[j+hs], vec[j], C)
368 k += tablestep
369 size *= 2
370 \end{semiverbatim}
371
372 \end{frame}
373
374 \begin{frame}[fragile]
375 \frametitle{DFT: Project Nayuki Radix-2 Cooley-Tukey DIT (3)}
376
377 \begin{itemize}
378 \item What if the Triple Loop could be done with REMAP?
379 \item Register Offsets j, j+hs, k created automatically?
380 \item Only one actual inner loop instruction (Twin-butterfly)
381 \item 3-in (X0/X1/C) 2-out (X0/X1) allows for in-place FFT
382 \item Hardware FSM (like ZOLC) creates offset triplet\\
383 - Technically not that hard to implement (for Radix-2)\\
384 - Exact same principle as Triple-loop for Matrices
385 \end{itemize}
386
387 \begin{semiverbatim}
388 for j,k,hs in REMAP_TRIPLE_LOOP_GENERATOR():
389 # Twin-Butterfly 3-in 2-out: one instruction
390 C = exptable[k]
391 vec[j+hs], vec[j] = 2B(vec[j+hs], vec[j], C)
392 \end{semiverbatim}
393
394 \end{frame}
395
396 \frame{\frametitle{DCT: pre-arrange (pre-load) data}
397
398 \begin{itemize}
399 \item Arrange input data such that output falls into place
400 \item (another) Twin 3-in 2-out Mul-Add in-place instruction
401 \end{itemize}
402
403 \begin{center}
404 \includegraphics[width=0.70\textwidth]{dct_butterfly.png}
405 \end{center}
406
407 }
408
409
410 \frame{\frametitle{FFT (Complex numbers) and DCT coefficients?}
411
412 \begin{itemize}
413 \item Problem (DCT): DCT Cosine Coefficients change (cos + 0.5)
414 depending on the layer. Cannot do as single instruction
415 \item Problem (FFT): Complex number butterfly multiplication involves
416 4 multiplies. Cannot do in-place as single instruction\vspace{12pt}
417 \item Solution: "Vertical-First" Vectors (Mitch Alsup 66000 ISA)\vspace{12pt}
418 \item Understanding of SVP64 "Vertical-First" 30min video
419 https://youtube.com/watch?v=fn2KJvWyBKg
420 \item Basically involves stepping "vertically" through instructions
421 then ("stepping") to the next offset (REMAP), loop with bc
422 \item Horizontal-first: run through the entire REMAP schedule on a single
423 instruction before repeating looping on the next
424
425 \end{itemize}
426 }
427
428 \frame{\frametitle{Summary}
429
430 \begin{itemize}
431 \item Goal is to create a mass-volume low-power embedded SoC suitable
432 for use in netbooks, chromebooks, tablets, smartphones, IoT SBCs.
433 \item This means a computational focus on 3D and Audio/Video.\\
434 - Critical not to waste 75\% of Power-2 SIMD Lanes on 3x3
435 \item Reducing core work to a one-instruction hot-loop inherently
436 reduces power consumption because the I-Cache is 100\% idle.
437 \item REMAP system completely independent from the instructions it
438 REMAPs. Applies to future scalar ops (GF, Bitmanip)
439 \item Future versions involve proper Zero-Overhead Loop-Control
440 and hidden "tags" to automatically perform CISC-like
441 auto-load/store-and-inc (for much larger data sets)
442 \item Please help contribute: it's your Open Power ISA too.
443
444 \end{itemize}
445 }
446
447
448 \frame{
449 \begin{center}
450 {\Huge The end\vspace{10pt}\\
451 Thank you\vspace{10pt}\\
452 Questions?\vspace{10pt}
453 }
454 \end{center}
455
456 \begin{itemize}
457 \item Discussion: Libre-SOC-ISA mailing list\\
458 http://lists.libre-soc.org/mailman/listinfo/libre-soc-isa
459 \item Libera IRC \#libre-soc
460 \item http://libre-soc.org/
461 \item http://nlnet.nl/PET\\
462 https://www.ngi.eu/ngi-projects/ngi-pointer/
463 \end{itemize}
464 }
465
466
467 \end{document}