\documentclass[slidestop]{beamer}
\usepackage{beamerthemesplit}
\title{The Libre-SOC Hybrid 3D CPU}
\author{Luke Kenneth Casson Leighton}
\huge{The Libre-SOC Hybrid 3D CPU}\\
\Large{Draft SVP64 in-place Matrix Multiply}\\
\Large{and FFT / DCT for the Power ISA}\\
\Large{OpenPOWER Summit 2021}\\
\large{Sponsored by NLnet Grants}\\
\large{and NGI POINTER Grants}\\
\frame{\frametitle{Overview of Libre-SOC goals}
\item To create power-efficient mass-volume products
\vspace{8pt}
\item To leverage the OpenPOWER ecosystem to do so
\vspace{8pt}
\item To be entirely transparent, for Security reasons
\vspace{8pt}
\item To empower businesses to bring Secure, transparent\\
mass-volume products to market
\vspace{8pt}
\item Mass-volume end-user products need 3D, Video, Audio
\textbf{therefore we require small-size Matrices (3x3, but not at
75\% utilisation, and 4x4) and the core strategic parts
of A/V CODECs, and that means DCT and FFT.}
Anything else is a bonus (NTT with Galois Field bitmanip)
\frame{\frametitle{Overview of SVP64 goals}
\item High performance and high performance/watt
\vspace{10pt}
\item Reduced code density (reduced I-Cache usage)\\
https://arxiv.org/abs/2002.10143 - 3.5x power reduction
\vspace{8pt}
\item Remain accessible for assembler writers and compilers alike
\vspace{10pt}
\item Introduce true Vectorisation to the Power ISA\\
(VSX is Packed SIMD)
\vspace{8pt}
\item Be adopted via the external OPF ISA WG RFC process\\
(not: be a non-official custom extension; proprietary\\
custom extensions conflict with mass-volume adoption)
\vspace{10pt}
\begin{frame}[fragile]
\frametitle{Reminder of Simple-V}
https://libre-soc.org/openpower/sv/overview/
Greatly simplified (like the x86 "REP" instruction):

for (i = 0; i < VL; i++)
    GPR[RT+i] <= GPR[RA+i] + GPR[RB+i];

function op\_add(RT, RA, RB, predr) # add, not VADD!
  int i, id=0, irs1=0, irs2=0;
  for (i = 0; i < VL; i++)
    if (GPR[predr] & 1<<i) # predication
      GPR[RT+id] <= GPR[RA+irs1] + GPR[RB+irs2];
    if (reg\_is\_vectorised[RT]) \{ id += 1; \}
    if (reg\_is\_vectorised[RA]) \{ irs1 += 1; \}
    if (reg\_is\_vectorised[RB]) \{ irs2 += 1; \}
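As a cross-check of the pseudocode above, here is a small executable Python sketch of the same element-loop semantics. It is purely illustrative: `GPR`, `VL` and `sv_add` are stand-in names, not the actual Libre-SOC simulator API.

```python
# Executable sketch of the Simple-V element loop above. GPR is a flat
# scalar register file; vector operands step their offset each element
# (scalar operands would repeat); predication masks element writes.
GPR = [0] * 128
VL = 4  # Vector Length

def sv_add(RT, RA, RB, predr, vec=("RT", "RA", "RB")):
    id = irs1 = irs2 = 0
    for i in range(VL):
        if GPR[predr] & (1 << i):        # predication
            GPR[RT + id] = GPR[RA + irs1] + GPR[RB + irs2]
        # offsets advance regardless of the predicate bit
        if "RT" in vec: id += 1
        if "RA" in vec: irs1 += 1
        if "RB" in vec: irs2 += 1

GPR[4] = 0b1011                          # predicate mask in GPR 4
GPR[8:12] = [1, 2, 3, 4]                 # vector A at r8
GPR[16:20] = [10, 20, 30, 40]            # vector B at r16
sv_add(24, 8, 16, 4)                     # result vector at r24
# element 2 is masked out, so GPR[26] stays 0
```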
\begin{frame}[fragile]
\frametitle{SVP64 REMAP system}
Register offsets are "REMAP"ed through a Hardware FSM
https://libre-soc.org/openpower/sv/remap/
remarkably similar to ZOLC
https://www.researchgate.net/publication/224647569

function op\_add(RT, RA, RB, predr) # add, not VADD!
  int i, id=0, irs1=0, irs2=0;
  for (i = 0; i < VL; i++)
    if (GPR[predr] & 1<<i) # predication
      GPR[RT+REMAP(id)] <= GPR[RA+REMAP(irs1)] +
                           GPR[RB+REMAP(irs2)];
    if (reg\_is\_vectorised[RT]) \{ id += 1; \}
    if (reg\_is\_vectorised[RA]) \{ irs1 += 1; \}
    if (reg\_is\_vectorised[RB]) \{ irs2 += 1; \}
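The REMAP indirection can be sketched in a few lines of Python, with plain lists standing in for the Hardware FSM schedules. The names (`sv_add_remap`, the `remaps` tuple) are made up for illustration only.

```python
# Sketch of the REMAP stage: each operand's element offset is passed
# through a per-operand schedule before being added to the base
# register number. Lists stand in for the Hardware FSM.
GPR = [0] * 128
VL = 4

def sv_add_remap(RT, RA, RB, remaps):
    rt_map, ra_map, rb_map = remaps
    for i in range(VL):
        GPR[RT + rt_map[i]] = GPR[RA + ra_map[i]] + GPR[RB + rb_map[i]]

GPR[8:12] = [1, 2, 3, 4]
GPR[16:20] = [10, 20, 30, 40]
identity = [0, 1, 2, 3]
reverse = [3, 2, 1, 0]
# same single scalar add, but the result is written in reverse order
sv_add_remap(24, 8, 16, (reverse, identity, identity))
```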
\begin{frame}[fragile]
\frametitle{Matrix Multiply: Basics}

(a00 a01 a02  x (b00 b01  = (c00 c01
 a10 a11 a12)    b10 b11     c10 c11) = ...
                 b20 b21)

(a00*b00 + a01*b10 + a02*b20   a00*b01 + a01*b11 + a02*b21
 a10*b00 + a11*b10 + a12*b20   a10*b01 + a11*b11 + a12*b21)

(b00 b01  x (a00 a01 a02  = (c00 c01 c02
 b10 b11     a10 a11 a12)    c10 c11 c12
 b20 b21)                    c20 c21 c22) = ...

(b00*a00 + b01*a10   b00*a01 + b01*a11   b00*a02 + b01*a12
 b10*a00 + b11*a10   b10*a01 + b11*a11   b10*a02 + b11*a12
 b20*a00 + b21*a10   b20*a01 + b21*a11   b20*a02 + b21*a12)
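The 2x3 by 3x2 expansion above can be checked numerically with a throwaway Python snippet (values chosen arbitrarily):

```python
# Numeric check of the 2x3 * 3x2 expansion: each C element is the dot
# product of a row of A with a column of B.
A = [[1, 2, 3],
     [4, 5, 6]]
B = [[7,  8],
     [9, 10],
     [11, 12]]
C = [[sum(A[i][v] * B[v][j] for v in range(3)) for j in range(2)]
     for i in range(2)]
# c00 = a00*b00 + a01*b10 + a02*b20 = 1*7 + 2*9 + 3*11 = 58
```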
\begin{frame}[fragile]
\frametitle{Matrix Multiply: naive, with python for-loops}

result = []                     # final result
for i in range(len(A)):
    row = []                    # the new row in new matrix
    for j in range(len(B[0])):
        product = 0             # the new element in the new row
        for v in range(len(A[i])):
            product += A[i][v] * B[v][j]
        row.append(product)     # add sum of product to new row
    result.append(row)          # add new row into final result
\begin{frame}[fragile]
\frametitle{Matrix Multiply: suitable for Hardware scheduling}

Unsuitable: creates massive Read-After-Write chains

for i in range(len(A)):
    for j in range(len(B[0])):
        for v in range(len(A[i])):
            product[i][j] += A[i][v] * B[v][j]

Suitable: can be parallelised / pipelined. RaW avoided

for i in range(len(A)):
    for v in range(len(A[i])):      # swapped
        for j in range(len(B[0])):  # with this
            product[i][j] += A[i][v] * B[v][j]
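Both loop orders compute the same product, as a quick self-contained check confirms; only the accumulation order (and hence the dependency pattern) differs:

```python
def matmul_raw(A, B):
    # inner loop accumulates into one element: long RaW chain
    P = [[0] * len(B[0]) for _ in range(len(A))]
    for i in range(len(A)):
        for j in range(len(B[0])):
            for v in range(len(A[i])):
                P[i][j] += A[i][v] * B[v][j]
    return P

def matmul_sched(A, B):
    # middle/inner loops swapped: inner loop touches distinct elements
    P = [[0] * len(B[0]) for _ in range(len(A))]
    for i in range(len(A)):
        for v in range(len(A[i])):
            for j in range(len(B[0])):
                P[i][j] += A[i][v] * B[v][j]
    return P

A = [[1, 2, 3], [4, 5, 6]]
B = [[7, 8], [9, 10], [11, 12]]
```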
\frame{\frametitle{Matrix Multiply: Generalise but Specialise}
\item Why not make a general-purpose nested "Loop" system?\\
- Other uses (algorithms) beyond Matrix Multiplication\\
- Allow any arbitrary-sized loops\\
- Allow any permutation of nesting\\
- Allow reversing per-dimension
\item Specialise by making Matrix Multiply "setup" quick/easy\\
- two 32-bit instructions to set up A, B, C sizes\\
- one 64-bit SVP64 FMAC instruction (hot-loop).\\
- Nothing else needed. Saves on I-Cache
\item Hardware turns out to be near-identical to ZOLC\\
https://opencores.org/projects/hwlu\\
https://libre-soc.org/openpower/sv/remap/
\item Concept is actually borrowed from the Aspex Array-String Processor
1D/2D/3D Memory DMA "reordering" Engine (except applied to
\begin{frame}[fragile]
\frametitle{Matrix Multiply: unit test / example}

def test_sv_remap2(self):
    lst = ["svshape 5, 4, 3, 0, 0",
           "svremap 0b11111, 1, 2, 3, 0, 0, 0, 0",
           "sv.fmadds 0.v, 8.v, 16.v, 0.v"]

REMAP fmadds FRT, FRA, FRC, FRB
svshape 5, 4, 3, 0, 0 => A: 3x5 B: 3x4
svremap (enable) (F)RS, (F)RT, (F)RA, (F)RB, (F)RC
sv.fmadds: uses fp0 as accumulator
product[i][j] += A[i][v] * B[v][j]
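The idea behind the single hot-loop instruction can be modelled in Python: a generator yields three streams of element offsets, and one scalar multiply-add walks flat (register-file-like) arrays through them. This is an illustrative model only, not the actual SVP64 FSM or its svshape encoding.

```python
# Illustrative model (not the real SVP64 FSM) of a Matrix-Multiply
# REMAP schedule: three streams of element offsets, consumed by a
# single scalar multiply-add.
def matrix_remap(xdim, ydim, zdim):
    """Yield (rt, ra, rb) offsets for C[y][x] += A[y][z] * B[z][x],
    all matrices stored row-major in flat arrays."""
    for y in range(ydim):
        for z in range(zdim):
            for x in range(xdim):
                yield (y * xdim + x,    # FRT offset into C
                       y * zdim + z,    # FRA offset into A
                       z * xdim + x)    # FRB offset into B

A = [1, 2,
     3, 4]            # 2x2 row-major
B = [5, 6,
     7, 8]
C = [0] * 4
for rt, ra, rb in matrix_remap(2, 2, 2):
    C[rt] += A[ra] * B[rb]   # the single "fmadds" of the hot-loop
```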
\frame{\frametitle{Matrix Multiply: Ehm, that's all, Folks}
\item Really is that straightforward: no actual Vector ops\\
- Does not dictate or limit micro-architectural detail\\
- Issues Scalar FMACs into existing back-end hardware\\
- Can use any 4-operand instruction (GF, INT, Bitmanip)\\
- No Power-of-2 limits. Any operand width (8/16/32/64)
\vspace{8pt}
\item Limited to 127 scalar ops and in-place registers. Future?\\
- https://arxiv.org/abs/2002.10143 CISC-like load-and-inc\\
- Auto-load/store (tagged) registers, keeps RISC ISA\\
- Extend to memory-based arbitrary NxN matrix sizes\\
- Still power-efficient: no I-Cache usage during FMAC issue
\vspace{8pt}
\item Future can be investigated as part of EUR 22.6m EU Grant\\
https://libre-soc.org/SEP-210803722-Libre-SOC-8-core/
\vspace{15pt}
\frame{\frametitle{DCT / FFT / DFT / NTT: what if we could REMAP?}
\item Can we create a REMAP Schedule for FFT (etc)? YES\\
- More complicated than Matrix Schedules, but same principle\\
- Again: issues Scalar instructions into back-end micro-arch\\
- Requires 5-operand (3-in, 2-out) new Scalar Instructions\\
- Any operand width (8/16/32/64)
\vspace{8pt}
\item Limited to in-place registers and Power-of-Two. Future?\\
- Again: CISC-like auto-load/store-and-increment\\
- https://arxiv.org/abs/2002.10143\\
- Again: still power-efficient (no I-Cache usage in loops)
\vspace{8pt}
\item Again: can be investigated as part of EUR 22.6m EU Grant\\
https://libre-soc.org/SEP-210803722-Libre-SOC-8-core/
\vspace{15pt}
\frame{\frametitle{DCT / FFT / DFT / NTT: other implementations?}
\item Texas Instruments TMS320 and C6xx DSPs (VLIW)\\
- 14 u-Ops per VLIW (including Zero-Overhead Looping)\\
- Performs Odd/Even FP32 looping single-instruction FFT\\
- Cannot do anything other than FP32\\
- Otherwise absolutely brilliant and elegant (20+ years)
\item Qualcomm Hexagon DSP\\
- Again: VLIW (29 RISC-like u-Ops in 1 cycle)\\
- Seriously power-efficient and effective\\
- Has ZOLC and Complex-number Multiply\\
- Only seems to handle the inner loop of FFT though
\item SVP64 REMAP, by contrast:\\
- not limited to inner loop (handles entire triple-loop)\\
- like Hexagon, not limited to type of operation inside loop\\
- Complex-number ops: a bit too heavy-duty for now (later?)
\frame{\frametitle{Discrete Cosine Transform (DCT): Basics}
\item Standard DCT Schedule (messy, impossible for SIMD)
\item Output is in bit-reversed order\\
0b000 = 0b000 (in: 0 out: 0)\\
0b001 = 0b100 (in: 1 out: 4) ...\\
0b110 = 0b011 (in: 6 out: 3)\\
0b111 = 0b111 (in: 7 out: 7)

\includegraphics[width=0.75\textwidth]{plonka_dct1.png}
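The in/out table above is plain bit-reversal of the 3-bit index, which a few lines of Python reproduce:

```python
# Bit-reversal of an nbits-wide index: MSB and LSB swap, etc.
def reverse_bits(x, nbits):
    r = 0
    for _ in range(nbits):
        r = (r << 1) | (x & 1)
        x >>= 1
    return r

table = [(i, reverse_bits(i, 3)) for i in range(8)]
# (1, 4) and (6, 3) match the 0b001->0b100 and 0b110->0b011 rows
```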
\frame{\frametitle{Fast Fourier Transform (FFT/DFT): Butterfly Basics}
\item Standard Butterfly Schedule (again: messy, but less so)
\item Output, again, is in bit-reversed order

\includegraphics[width=0.70\textwidth]{fft_butterfly.png}
\frame{\frametitle{FFT: 3-in, 2-out butterfly}
\item One multiply (by coefficient), one add, one subtract
\item inputs: X[0] X[1] C(oeff); outputs: X[0] X[1]

\includegraphics[width=0.90\textwidth]{2-point-dft.png}
\begin{frame}[fragile]
\frametitle{DFT: Project Nayuki Radix-2 Cooley-Tukey DIT (1)}

coef = (2 if inverse else -2) * cmath.pi / n
exptable = [cmath.rect(1, i*coef) for i in range(n // 2)]
vec = [vec[reverse_bits(i, levels)] for i in range(n)]

    halfsize, tablestep = size // 2, n // size
    for i in range(0, n, size):
        for j in range(i, i + halfsize):
            temp = vec[j + halfsize] * exptable[k]
            vec[j + halfsize] = vec[j] - temp
\begin{frame}[fragile]
\frametitle{DFT: Project Nayuki Radix-2 Cooley-Tukey DIT (2)}

coef = (2 if inverse else -2) * cmath.pi / n
exptable = [cmath.rect(1, i*coef) for i in range(n // 2)]
vec = [vec[reverse_bits(i, levels)] for i in range(n)]

    hs, tablestep = size // 2, n // size
    for i in range(0, n, size):
        for j in range(i, i+hs):
            # Twin-Butterfly 3-in 2-out: one instruction
            vec[j+hs], vec[j] = 2B(vec[j+hs], vec[j], C)
\begin{frame}[fragile]
\frametitle{DFT: Project Nayuki Radix-2 Cooley-Tukey DIT (3)}
\item What if the Triple Loop could be done with REMAP?
\item Register Offsets j, j+hs, k created automatically?
\item Only one actual inner-loop instruction (Twin-butterfly)
\item 3-in (X0/X1/C) 2-out (X0/X1) allows for in-place FFT
\item Hardware FSM (like ZOLC) creates the offset triplet\\
- Technically not that hard to implement (for Radix-2)\\
- Exact same principle as the Triple-loop for Matrices

for j,k,hs in REMAP_TRIPLE_LOOP_GENERATOR():
    # Twin-Butterfly 3-in 2-out: one instruction
    vec[j+hs], vec[j] = 2B(vec[j+hs], vec[j], C)
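A full Python model of this idea follows Project Nayuki's radix-2 DIT loop structure: a generator stands in for REMAP\_TRIPLE\_LOOP\_GENERATOR, and the body is a single 3-in 2-out twin butterfly. The function names (`triple_loop`, `twin_butterfly`) are illustrative, not actual SVP64 mnemonics.

```python
# Model of the triple-loop REMAP schedule for an in-place radix-2
# DIT FFT, verified on simple inputs below.
import cmath

def reverse_bits(x, nbits):
    r = 0
    for _ in range(nbits):
        r = (r << 1) | (x & 1)
        x >>= 1
    return r

def triple_loop(n):
    """Yield (j, k, hs) offset triplets: the entire triple loop."""
    size = 2
    while size <= n:
        hs, tablestep = size // 2, n // size
        for i in range(0, n, size):
            for m, j in enumerate(range(i, i + hs)):
                yield j, m * tablestep, hs
        size *= 2

def twin_butterfly(x1, x0, c):
    """3-in (X1, X0, C) 2-out: one 'instruction'."""
    t = x1 * c
    return x0 - t, x0 + t

def fft(vec):
    n = len(vec)
    levels = n.bit_length() - 1
    exptable = [cmath.rect(1, -2 * cmath.pi * i / n) for i in range(n // 2)]
    vec = [vec[reverse_bits(i, levels)] for i in range(n)]
    for j, k, hs in triple_loop(n):
        vec[j + hs], vec[j] = twin_butterfly(vec[j + hs], vec[j],
                                             exptable[k])
    return vec
```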
\frame{\frametitle{DCT: pre-arrange (pre-load) data}
\item Arrange input data such that output falls into place
\item (another) Twin 3-in 2-out Mul-Add in-place instruction

\includegraphics[width=0.70\textwidth]{dct_butterfly.png}
\frame{\frametitle{FFT (Complex numbers) and DCT coefficients?}
\item Problem (DCT): DCT Cosine Coefficients change (cos + 0.5)
depending on the layer. Cannot be done as a single instruction
\item Problem (FFT): Complex-number butterfly multiplication involves
4 multiplies. Cannot be done in-place as a single instruction
\vspace{12pt}
\item Solution: "Vertical-First" Vectors (Mitch Alsup 66000 ISA)
\vspace{12pt}
\item Understanding of SVP64 "Vertical-First": 30min video\\
https://youtube.com/watch?v=fn2KJvWyBKg
\item Basically involves stepping "vertically" through instructions,
then stepping to the next offset (REMAP); loop with bc
\item Horizontal-first: run through the entire REMAP schedule on a single
instruction before looping on to the next
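The execution-order difference can be sketched as a toy Python model: `insns` are stand-in callables and `schedule` the REMAP element offsets. This illustrates only the ordering, not actual SVP64 semantics.

```python
# Toy model of Horizontal-first vs Vertical-first execution order.
def horizontal_first(insns, schedule):
    # each instruction runs through the whole schedule, then the next
    for insn in insns:
        for offset in schedule:
            insn(offset)

def vertical_first(insns, schedule):
    # all instructions execute at one offset, then the offset steps
    # (the loop-with-bc on the slide)
    for offset in schedule:
        for insn in insns:
            insn(offset)

trace = []
a = lambda o: trace.append(("a", o))
b = lambda o: trace.append(("b", o))
vertical_first([a, b], [0, 1])
# trace is now [("a", 0), ("b", 0), ("a", 1), ("b", 1)]
```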
\frame{\frametitle{Summary}
\item Goal is to create a mass-volume low-power embedded SoC suitable
for use in netbooks, chromebooks, tablets, smartphones, IoT SBCs.
\item This means a computational focus on 3D and Audio/Video.\\
- Critical not to waste 75\% of Power-of-2 SIMD Lanes on 3x3
\item Reducing core work to a one-instruction hot-loop inherently
reduces power consumption because the I-Cache is 100\% idle.
\item REMAP system is completely independent from the instructions it
REMAPs. Applies to future scalar ops (GF, Bitmanip)
\item Future versions involve proper Zero-Overhead Loop-Control
and hidden "tags" to automatically perform CISC-like
auto-load/store-and-inc (for much larger data sets)
\item Please help contribute: it's your Open Power ISA too.
{\Huge The end
\vspace{10pt}\\
Thank you
\vspace{10pt}\\
Questions?
\vspace{10pt}
\item Discussion: Libre-SOC-ISA mailing list\\
http://lists.libre-soc.org/mailman/listinfo/libre-soc-isa
\item Libera IRC \#libre-soc
\item http://libre-soc.org/
\item http://nlnet.nl/PET\\
https://www.ngi.eu/ngi-projects/ngi-pointer/