conferences/xdc2020.tex

   1 \documentclass[slidestop]{beamer}
   2 \usepackage{beamerthemesplit}
   3 \usepackage{graphicx}
   4 \usepackage{pstricks}
   5
   6 \graphicspath{{./}}
   7
   8 \title{The Libre-SOC Hybrid 3D CPU}
   9 \author{Luke Kenneth Casson Leighton}
  10
  11
  12 \begin{document}
  13
  14 \frame{
  15    \begin{center}
  16     \huge{The Libre-SOC Hybrid 3D CPU}\\
  17     \vspace{32pt}
  18     \Large{Augmenting the OpenPOWER ISA}\\
  19     \Large{to provide 3D and Video instructions}\\
  20     \Large{(properly and officially) and make a GPU}\\
  21     \vspace{24pt}
  22     \Large{XDC2020}\\
  23     \vspace{16pt}
  24     \large{Sponsored by NLnet's PET Programme}\\
  25     \vspace{6pt}
  26     \large{\today}
  27   \end{center}
  28 }
  29
  30
  31 \frame{\frametitle{Why another SoC?}
  32
  33  \begin{itemize}
  34    \item Intel Management Engine, Apple QA issues, Spectre\vspace{6pt}
  35    \item Endless proprietary drivers, ``simplest'' solution: \\
  36          License proprietary hard macros (with proprietary firmware)\\
  37                  Adversely affects product development cost\\
  38                 due to opaque driver bugs (Samsung S3C6410 / S5P100)
  39                  \vspace{6pt}
  40    \item Alternative: Intel and Valve-Steam collaboration\\
  41          ``Most productive business meeting ever!''\\
  42          https://tinyurl.com/valve-steam-intel
  43                 \vspace{6pt}
  44    \item Because for 30 years I Always Wanted To Design A CPU
  45                 \vspace{6pt}
  46    \item Ultimately it is a strategic \textit{business} objective to
  47          develop entirely Libre hardware, firmware and drivers.
  48   \end{itemize}
  49 }
  50
  51
  52 \frame{\frametitle{Why OpenPOWER?}
  53
  54 \vspace{15pt}
  55
  56  \begin{itemize}
  57    \item Good ecosystem essential\\
  58                  linux kernel, u-boot, compilers, OSes,\\
  59                  Reference Implementation(s)\vspace{10pt}
  60    \item Supportive Foundation and Members\\
  61                  need to be able to submit ISA augmentations\\
  62                  (for proper peer review)\vspace{10pt}
  63    \item No NDAs, full transparency must be acceptable\\
  64              due to being funded under NLnet's PET Programme\vspace{10pt}
  65    \item OpenPOWER: established for decades, excellent Foundation,\\
  66              Microwatt as Reference, approachable and friendly.
  67   \end{itemize}
  68 }
  69
  70
  71 \frame{\frametitle{What goes into a typical SoC?}
  72 \vspace{9pt}
  73  \begin{itemize}
  74    \item 15 to 20mm BGA package: 2.5 to 5 watt power consumption\\
  75                 heat sink normally not required (simplifies overall design)
  76                 \vspace{10pt}
  77    \item Fully-integrated peripherals (not Northbridge/Southbridge)\\
  78          USB, HDMI, RGB/TTL, SD/MMC, I2C, UART, SPI, GPIO etc. etc.
  79          \vspace{10pt}
  80    \item Built-in GPU (shared memory bus, 3rd party licensed) \vspace{10pt}
  81    \item Built-in VPU (likewise)\vspace{10pt}
  82    \item Target price between \$2.50 and \$30 depending on market\\
  83          Radically different from IBM POWER9 Core (200 Watt)
  84          \vspace{10pt}
  85   \end{itemize}
  86 }
  87
  88
  89
  90 \frame{\frametitle{Simple SBC-style SoC}
  91
  92 \begin{center}
  93 \includegraphics[width=0.9\textwidth]{shakti_libre_soc.jpg}
  94 \end{center}
  95
  96 }
  97
  98 \frame{\frametitle{What's different about Libre-SOC?}
  99
 100  \begin{itemize}
 101    \item Hybrid - integrated.  The CPU \textit{is} the GPU.\\
 102          The GPU \textit{is} the CPU.  The VPU \textit{is} the CPU.\\
 103          \textit{There is No Separate VPU/GPU Pipeline}\\
 104                   \vspace{9pt}
 105    \item written in nmigen (a python-based HDL).  Not VHDL\\
 106                   not Verilog (definitely not Chisel3/Scala)\\
 107                   This is an extremely important strategic decision.\\
 108                   \vspace{9pt}
 109    \item Simple-V Vector Extension.  See `SIMD Considered harmful'.\\
 110                   https://tinyurl.com/simd-considered-harmful\\
 111                 SV effectively a ``hardware for-loop'' on standard scalar ISA\\
 112                 (conceptually similar to Zero-Overhead Loops in DSPs)
 113                   \vspace{9pt}
 114   \end{itemize}
 115 }
 116
 117 \frame{\frametitle{Hybrid Architecture: Augmented 6600}
 118
 119  \begin{itemize}
 120    \item CDC 6600 is a design from 1965.  The \textit{augmentations} are not.\\
 121                  Help from Mitch Alsup includes \textit{precise exceptions}, \\
 122                  multi-issue and more. Academic literature on 6600 utterly misleading.
 123                 6600 Scoreboards completely underestimated (Seymour Cray and
 124                 James Thornton
 125                 solved problems they didn't realise existed elsewhere!)
 126    \item Front-end Vector ISA, back-end ``Predicated (masked) SIMD''\\
 127          nmigen (python OO) strategically critical to achieving this.
 128    \item Out-of-order combined with Simple-V allows scalar operations\\
 129          at the developer end to be turned into SIMD at the back-end\\
 130          \textit{without the developer needing to do SIMD}
 131    \item IEEE754 sin / cos / atan2, Texturisation opcodes, YUV2RGB\\
 132                  all automatically vectorised.
 133   \end{itemize}
 134 }
 135
 136 \frame{\frametitle{Why nmigen?}
 137
 138  \begin{itemize}
 139    \item Uses python to build an AST (Abstract Syntax Tree).
 140          Actually hands that over to yosys (to create ILANG file)
 141          after which verilog can (if necessary) be created
 142    \item Deterministic synthesisable behaviour (Signals are declared
 143          with their reset pattern: no more forgetting ``if rst'' block).
 144    \item python OO programming techniques can be deployed.  classes
 145          and functions created which pass in parameters which change
 146          what HDL is created (IEEE754 FP16 / 32 / 64 for example)
 147    \item python-based for-loops can e.g. read CSV files then generate
 148          a hierarchical nested suite of HDL Switch / Case statements
 149          (this is how the Libre-SOC PowerISA decoder is implemented)
 150    \item extreme OO abstraction can even be used to create ``dynamic
 151          partitioned Signals'' that have the same operator-overloaded
 152          ``add'', ``subtract'', ``greater-than'' operators
 153
 154   \end{itemize}
 155 }
 156
 157 \frame{\frametitle{Why another Vector ISA? (or: not-exactly another)}
 158
 159  \begin{itemize}
 160    \item Simple-V is a `register tag' system.  \textit{There are no opcodes}\\
 161                  SV `tags' scalar operations (scalar regfiles) as `vectorised'
 162    \item (PowerISA SIMD is around 700 opcodes, making it unlikely to be
 163          able to fit a PowerISA decoder in only one clock cycle)
 164    \item Effectively a `hardware sub-counter for-loop': pauses the PC\\
 165          then rolls incrementally through the operand register numbers\\
 166          issuing \textit{multiple} scalar instructions into the pipelines\\
 167          (hence the reason for a multi-issue OoO microarchitecture)
 168    \item Current \textit{and future} PowerISA scalar opcodes inherently
 169                  \textit{and automatically} become `vectorised' by SV without
 170                  needing an explicit new Vector opcode.
 171    \item Predication and element width polymorphism are also `tags'.
 172          elwidth polymorphism allows for FP16 / 80 / 128 to be added to
 173          the ISA \textit{without modifying the ISA}
 174
 175   \end{itemize}
 176 }
 177
 178 \frame{\frametitle{Quick refresher on SIMD}
 179
 180  \begin{itemize}
 181    \item SIMD very easy to implement (and very seductive)
 182    \item Parallelism is in the ALU
 183    \item Zero-to-Negligible impact for rest of core
 184   \end{itemize}
 185   Where SIMD Goes Wrong:\vspace{6pt}
 186    \begin{itemize}
 187    \item See ``SIMD instructions considered harmful''
 188    https://sigarch.org/simd-instructions-considered-harmful
 189    \item Setup and corner-cases alone are extremely complex.\\
 190          Hardware is easy, but software is hell.\\
 191          strncpy VSX patch for POWER9: 250 hand-written asm lines!\\
 192          (RVV / SimpleV strncpy is 14 instructions)
 193    \item O($N^{6}$) ISA opcode proliferation (1000s of instructions)\\
 194          opcode, elwidth, veclen, src1-src2-dest hi/lo
 195   \end{itemize}
 196 }
 197
 198 \begin{frame}[fragile]
 199 \frametitle{Simple-V ADD in a nutshell}
 200
 201 \begin{semiverbatim}
 202 function op\_add(rd, rs1, rs2, predr) # add not VADD!
 203   int i, id=0, irs1=0, irs2=0;
 204   for (i = 0; i < VL; i++)
 205     if (ireg[predr] & 1<<i) # predication uses intregs
 206        ireg[rd+id] <= ireg[rs1+irs1] + ireg[rs2+irs2];
 207     if (reg\_is\_vectorised[rd] )  \{ id += 1; \}
 208     if (reg\_is\_vectorised[rs1])  \{ irs1 += 1; \}
 209     if (reg\_is\_vectorised[rs2])  \{ irs2 += 1; \}
 210 \end{semiverbatim}
 211
 212   \begin{itemize}
 213    \item Above is oversimplified: Reg. indirection left out (for clarity).
 214    \item SIMD slightly more complex (case above is elwidth = default)
 215    \item Scalar-scalar and scalar-vector and vector-vector now all in one
 216    \item OoO may choose to push ADDs into instr. queue (v. busy!)
 217   \end{itemize}
 218 \end{frame}
 219
 220 \begin{frame}[fragile]
 221 \frametitle{Predication-Branch (overload meaning of ``branch'')}
 222
 223 \begin{semiverbatim}
 224 s1 = reg\_is\_vectorised(src1);
 225 s2 = reg\_is\_vectorised(src2);
 226 if (!s2 && !s1) goto branch;
 227 for (int i = 0; i < VL; ++i)
 228   if (cmp(s1 ? reg[src1+i]:reg[src1],
 229           s2 ? reg[src2+i]:reg[src2]))
 230          ireg[rs3] |= 1<<i;
 231 \end{semiverbatim}
 232
 233   \begin{itemize}
 234    \item Above is oversimplified (case above is elwidth = default)
 235    \item If s1 and s2 both scalars, Standard branch occurs
 236    \item Predication stored in integer regfile as a bitfield
 237    \item Scalar-vector and vector-vector supported
 238    \item Overload Branch immediate to be predication target rs3
 239   \end{itemize}
 240 \end{frame}
 241
 242 \begin{frame}[fragile]
 243 \frametitle{Register element width and packed SIMD}
 244
 245 \begin{semiverbatim}
 246     typedef union \{
 247        uint8\_t  actual\_bytes[8]; // actual SRAM bytes
 248        uint8\_t  b[]; // array of type uint8\_t
 249        uint16\_t s[]; // etc
 250        uint32\_t i[];
 251        uint64\_t l[];
 252     \} reg\_t;
 253
 254     reg\_t int\_regfile[128];
 255 \end{semiverbatim}
 256
 257  \begin{itemize}
 258    \item Regfile is treated (sort-of) as a byte-level SRAM
 259    \item Each ``register'' starts at an 8-byte offset into SRAM
 260    \item requires byte-level ``select'' lines on SRAM
 261   \end{itemize}
 262
 263 \end{frame}
 264
 265 \frame{\frametitle{Register element width and packed SIMD}
 266
 267  \begin{itemize}
 268    \item default: elements behave as defined by the standard ISA
 269    \item override for Integer operations: 8/16/32 bit SIMD
 270    \item override for IEEE754 FP: FP16/FP32 (and later FP80 or FP128)
 271    \item Effectively ``typecasts'' regfile to union of arrays
 272    \item Does not require modification of ISA!  This is ``tagging''\\
 273          (similar to the `Mill' ISA)
 274    \item FPADD64 RT, RA, RB becomes `actually please do FP16'\\
 275          (but without needing to add an actual FPADD16 opcode)
 276    \item Note: no zeroing unless explicitly requested!\\
 277          (unused elements e.g. VL=3 when elwidth=16 are
 278          predicated out: int\_regfile[RA].s[3] is not zero'd)
 279   \end{itemize}
 280
 281 }
 282
 283 \frame{\frametitle{Additional Simple-V features}
 284
 285  \begin{itemize}
 286    \item ``fail-on-first'' (POWER9 VSX strncpy segfaults on boundary!)
 287    \item ``Twin Predication'' (covers VSPLAT, VGATHER, VSCATTER, VINDEX etc.)
 288    \item SVPrefix: 16-bit and 32-bit prefix to scalar operations\\
 289              (SVP-64 allows more extensive ``tag'' augmentation)
 290    \item VBLOCK: a VLIW-like context.  Allows space for `swizzle' tags
 291          and more.  Effectively a ``hardware compression algorithm'' for ISAs.
 292    \item Ultimate goal: cut down I-Cache usage, which cuts down on power
 293    \item Typical GPU has its own I-Cache and small shaders.\\
 294         \textit{We are a Hybrid CPU/GPU: I-Cache is not separate!}
 295    \item Needs to go through OpenPOWER Foundation `approval'
 296   \end{itemize}
 297 }
 298
 299
 300 \frame{\frametitle{Summary}
 301
 302  \begin{itemize}
 303    \item Goal is to create a mass-volume low-power embedded SoC suitable
 304          for use in netbooks, chromebooks, tablets, smartphones, IoT SBCs.
 305    \item No way we could implement a project of this magnitude without
 306          nmigen (being able to use python OO to generate HDL)
 307    \item Collaboration with OpenPOWER Foundation and Members absolutely
 308          essential. No short-cuts.  Standards to be developed and ratified
 309          so that everyone benefits.
 310    \item Working on the back of huge stability of POWER ecosystem
 311    \item Greatly simplified open 3D and Video drivers reduce product
 312          development costs for customers
 313    \item It also happens to be fascinating, deeply rewarding, technically
 314          challenging, and funded by NLnet
 315
 316   \end{itemize}
 317 }
 318
 319
 320 \frame{
 321   \begin{center}
 322     {\Huge The end\vspace{15pt}\\
 323                    Thank you\vspace{15pt}\\
 324                    Questions?\vspace{15pt}
 325         }
 326   \end{center}
 327
 328   \begin{itemize}
 329         \item Discussion: http://lists.libre-soc.org
 330         \item Freenode IRC \#libre-soc
 331         \item http://libre-soc.org/
 332         \item http://nlnet.nl/PET
 333   \end{itemize}
 334 }
 335
 336
 337 \end{document}