From: Andrey Miroshnikov Date: Fri, 22 Jul 2022 15:48:23 +0000 (+0100) Subject: Added primer to spec doc, changed img links in sv/overview.mdwn X-Git-Tag: opf_rfc_ls005_v1~1130 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=ae91372d29ea526ac117f98466ed7fc22e51630c;p=libreriscv.git Added primer to spec doc, changed img links in sv/overview.mdwn --- diff --git a/openpower/simple_v_spec.tex b/openpower/simple_v_spec.tex index 67edff8b7..e685b0de0 100644 --- a/openpower/simple_v_spec.tex +++ b/openpower/simple_v_spec.tex @@ -106,11 +106,19 @@ table}}{} \def\fps@figure{htbp} \makeatother +% graphics path for primer +\graphicspath{ {svp64-primer/img/} } \date{} \begin{document} +\part{Scalable Vectors Primer} +\chapter{Executive Summary} +\include{svp64-primer/summary} +\bibliography{svp64-primer/references} +\bibliographystyle{ieeetr} + \chapter*{Preamble} \addcontentsline{toc}{chapter}{Preamble} \markboth{INTRODUCTION}{} diff --git a/openpower/sv/overview.mdwn b/openpower/sv/overview.mdwn index d019afe53..f4dab4b36 100644 --- a/openpower/sv/overview.mdwn +++ b/openpower/sv/overview.mdwn @@ -77,7 +77,7 @@ a register file size increase using "tagging" (similar to how x86 originally extended registers from 32 to 64 bit). -![Single-Issue concept](/svp64-primer/img/power_pipelines.svg) +![Single-Issue concept](/openpower/svp64-primer/img/power_pipelines.svg) ## SV @@ -95,7 +95,7 @@ The fundamentals are (just like x86 "REP"): * Once the loop is completed *only then* is the Program Counter allowed to move to the next instruction. -![Multi-Issue with Predicated SIMD back-end ALUs](/svp64-primer/img/sv_multi_issue.svg) +![Multi-Issue with Predicated SIMD back-end ALUs](/openpower/svp64-primer/img/sv_multi_issue.svg) Hardware (and simulator) implementors are free and clear to implement this as literally a for-loop, sitting in between instruction decode and issue. 
@@ -383,7 +383,7 @@ This means that Vector elements start from locations specified by 64 bit "register" but that from that location onwards the elements *overlap subsequent registers*. -![image](/svp64-primer/img/svp64_regs.svg){ width=40% } +![image](/openpower/svp64-primer/img/svp64_regs.svg){ width=40% } Here is another way to view the same concept, bearing in mind that it is assumed a LE memory order: diff --git a/openpower/svp64-primer/acronyms.tex b/openpower/svp64-primer/acronyms.tex new file mode 100644 index 000000000..99d848554 --- /dev/null +++ b/openpower/svp64-primer/acronyms.tex @@ -0,0 +1,21 @@ +\section{List of Acronyms} +\begin{acronym} + \acro{ASIC}{Application Specific Integrated Circuit} + \acro{AVX-512}{Intel Advanced Vector Extensions 512-bit} + \acro{CPU}{Central Processing Unit} + \acro{DCT}{Discrete Cosine Transform} + \acro{DSP}{Digital Signal Processors} + \acro{DAXPY}{Double-Precision aX Plus Y ($aX+Y$)} + \acro{FFT}{Fast Fourier Transform} + \acro{IA-32}{Intel Architecture 32-bit or i386} + \acro{ISA}{Instruction Set Architecture} + \acro{MMX}{Intel's first SIMD implementation} + \acro{RVV}{RISC-V Vector extension} + \acro{SIMD}{Single Instruction Multiple Data} + \acro{SWAR}{SIMD Within A Register (see Flynn's Taxonomy)} + \acro{SV}{(Scalable) Simple Vectorisation or Simple-V} + \acro{SVE2}{ARM Scalable Vector Extension version two} + \acro{SVP64}{Simple-V with Prefixing of Power ISA, 64-bits in length} + \acro{VLIW}{Very Long Instruction Word} + \acro{VSX}{128-bit Packed SIMD Extension to the Power ISA} +\end{acronym} diff --git a/openpower/svp64-primer/img/cray_vector_regs.png b/openpower/svp64-primer/img/cray_vector_regs.png new file mode 100644 index 000000000..b7e4f8a02 Binary files /dev/null and b/openpower/svp64-primer/img/cray_vector_regs.png differ diff --git a/openpower/svp64-primer/img/cray_vector_regs.svg b/openpower/svp64-primer/img/cray_vector_regs.svg new file mode 100644 index 000000000..b84bdc0dc --- /dev/null +++ 
b/openpower/svp64-primer/img/cray_vector_regs.svg @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + 0 + 1 + 2 + 3 + 4 + + 0 + + 1 + 2 + 3 + 4 + 5 + 63 + + + + + + + ... + 6 + 7 + element numbers + Cray 1 + setvl 5vadd r0,r1,r2 + registernumbers + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/openpower/svp64-primer/img/power_pipelines.png b/openpower/svp64-primer/img/power_pipelines.png new file mode 100644 index 000000000..98afc6b8b Binary files /dev/null and b/openpower/svp64-primer/img/power_pipelines.png differ diff --git a/openpower/svp64-primer/img/power_pipelines.svg b/openpower/svp64-primer/img/power_pipelines.svg new file mode 100644 index 000000000..fbbd63de2 --- /dev/null +++ b/openpower/svp64-primer/img/power_pipelines.svg @@ -0,0 +1,669 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + FetchPC + + Decode + + + Issue + + Execute + Scalar + + FetchPC + + Decode + + Issue + + Execute + + Loop0...VL-1 + + SV + PredicateMask bits + + + + + + + + diff --git a/openpower/svp64-primer/img/simd_axb.png b/openpower/svp64-primer/img/simd_axb.png new file mode 100644 index 000000000..52d32a47a Binary files /dev/null and b/openpower/svp64-primer/img/simd_axb.png differ diff --git a/openpower/svp64-primer/img/simd_axb.svg b/openpower/svp64-primer/img/simd_axb.svg new file mode 100644 index 000000000..9768e03d6 --- /dev/null +++ b/openpower/svp64-primer/img/simd_axb.svg @@ -0,0 +1,352 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + A2 + A3 + A4 + + + + + + + A1 + B1 + A1xB1 + B2 + B3 + B4 + A2xB2 + A3xB3 + A4xB4 + + diff --git a/openpower/svp64-primer/img/sv_multi_issue.svg b/openpower/svp64-primer/img/sv_multi_issue.svg new file mode 100644 
index 000000000..7d7d4359e --- /dev/null +++ b/openpower/svp64-primer/img/sv_multi_issue.svg @@ -0,0 +1,1002 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + FetchPC + + Decode + + + Issue + + Execute + + + + Multi-Issue + + Issue + + Execute + + + + + + + SVMulti-Issue + Predicate Mask bitsPassed to ALUs + + + FetchPC + + + + Decode + + + + + + Loop 01234567 + + + + + + + + IssuePredicate1011 + IssuePredicate0110 + + + + + + + + ExecuteMaskedExecuteExecute + MaskedExecuteExecuteMasked + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/openpower/svp64-primer/img/svp64_regs.png b/openpower/svp64-primer/img/svp64_regs.png new file mode 100644 index 000000000..605cefac0 Binary files /dev/null and b/openpower/svp64-primer/img/svp64_regs.png differ diff --git a/openpower/svp64-primer/img/svp64_regs.svg b/openpower/svp64-primer/img/svp64_regs.svg new file mode 100644 index 000000000..1f18d7f2a --- /dev/null +++ b/openpower/svp64-primer/img/svp64_regs.svg @@ -0,0 +1,927 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + r0 + r1 + r2 + r3 + r4 + byte-level write-enable lines + + + + + + + + + + + + + + + + + + + + + + + + 64-bit wide Scalar registers(not Vector Registers) + setvl 5 + sv.addi 0,0,1 + SV + + r40 + + + + + + + + + + + + + + + + + + + + + + + + r41 + r42 + sv.addi/ew=32 40,40,1 + + + + + + + + + + + + + + + + + + r80 + r81 + r127 + Vector length 564-bit elementspacked into scalarGPRs r0-r4 + Vector length 532-bit elementspacked into GPRsr40-r41and lower half of r42. Upper half ofr42 UNMODIFIED + Vector length 516-bit elementspacked into GPRs r80and lowest half-wordof r81. 
Remainderof r81 UNMODIFIED + sv.addi/ew=16 80,80,1 + GPR/FPR extendedfrom 32 to 128 + + + + + + + + + + + + + + + + diff --git a/openpower/svp64-primer/img/vl_reg_n.jpg b/openpower/svp64-primer/img/vl_reg_n.jpg new file mode 100644 index 000000000..6f239bb89 Binary files /dev/null and b/openpower/svp64-primer/img/vl_reg_n.jpg differ diff --git a/openpower/svp64-primer/references.bib b/openpower/svp64-primer/references.bib new file mode 100644 index 000000000..6245be44d --- /dev/null +++ b/openpower/svp64-primer/references.bib @@ -0,0 +1,90 @@ +@online{SIMD_HARM, + ALTauthor = {David Patterson, Andrew Waterman}, + ALTeditor = {editor}, + title = {SIMD Instructions Considered Harmful}, + date = {18-09-2017}, + howpublished={\par\url{https://www.sigarch.org/simd-instructions-considered-harmful/}}, + OPTsubtitle = {subtitle}, + OPTtitleaddon = {titleaddon}, + OPTlanguage = {English}, + OPTversion = {version}, + OPTnote = {note}, + OPTorganization = {organization}, + OPTdate = {date}, + OPTmonth = {month}, + OPTyear = {year}, + OPTaddendum = {addendum}, + OPTpubstate = {pubstate}, + OPTurldate = {16-06-2022}, +} + +@online{SIMD_HPC, + ALTauthor = {João M.P.Cardoso, José Gabriel F.Coutinho, Pedro C.Diniz}, + ALTeditor = {editor}, + title = {High-performance embedded computing}, + date = {2017}, + howpublished={\par\url{https://www.sciencedirect.com/topics/computer-science/single-instruction-multiple-data}}, + OPTsubtitle = {subtitle}, + OPTtitleaddon = {titleaddon}, + OPTlanguage = {English}, + OPTversion = {version}, + OPTnote = {note}, + OPTorganization = {organization}, + OPTdate = {date}, + OPTmonth = {month}, + OPTyear = {year}, + OPTaddendum = {addendum}, + OPTpubstate = {pubstate}, + OPTurldate = {urldate}, +} + +@online{SIMD_WASM, + ALTauthor = {Nick Lewycky}, + ALTeditor = {editor}, + title = {WebAssembly and SIMD}, + date = {31-07-2019}, + howpublished={\par\url{https://medium.com/wasmer/webassembly-and-simd-13badb9bf1a8}}, + OPTsubtitle = {subtitle}, + 
OPTtitleaddon = {titleaddon}, + OPTlanguage = {language}, + OPTversion = {version}, + OPTnote = {note}, + OPTorganization = {organization}, + OPTdate = {date}, + OPTmonth = {month}, + OPTyear = {year}, + OPTaddendum = {addendum}, + OPTpubstate = {pubstate}, + OPTurldate = {urldate}, +} + +@online{riscv-v-spec, + ALTauthor = {author}, + ALTeditor = {editor}, + title = {RISC-V "V" Vector Extension}, + date = {date}, + howpublished={\par\url{https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc}}, + OPTsubtitle = {subtitle}, + OPTtitleaddon = {titleaddon}, + OPTlanguage = {language}, + OPTedition = {edition}, + OPTtype = {type}, + OPTseries = {series}, + OPTnumber = {number}, + OPTversion = {version}, + OPTnote = {note}, + OPTorganization = {organization}, + OPTpublisher = {publisher}, + OPTlocation = {location}, + OPTisbn = {isbn}, + OPTchapter = {2}, + OPTpages = {8}, + OPTpagetotal = {111}, + OPTaddendum = {addendum}, + OPTpubstate = {pubstate}, + OPTdoi = {doi}, + OPTeprint = {eprint}, + OPTeprintclass = {eprintclass}, + OPTeprinttype = {eprinttype}, + OPTurldate = {20-09-2021}, +} diff --git a/openpower/svp64-primer/summary.tex b/openpower/svp64-primer/summary.tex new file mode 100644 index 000000000..318125d8b --- /dev/null +++ b/openpower/svp64-primer/summary.tex @@ -0,0 +1,206 @@ +\section{Summary} +The proposed \acs{SV} is a Scalable Vector Specification for a hardware for-loop \textbf{that +ONLY uses scalar instructions}. + +\begin{itemize} +\itemsep 0em +\item The Power \acs{ISA} v3.1 Spec is not altered. + v3.1 Code-compatibility is guaranteed. +\item Does not require sacrificing 32-bit Major Opcodes. +\item Does not require adding duplicates of instructions + (popcnt, popcntw, popcntd, vpopcntb, vpopcnth, vpopcntw, vpopcntd) +\item Fully abstracted: does not create Micro-architectural dependencies + (no fixed "Lane" size), one binary works across all existing + \textit{and future} implementations. 
+\item Specifically designed to be easily implemented + on top of an existing Micro-architecture (especially + Superscalar Out-of-Order Multi-issue) without + disruptive full architectural redesigns. +\item Divided into Compliancy Levels to suit differing needs. +\item At the highest Compliancy Level only requires five instructions + (SVE2 requires appx 9,000. \acs{AVX-512} around 10,000. \acs{RVV} around + 300). +\item Predication, often-requested, is added cleanly + (without modifying the v3.1 Power ISA) +\item In-registers arbitrary-sized Matrix Multiply is achieved in three + instructions (without adding any v3.1 Power ISA instructions) +\item Full \acs{DCT} and \acs{FFT} RADIX2 Triple-loops are achieved with + dramatically reduced instruction count, and power consumption expected + to greatly reduce. Normally found only in high-end \acs{VLIW} \acs{DSP} + (TI MSP, Qualcomm Hexagon) +\item Fail-First Load/Store allows Vectorised high performance + strncpy to be implemented in around 14 + instructions (hand-optimised \acs{VSX} assembler is 240). +\item Inner loop of MP3 implemented in under 100 instructions + (gcc produces 450 for the same function on POWER9). +\end{itemize} + +All areas investigated so far consistently showed reductions in executable +size, which as outlined in \cite{SIMD_HARM} has an indirect reduction in +power consumption due to less I-Cache/TLB pressure and also Issue remaining +idle for long periods. +Simple-V has been specifically and carefully crafted to respect +the Power ISA's Supercomputing pedigree. + +\begin{figure}[hb] + \centering + \includegraphics[width=0.6\linewidth]{power_pipelines.png} + \caption{Showing how SV fits in between Decode and Issue} + \label{fig:power_pipelines} +\end{figure} + +\pagebreak + +\subsection{What is SIMD?} + +\acs{SIMD} is a way of partitioning existing \acs{CPU} +registers of 64-bit length into smaller 8-, 16-, 32-bit pieces. 
+\cite{SIMD_HARM}\cite{SIMD_HPC} +These partitions can then be operated on simultaneously, with the initial values +and results being stored as entire 64-bit registers (\acs{SWAR}). +The SIMD instruction opcode +includes the data width and the operation to perform. +\par + +\begin{figure}[hb] + \centering + \includegraphics[width=0.6\linewidth]{simd_axb.png} + \caption{SIMD multiplication} + \label{fig:simd_axb} +\end{figure} + +This method can have a huge advantage for rapid processing of +vector-type data (image/video, physics simulations, cryptography, +etc.), +\cite{SIMD_WASM}, + and thus on paper is very attractive compared to +scalar-only instructions. +\textit{As long as the data width fits the workload, everything is fine}. +\par + +\subsection{Shortfalls of SIMD} +SIMD registers are of a fixed length and thus to achieve greater +performance, CPU architects typically increase the width of registers +(to 128-, 256-, 512-bit etc) for more partitions.\par Additionally, +binary compatibility is an important feature, and thus each doubling +of SIMD registers also expands the instruction set. The number of +instructions quickly balloons and this can be seen in for example +IA-32 expanding from 80 to about 1400 instructions since +the 1970s\cite{SIMD_HARM}.\par + +Five digit Opcode proliferation (10,000 instructions) is overwhelming. +The following are just some of the reasons why SIMD is unsustainable as +the number of instructions increases: +\begin{itemize} + \itemsep 0em + \item Hardware design, ASIC routing etc. + \item Compiler design + \item Documentation of the ISA + \item Manual coding and optimisation + \item Time to support the platform + \item Compliance Suite development and testing + \item Protracted Variable-Length encoding (x86) severely compromises + Multi-issue decoding +\end{itemize} + +\subsection{Scalable Vector Architectures} +An older alternative exists to utilise data parallelism - vector +architectures.
Vector CPUs collect operands from the main memory, and +store them in large, sequential vector registers.\par + +A simple vector processor might operate on one element at a time, +however as the element operations are usually independent, +a processor could be made to compute all of the vector's +elements simultaneously, taking advantage of multiple pipelines.\par + +Typically, today's vector processors can execute two, four, or eight +64-bit elements per clock cycle +\cite{SIMD_HARM}. +Vector ISAs are specifically designed to deal with (in hardware) fringe +cases where an algorithm's element count is not a multiple of the +underlying hardware "Lane" width. The element data width +is variable (8 to 64-bit just like in SIMD) +but it is the \textit{number} of elements being +variable under control of a "setvl" instruction that specifically +makes Vector ISAs "Scalable". +\par + +\acs{RVV} supports a VL of up to $2^{16}$ or $65536$ bits, +which can fit 1024 64-bit words +\cite{riscv-v-spec}. +The Cray-1 had 8 Vector Registers with up to 64 elements (64-bit each). +An early Draft of RVV supported overlaying the Vector Registers onto the +Floating Point registers, similar to \acs{MMX}. + +\begin{figure}[ht] + \centering + \includegraphics[width=0.6\linewidth]{cray_vector_regs.png} + \caption{Cray Vector registers: 8 registers, 64 elements each} + \label{fig:cray_vector_regs} +\end{figure} + +Simple-V's "Vector" Registers (a misnomer) are specifically designed to fit +on top of +the Scalar (GPR, FPR) register files, which are extended from the default +of 32, to 128 entries in the high-end Compliancy Levels. This is a primary +reason why Simple-V can be added on top of an existing Scalar ISA, and +\textit{in particular} why there is no need to add explicit Vector +Registers or +Vector instructions.
The diagram below shows \textit{conceptually} +how a Vector's elements are sequentially and linearly mapped onto the +\textit{Scalar} register file: + +\begin{figure}[ht] + \centering + \includegraphics[width=0.6\linewidth]{svp64_regs.png} + \caption{three instructions, same vector length, different element widths} + \label{fig:svp64_regs} +\end{figure} + +\pagebreak + +\subsection{Simple Vectorisation} +\acs{SV} is a Scalable Vector ISA designed for hybrid workloads (CPU, GPU, +VPU, 3D?). Includes features normally found only on Cray-style Supercomputers +(Cray-1, NEC SX-Aurora) and GPUs. Keeps to a strict uniform RISC paradigm, +leveraging a scalar ISA by using "Prefixing". +\textbf{No dedicated vector opcodes exist in SV, at all}. +SVP64 uses 25\% of the Power ISA v3.1 64-bit Prefix space (EXT001) to create +the SV Vectorisation Context for the 32-bit Scalar Suffix. + +\vspace{10pt} +Main design principles +\begin{itemize} + \itemsep 0em + \item Introduce by implementing on top of existing Power ISA + \item Effectively a \textbf{hardware for-loop}, pauses main PC, + issues multiple scalar operations + \item Strictly preserves (leverages) underlying scalar execution + dependencies as if + the for-loop had been expanded into actual scalar instructions + ("preserving Program Order") + \item Augments existing instructions by adding "tags" - provides + Vectorisation "context" rather than adding new opcodes. + \item Does not modify or deviate from the underlying scalar + Power ISA unless there's a significant performance boost or other + advantage in the vector space + \item Aimed at Supercomputing: avoids creating significant + \textit{sequential dependency hazards}, allowing \textbf{high + performance multi-issue superscalar microarchitectures} to be + leveraged. +\end{itemize} + +Advantages include: +\begin{itemize} + \itemsep 0em + \item Easy to create first (and sometimes only) implementation + as a literal for-loop in hardware, simulators, and compilers. 
+ \item Obliterates SIMD opcode proliferation + ($O(N^6)$) as well as dedicated Vectorisation + ISAs. No more separate vector instructions. + \item Reducing maintenance overhead (no separate Vector instructions). + Adding any new Scalar instruction + \textit{automatically adds a Vectorised version of the same}. + \item Easier for compilers, coders, documentation +\end{itemize} + diff --git a/openpower/svp64-primer/svp64-primer.tex b/openpower/svp64-primer/svp64-primer.tex new file mode 100644 index 000000000..7ca8023ba --- /dev/null +++ b/openpower/svp64-primer/svp64-primer.tex @@ -0,0 +1,32 @@ +\documentclass[a4paper, 10pt]{article} +\usepackage[utf8]{inputenc} +\usepackage[firstpage]{draftwatermark} +\usepackage[printonlyused,withpage]{acronym} +\usepackage{graphicx} +\usepackage{float} +\usepackage{url} +\usepackage[margin=1.1in]{geometry} +\graphicspath{ {./img/} } + +\title{(DRAFT) SVP64 Primer} + +\author{Andrey Miroshnikov, Luke Kenneth Casson Leighton} + +\SetWatermarkLightness{0.5} +\SetWatermarkScale{4} +%\SetWatermarkText{DRAFT!} + +\begin{document} +\maketitle + + +\input{acronyms} +\pagebreak +\input{summary} +%\input{...} + +\section{References} +%\textit{(All references and sources are available on request)} +\bibliography{references} +\bibliographystyle{ieeetr} +\end{document} diff --git a/openpower/svp64-primer/svp64-proposal.tex b/openpower/svp64-primer/svp64-proposal.tex new file mode 100644 index 000000000..eb714dbf7 --- /dev/null +++ b/openpower/svp64-primer/svp64-proposal.tex @@ -0,0 +1,33 @@ +\documentclass[a4paper, 10pt]{article} +\usepackage[utf8]{inputenc} +\usepackage[firstpage]{draftwatermark} +\usepackage[printonlyused,withpage]{acronym} +\usepackage{graphicx} +\usepackage{float} +\usepackage{url} +\usepackage[margin=1.1in]{geometry} +\graphicspath{ {./img/} } + +\title{(DRAFT) SVP64 Primer} + +\author{Luke Kenneth Casson Leighton, Andrey Miroshnikov} + +\SetWatermarkLightness{0.5} +\SetWatermarkScale{4} +%\SetWatermarkText{DRAFT!} + 
+\begin{document} +\maketitle + + +\input{acronyms} +\pagebreak +%\input{summary} +\section{Summary} +Simple Vectorisation requires considering the ISA as a 2-dimensional concept, with the instructions comprising of one (vertical), and the register file as another (horizontal). + +\section{References} +%\textit{(All references and sources are available on request)} +\bibliography{references} +\bibliographystyle{ieeetr} +\end{document} diff --git a/svp64-primer/acronyms.tex b/svp64-primer/acronyms.tex deleted file mode 100644 index 99d848554..000000000 --- a/svp64-primer/acronyms.tex +++ /dev/null @@ -1,21 +0,0 @@ -\section{List of Acronyms} -\begin{acronym} - \acro{ASIC}{Application Specific Integrated Circuit} - \acro{AVX-512}{Intel Advanced Vector Extensions 512-bit} - \acro{CPU}{Central Processing Unit} - \acro{DCT}{Discrete Cosine Transform} - \acro{DSP}{Digital Signal Processors} - \acro{DAXPY}{Double-Precision aX Plus Y ($aX+Y$)} - \acro{FFT}{Fast Fourier Transform} - \acro{IA-32}{Intel Architecture 32-bit or i386} - \acro{ISA}{Instruction Set Architecture} - \acro{MMX}{Intel's first SIMD implementation} - \acro{RVV}{RISC-V Vector extension} - \acro{SIMD}{Single Instruction Multiple Data} - \acro{SWAR}{SIMD Within A Register (see Flynn's Taxonomy)} - \acro{SV}{(Scalable) Simple Vectorisation or Simple-V} - \acro{SVE2}{ARM Scalable Vector Extension version two} - \acro{SVP64}{Simple-V with Prefixing of Power ISA, 64-bits in length} - \acro{VLIW}{Very Long Instruction Word} - \acro{VSX}{128-bit Packed SIMD Extension to the Power ISA} -\end{acronym} diff --git a/svp64-primer/img/cray_vector_regs.png b/svp64-primer/img/cray_vector_regs.png deleted file mode 100644 index b7e4f8a02..000000000 Binary files a/svp64-primer/img/cray_vector_regs.png and /dev/null differ diff --git a/svp64-primer/img/cray_vector_regs.svg b/svp64-primer/img/cray_vector_regs.svg deleted file mode 100644 index b84bdc0dc..000000000 --- a/svp64-primer/img/cray_vector_regs.svg +++ /dev/null @@ 
-1,623 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - 0 - 1 - 2 - 3 - 4 - - 0 - - 1 - 2 - 3 - 4 - 5 - 63 - - - - - - - ... - 6 - 7 - element numbers - Cray 1 - setvl 5vadd r0,r1,r2 - registernumbers - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/svp64-primer/img/power_pipelines.png b/svp64-primer/img/power_pipelines.png deleted file mode 100644 index 98afc6b8b..000000000 Binary files a/svp64-primer/img/power_pipelines.png and /dev/null differ diff --git a/svp64-primer/img/power_pipelines.svg b/svp64-primer/img/power_pipelines.svg deleted file mode 100644 index fbbd63de2..000000000 --- a/svp64-primer/img/power_pipelines.svg +++ /dev/null @@ -1,669 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - FetchPC - - Decode - - - Issue - - Execute - Scalar - - FetchPC - - Decode - - Issue - - Execute - - Loop0...VL-1 - - SV - PredicateMask bits - - - - - - - - diff --git a/svp64-primer/img/simd_axb.png b/svp64-primer/img/simd_axb.png deleted file mode 100644 index 52d32a47a..000000000 Binary files a/svp64-primer/img/simd_axb.png and /dev/null differ diff --git a/svp64-primer/img/simd_axb.svg b/svp64-primer/img/simd_axb.svg deleted file mode 100644 index 9768e03d6..000000000 --- a/svp64-primer/img/simd_axb.svg +++ /dev/null @@ -1,352 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - A2 - A3 - A4 - - - - - - - A1 - B1 - A1xB1 - B2 - B3 - B4 - A2xB2 - A3xB3 - A4xB4 - - diff --git a/svp64-primer/img/sv_multi_issue.svg b/svp64-primer/img/sv_multi_issue.svg deleted file mode 100644 index 7d7d4359e..000000000 --- a/svp64-primer/img/sv_multi_issue.svg +++ /dev/null @@ -1,1002 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - image/svg+xml - - - - - - - - - FetchPC - - Decode - - - Issue - - Execute - - - - Multi-Issue - - Issue - - Execute - - - - - - - SVMulti-Issue - Predicate Mask bitsPassed to ALUs - - - FetchPC - - - - Decode - - - - - - Loop 01234567 - - - - - - - - IssuePredicate1011 - IssuePredicate0110 - - - - - - - - ExecuteMaskedExecuteExecute - MaskedExecuteExecuteMasked - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/svp64-primer/img/svp64_regs.png b/svp64-primer/img/svp64_regs.png deleted file mode 100644 index 605cefac0..000000000 Binary files a/svp64-primer/img/svp64_regs.png and /dev/null differ diff --git a/svp64-primer/img/svp64_regs.svg b/svp64-primer/img/svp64_regs.svg deleted file mode 100644 index 1f18d7f2a..000000000 --- a/svp64-primer/img/svp64_regs.svg +++ /dev/null @@ -1,927 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - r0 - r1 - r2 - r3 - r4 - byte-level write-enable lines - - - - - - - - - - - - - - - - - - - - - - - - 64-bit wide Scalar registers(not Vector Registers) - setvl 5 - sv.addi 0,0,1 - SV - - r40 - - - - - - - - - - - - - - - - - - - - - - - - r41 - r42 - sv.addi/ew=32 40,40,1 - - - - - - - - - - - - - - - - - - r80 - r81 - r127 - Vector length 564-bit elementspacked into scalarGPRs r0-r4 - Vector length 532-bit elementspacked into GPRsr40-r41and lower half of r42. Upper half ofr42 UNMODIFIED - Vector length 516-bit elementspacked into GPRs r80and lowest half-wordof r81. 
Remainderof r81 UNMODIFIED - sv.addi/ew=16 80,80,1 - GPR/FPR extendedfrom 32 to 128 - - - - - - - - - - - - - - - - diff --git a/svp64-primer/img/vl_reg_n.jpg b/svp64-primer/img/vl_reg_n.jpg deleted file mode 100644 index 6f239bb89..000000000 Binary files a/svp64-primer/img/vl_reg_n.jpg and /dev/null differ diff --git a/svp64-primer/references.bib b/svp64-primer/references.bib deleted file mode 100644 index 6245be44d..000000000 --- a/svp64-primer/references.bib +++ /dev/null @@ -1,90 +0,0 @@ -@online{SIMD_HARM, - ALTauthor = {David Patterson, Andrew Waterman}, - ALTeditor = {editor}, - title = {SIMD Instructions Considered Harmful}, - date = {18-09-2017}, - howpublished={\par\url{https://www.sigarch.org/simd-instructions-considered-harmful/}}, - OPTsubtitle = {subtitle}, - OPTtitleaddon = {titleaddon}, - OPTlanguage = {English}, - OPTversion = {version}, - OPTnote = {note}, - OPTorganization = {organization}, - OPTdate = {date}, - OPTmonth = {month}, - OPTyear = {year}, - OPTaddendum = {addendum}, - OPTpubstate = {pubstate}, - OPTurldate = {16-06-2022}, -} - -@online{SIMD_HPC, - ALTauthor = {João M.P.Cardoso, José Gabriel F.Coutinho, Pedro C.Diniz}, - ALTeditor = {editor}, - title = {High-performance embedded computing}, - date = {2017}, - howpublished={\par\url{https://www.sciencedirect.com/topics/computer-science/single-instruction-multiple-data}}, - OPTsubtitle = {subtitle}, - OPTtitleaddon = {titleaddon}, - OPTlanguage = {English}, - OPTversion = {version}, - OPTnote = {note}, - OPTorganization = {organization}, - OPTdate = {date}, - OPTmonth = {month}, - OPTyear = {year}, - OPTaddendum = {addendum}, - OPTpubstate = {pubstate}, - OPTurldate = {urldate}, -} - -@online{SIMD_WASM, - ALTauthor = {Nick Lewycky}, - ALTeditor = {editor}, - title = {WebAssembly and SIMD}, - date = {31-07-2019}, - howpublished={\par\url{https://medium.com/wasmer/webassembly-and-simd-13badb9bf1a8}}, - OPTsubtitle = {subtitle}, - OPTtitleaddon = {titleaddon}, - OPTlanguage = {language}, - 
OPTversion = {version}, - OPTnote = {note}, - OPTorganization = {organization}, - OPTdate = {date}, - OPTmonth = {month}, - OPTyear = {year}, - OPTaddendum = {addendum}, - OPTpubstate = {pubstate}, - OPTurldate = {urldate}, -} - -@online{riscv-v-spec, - ALTauthor = {author}, - ALTeditor = {editor}, - title = {RISC-V "V" Vector Extension}, - date = {date}, - howpublished={\par\url{https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc}}, - OPTsubtitle = {subtitle}, - OPTtitleaddon = {titleaddon}, - OPTlanguage = {language}, - OPTedition = {edition}, - OPTtype = {type}, - OPTseries = {series}, - OPTnumber = {number}, - OPTversion = {version}, - OPTnote = {note}, - OPTorganization = {organization}, - OPTpublisher = {publisher}, - OPTlocation = {location}, - OPTisbn = {isbn}, - OPTchapter = {2}, - OPTpages = {8}, - OPTpagetotal = {111}, - OPTaddendum = {addendum}, - OPTpubstate = {pubstate}, - OPTdoi = {doi}, - OPTeprint = {eprint}, - OPTeprintclass = {eprintclass}, - OPTeprinttype = {eprinttype}, - OPTurldate = {20-09-2021}, -} diff --git a/svp64-primer/summary.tex b/svp64-primer/summary.tex deleted file mode 100644 index 318125d8b..000000000 --- a/svp64-primer/summary.tex +++ /dev/null @@ -1,206 +0,0 @@ -\section{Summary} -The proposed \acs{SV} is a Scalable Vector Specification for a hardware for-loop \textbf{that -ONLY uses scalar instructions}. - -\begin{itemize} -\itemsep 0em -\item The Power \acs{ISA} v3.1 Spec is not altered. - v3.1 Code-compatibility is guaranteed. -\item Does not require sacrificing 32-bit Major Opcodes. -\item Does not require adding duplicates of instructions - (popcnt, popcntw, popcntd, vpopcntb, vpopcnth, vpopcntw, vpopcntd) -\item Fully abstracted: does not create Micro-architectural dependencies - (no fixed "Lane" size), one binary works across all existing - \textit{and future} implementations. 
-\item Specifically designed to be easily implemented - on top of an existing Micro-architecture (especially - Superscalar Out-of-Order Multi-issue) without - disruptive full architectural redesigns. -\item Divided into Compliancy Levels to suit differing needs. -\item At the highest Compliancy Level only requires five instructions - (SVE2 requires appx 9,000. \acs{AVX-512} around 10,000. \acs{RVV} around - 300). -\item Predication, often-requested, is added cleanly - (without modifying the v3.1 Power ISA) -\item In-registers arbitrary-sized Matrix Multiply is achieved in three - instructions (without adding any v3.1 Power ISA instructions) -\item Full \acs{DCT} and \acs{FFT} RADIX2 Triple-loops are achieved with - dramatically reduced instruction count, and power consumption expected - to greatly reduce. Normally found only in high-end \acs{VLIW} \acs{DSP} - (TI MSP, Qualcomm Hexagon) -\item Fail-First Load/Store allows Vectorised high performance - strncpy to be implemented in around 14 - instructions (hand-optimised \acs{VSX} assembler is 240). -\item Inner loop of MP3 implemented in under 100 instructions - (gcc produces 450 for the same function on POWER9). -\end{itemize} - -All areas investigated so far consistently showed reductions in executable -size, which as outlined in \cite{SIMD_HARM} has an indirect reduction in -power consumption due to less I-Cache/TLB pressure and also Issue remaining -idle for long periods. -Simple-V has been specifically and carefully crafted to respect -the Power ISA's Supercomputing pedigree. - -\begin{figure}[hb] - \centering - \includegraphics[width=0.6\linewidth]{power_pipelines.png} - \caption{Showing how SV fits in between Decode and Issue} - \label{fig:power_pipelines} -\end{figure} - -\pagebreak - -\subsection{What is SIMD?} - -\acs{SIMD} is a way of partitioning existing \acs{CPU} -registers of 64-bit length into smaller 8-, 16-, 32-bit pieces. 
-\cite{SIMD_HARM}\cite{SIMD_HPC} -These partitions can then be operated on simultaneously, with the initial values -and results being stored as entire 64-bit registers (\acs{SWAR}). -The SIMD instruction opcode -includes the data width and the operation to perform. -\par - -\begin{figure}[hb] - \centering - \includegraphics[width=0.6\linewidth]{simd_axb.png} - \caption{SIMD multiplication} - \label{fig:simd_axb} -\end{figure} - -This method can have a huge advantage for rapid processing of -vector-type data (image/video, physics simulations, cryptography, -etc.), -\cite{SIMD_WASM}, - and thus on paper is very attractive compared to -scalar-only instructions. -\textit{As long as the data width fits the workload, everything is fine}. -\par - -\subsection{Shortfalls of SIMD} -SIMD registers are of a fixed length and thus to achieve greater -performance, CPU architects typically increase the width of registers -(to 128-, 256-, 512-bit etc) for more partitions.\par Additionally, -binary compatibility is an important feature, and thus each doubling -of SIMD registers also expands the instruction set. The number of -instructions quickly balloons and this can be seen in for example -IA-32 expanding from 80 to about 1400 instructions since -the 1970s\cite{SIMD_HARM}.\par - -Five digit Opcode proliferation (10,000 instructions) is overwhelming. -The following are just some of the reasons why SIMD is unsustainable as -the number of instructions increases: -\begin{itemize} - \itemsep 0em - \item Hardware design, ASIC routing etc. - \item Compiler design - \item Documentation of the ISA - \item Manual coding and optimisation - \item Time to support the platform - \item Compliance Suite development and testing - \item Protracted Variable-Length encoding (x86) severely compromises - Multi-issue decoding -\end{itemize} - -\subsection{Scalable Vector Architectures} -An older alternative exists to utilise data parallelism - vector -architectures.
Vector CPUs collect operands from the main memory, and -store them in large, sequential vector registers.\par - -A simple vector processor might operate on one element at a time, -however as the element operations are usually independent, -a processor could be made to compute all of the vector's -elements simultaneously, taking advantage of multiple pipelines.\par - -Typically, today's vector processors can execute two, four, or eight -64-bit elements per clock cycle -\cite{SIMD_HARM}. -Vector ISAs are specifically designed to deal with (in hardware) fringe -cases where an algorithm's element count is not a multiple of the -underlying hardware "Lane" width. The element data width -is variable (8 to 64-bit just like in SIMD) -but it is the \textit{number} of elements being -variable under control of a "setvl" instruction that specifically -makes Vector ISAs "Scalable". -\par - -\acs{RVV} supports a VL of up to $2^{16}$ or $65536$ bits, -which can fit 1024 64-bit words -\cite{riscv-v-spec}. -The Cray-1 had 8 Vector Registers with up to 64 elements (64-bit each). -An early Draft of RVV supported overlaying the Vector Registers onto the -Floating Point registers, similar to \acs{MMX}. - -\begin{figure}[ht] - \centering - \includegraphics[width=0.6\linewidth]{cray_vector_regs.png} - \caption{Cray Vector registers: 8 registers, 64 elements each} - \label{fig:cray_vector_regs} -\end{figure} - -Simple-V's "Vector" Registers (a misnomer) are specifically designed to fit -on top of -the Scalar (GPR, FPR) register files, which are extended from the default -of 32, to 128 entries in the high-end Compliancy Levels. This is a primary -reason why Simple-V can be added on top of an existing Scalar ISA, and -\textit{in particular} why there is no need to add explicit Vector -Registers or -Vector instructions.
The diagram below shows \textit{conceptually} -how a Vector's elements are sequentially and linearly mapped onto the -\textit{Scalar} register file: - -\begin{figure}[ht] - \centering - \includegraphics[width=0.6\linewidth]{svp64_regs.png} - \caption{three instructions, same vector length, different element widths} - \label{fig:svp64_regs} -\end{figure} - -\pagebreak - -\subsection{Simple Vectorisation} -\acs{SV} is a Scalable Vector ISA designed for hybrid workloads (CPU, GPU, -VPU, 3D?). Includes features normally found only on Cray-style Supercomputers -(Cray-1, NEC SX-Aurora) and GPUs. Keeps to a strict uniform RISC paradigm, -leveraging a scalar ISA by using "Prefixing". -\textbf{No dedicated vector opcodes exist in SV, at all}. -SVP64 uses 25\% of the Power ISA v3.1 64-bit Prefix space (EXT001) to create -the SV Vectorisation Context for the 32-bit Scalar Suffix. - -\vspace{10pt} -Main design principles -\begin{itemize} - \itemsep 0em - \item Introduce by implementing on top of existing Power ISA - \item Effectively a \textbf{hardware for-loop}, pauses main PC, - issues multiple scalar operations - \item Strictly preserves (leverages) underlying scalar execution - dependencies as if - the for-loop had been expanded into actual scalar instructions - ("preserving Program Order") - \item Augments existing instructions by adding "tags" - provides - Vectorisation "context" rather than adding new opcodes. - \item Does not modify or deviate from the underlying scalar - Power ISA unless there's a significant performance boost or other - advantage in the vector space - \item Aimed at Supercomputing: avoids creating significant - \textit{sequential dependency hazards}, allowing \textbf{high - performance multi-issue superscalar microarchitectures} to be - leveraged. -\end{itemize} - -Advantages include: -\begin{itemize} - \itemsep 0em - \item Easy to create first (and sometimes only) implementation - as a literal for-loop in hardware, simulators, and compilers. 
- \item Obliterates SIMD opcode proliferation - ($O(N^6)$) as well as dedicated Vectorisation - ISAs. No more separate vector instructions. - \item Reducing maintenance overhead (no separate Vector instructions). - Adding any new Scalar instruction - \textit{automatically adds a Vectorised version of the same}. - \item Easier for compilers, coders, documentation -\end{itemize} - diff --git a/svp64-primer/svp64-primer.tex b/svp64-primer/svp64-primer.tex deleted file mode 100644 index 7ca8023ba..000000000 --- a/svp64-primer/svp64-primer.tex +++ /dev/null @@ -1,32 +0,0 @@ -\documentclass[a4paper, 10pt]{article} -\usepackage[utf8]{inputenc} -\usepackage[firstpage]{draftwatermark} -\usepackage[printonlyused,withpage]{acronym} -\usepackage{graphicx} -\usepackage{float} -\usepackage{url} -\usepackage[margin=1.1in]{geometry} -\graphicspath{ {./img/} } - -\title{(DRAFT) SVP64 Primer} - -\author{Andrey Miroshnikov, Luke Kenneth Casson Leighton} - -\SetWatermarkLightness{0.5} -\SetWatermarkScale{4} -%\SetWatermarkText{DRAFT!} - -\begin{document} -\maketitle - - -\input{acronyms} -\pagebreak -\input{summary} -%\input{...} - -\section{References} -%\textit{(All references and sources are available on request)} -\bibliography{references} -\bibliographystyle{ieeetr} -\end{document} diff --git a/svp64-primer/svp64-proposal.tex b/svp64-primer/svp64-proposal.tex deleted file mode 100644 index eb714dbf7..000000000 --- a/svp64-primer/svp64-proposal.tex +++ /dev/null @@ -1,33 +0,0 @@ -\documentclass[a4paper, 10pt]{article} -\usepackage[utf8]{inputenc} -\usepackage[firstpage]{draftwatermark} -\usepackage[printonlyused,withpage]{acronym} -\usepackage{graphicx} -\usepackage{float} -\usepackage{url} -\usepackage[margin=1.1in]{geometry} -\graphicspath{ {./img/} } - -\title{(DRAFT) SVP64 Primer} - -\author{Luke Kenneth Casson Leighton, Andrey Miroshnikov} - -\SetWatermarkLightness{0.5} -\SetWatermarkScale{4} -%\SetWatermarkText{DRAFT!} - -\begin{document} -\maketitle - - -\input{acronyms} 
-\pagebreak -%\input{summary} -\section{Summary} -Simple Vectorisation requires considering the ISA as a 2-dimensional concept, with the instructions comprising one (vertical), and the register file as another (horizontal). - -\section{References} -%\textit{(All references and sources are available on request)} -\bibliography{references} -\bibliographystyle{ieeetr} -\end{document}