X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=simple_v_extension%2Fsimple_v_chennai_2018.tex;h=6401ef90baabb53b6331dc7c0ee80c93cd52f6df;hb=819751f461d50346468f729bc47ce745080a02e4;hp=417c0f3d921824477b98097ec8ed6eb1f9a4d652;hpb=63bb4e2c0b8bd01e6a97c57f8dde37d2f9dd9f5a;p=libreriscv.git diff --git a/simple_v_extension/simple_v_chennai_2018.tex b/simple_v_extension/simple_v_chennai_2018.tex index 417c0f3d9..6401ef90b 100644 --- a/simple_v_extension/simple_v_chennai_2018.tex +++ b/simple_v_extension/simple_v_chennai_2018.tex @@ -50,7 +50,7 @@ https://sigarch.org/simd-instructions-considered-harmful \item Setup and corner-cases alone are extremely complex.\\ Hardware is easy, but software is hell. - \item O($N^{6}$) ISA opcode proliferation!\\ + \item O($N^{6}$) ISA opcode proliferation (1000s of instructions)\\ opcode, elwidth, veclen, src1-src2-dest hi/lo \end{itemize} } @@ -59,6 +59,8 @@ \begin{itemize} \item Effectively a variant of SIMD / SIMT (arbitrary length)\vspace{4pt} + \item Fascinatingly, despite being a SIMD-variant, RVV only has + O(N) opcode proliferation! (extremely well designed) \item Extremely powerful (extensible to 256 registers)\vspace{4pt} \item Supports polymorphism, several datatypes (inc. FP16)\vspace{4pt} \item Requires a separate Register File (32 w/ext to 256)\vspace{4pt} @@ -66,11 +68,9 @@ \end{itemize} However... \begin{itemize} - \item 98 percent opcode duplication with rest of RV (CLIP) + \item 98 percent opcode duplication with rest of RV \item Extending RVV requires customisation not just of h/w:\\ gcc, binutils also need customisation (and maintenance) - \item Fascinatingly, despite being a SIMD-variant, RVV only has - O(N) opcode proliferation! (extremely well designed) \end{itemize} } @@ -103,7 +103,7 @@ \frame{\frametitle{What's the value of SV? 
Why adopt it even in non-V?} \begin{itemize} - \item memcpy becomes much smaller (higher bang-per-buck) + \item memcpy has a much higher bang-per-buck ratio \item context-switch (LOAD/STORE multiple): 1-2 instructions \item Compressed instrs further reduces I-cache (etc.) \item Reduced I-cache load (and less I-reads) @@ -124,15 +124,16 @@ \frame{\frametitle{How does Simple-V relate to RVV? What's different?} \begin{itemize} - \item RVV very heavy-duty (excellent for supercomputing)\vspace{8pt} - \item Simple-V abstracts parallelism (based on best of RVV)\vspace{8pt} - \item Graded levels: hardware, hybrid or traps (fit impl. need)\vspace{8pt} - \item Even Compressed become vectorised (RVV can't)\vspace{8pt} - \item No polymorphism in SV (too complex)\vspace{8pt} + \item RVV very heavy-duty (excellent for supercomputing)\vspace{4pt} + \item Simple-V abstracts parallelism (based on best of RVV)\vspace{4pt} + \item Graded levels: hardware, hybrid or traps (fit impl. need)\vspace{4pt} + \item Even Compressed become vectorised (RVV can't)\vspace{4pt} + \item No polymorphism in SV (too complex)\vspace{4pt} \end{itemize} What Simple-V is not:\vspace{4pt} \begin{itemize} - \item A full supercomputer-level Vector Proposal + \item A full supercomputer-level Vector Proposal\\ + (it's not actually a Vector Proposal at all!) \item A replacement for RVV (SV is designed to be over-ridden\\ by - or augmented to become - RVV) \end{itemize} @@ -149,11 +150,11 @@ \item Standard and future and custom opcodes now parallel\\ (crucially: with NO extra instructions needing to be added) \end{itemize} - Note: EVERYTHING is parallelised: + Note: EVERY scalar op is now parallelisable: \begin{itemize} \item All LOAD/STORE (inc. Compressed, Int/FP versions) \item All ALU ops (Int, FP, SIMD, DSP, everything) - \item All branches become predication targets (C.FNE added?) 
+ \item All branches become predication targets (note: no FNE) \item C.MV of particular interest (s/v, v/v, v/s) \item FCVT, FMV, FSGNJ etc. very similar to C.MV \end{itemize} @@ -300,15 +301,15 @@ for (int i = 0; i < VL; ++i) \frametitle{Register key-value CSR table decoding pseudocode} \begin{semiverbatim} -struct vectorised fp\_vec[32], int\_vec[32]; // 64 in future +struct vectorised fp\_vec[32], int\_vec[32]; for (i = 0; i < 16; i++) // 16 CSRs? tb = int\_vec if CSRvec[i].type == 0 else fp\_vec idx = CSRvec[i].regkey // INT/FP src/dst reg in opcode tb[idx].elwidth = CSRvec[i].elwidth tb[idx].regidx = CSRvec[i].regidx // indirection + tb[idx].regidx += CSRvec[i].bank << 5 // 0 (1=rsvd) tb[idx].isvector = CSRvec[i].isvector tb[idx].packed = CSRvec[i].packed // SIMD or not - tb[idx].bank = CSRvec[i].bank // 0 (1=rsvd) tb[idx].enabled = true \end{semiverbatim} @@ -343,14 +344,14 @@ for (i = 0; i < 16; i++) // 16 CSRs? \frametitle{Predication key-value CSR table decoding pseudocode} \begin{semiverbatim} -struct pred fp\_pred[32], int\_pred[32]; // 64 in future +struct pred fp\_pred[32], int\_pred[32]; for (i = 0; i < 16; i++) // 16 CSRs? tb = int\_pred if CSRpred[i].type == 0 else fp\_pred idx = CSRpred[i].regkey tb[idx].zero = CSRpred[i].zero // zeroing tb[idx].inv = CSRpred[i].inv // inverted tb[idx].predidx = CSRpred[i].predidx // actual reg - tb[idx].bank = CSRpred[i].bank // 0 for now + tb[idx].predidx += CSRpred[i].bank << 5 // 0 (1=rsvd) tb[idx].enabled = true \end{semiverbatim} @@ -369,9 +370,9 @@ for (i = 0; i < 16; i++) // 16 CSRs? 
\begin{semiverbatim} def get\_pred\_val(bool is\_fp\_op, int reg): tb = int\_pred if is\_fp\_op else fp\_pred - if (!tb[reg].enabled): - return ~0x0 // all ops enabled - predidx = tb[reg].predidx // redirection occurs HERE + if (!tb[reg].enabled): return ~0x0 // all ops enabled + predidx = tb[reg].predidx // redirection occurs HERE + predidx += tb[reg].bank << 5 // 0 (1=rsvd) predicate = intreg[predidx] // actual predicate HERE if (tb[reg].inv): predicate = ~predicate // invert ALL bits @@ -470,10 +471,10 @@ def get\_pred\_val(bool is\_fp\_op, int reg): \begin{semiverbatim} function op\_add(rd, rs1, rs2) # add not VADD!  int i, id=0, irs1=0, irs2=0; +  predval = get\_pred\_val(FALSE, rd);  rd = int\_vec[rd ].isvector ? int\_vec[rd ].regidx : rd;  rs1 = int\_vec[rs1].isvector ? int\_vec[rs1].regidx : rs1;  rs2 = int\_vec[rs2].isvector ? int\_vec[rs2].regidx : rs2; -  predval = get\_pred\_val(FALSE, rd);  for (i = 0; i < VL; i++) if (predval \& 1<