From 5477e6901793ee57cee90b09c6a5c88af54f64a4 Mon Sep 17 00:00:00 2001 From: Luke Kenneth Casson Leighton Date: Tue, 25 Jun 2019 12:54:19 +0100 Subject: [PATCH] cut out section that is now in the appendix --- simple_v_extension/abridged_spec.mdwn | 675 +------------------------- 1 file changed, 1 insertion(+), 674 deletions(-) diff --git a/simple_v_extension/abridged_spec.mdwn b/simple_v_extension/abridged_spec.mdwn index 30f147d61..3e184128a 100644 --- a/simple_v_extension/abridged_spec.mdwn +++ b/simple_v_extension/abridged_spec.mdwn @@ -3,6 +3,7 @@ * Copyright (C) 2017, 2018, 2019 Luke Kenneth Casson Leighton * Status: DRAFTv0.6 * Last edited: 25 jun 2019 +* See: main [[specification]] and [[appendix]] [[!toc ]] @@ -364,680 +365,6 @@ were not tested. The pseudo-code for Predication makes this clearer and simpler than it is in words (the loop ends, VL is set to the current element index, "i"). -# Instructions - -To illustrate how Scalar operations are turned "vector" and "predicated", -simplified example pseudo-code for an integer ADD operation is shown below. -Floating-point would use the FP Register Table. - - function op_add(rd, rs1, rs2) # add not VADD! -  int i, id=0, irs1=0, irs2=0; -  predval = get_pred_val(FALSE, rd); -  rd = int_vec[rd ].isvector ? int_vec[rd ].regidx : rd; -  rs1 = int_vec[rs1].isvector ? int_vec[rs1].regidx : rs1; -  rs2 = int_vec[rs2].isvector ? int_vec[rs2].regidx : rs2; -  for (i = 0; i < VL; i++) - xSTATE.srcoffs = i # save context - if (predval & 1< - -Adding in support for SUBVL is a matter of adding in an extra inner -for-loop, where register src and dest are still incremented inside the -inner part. Note that the predication is still taken from the VL index. - -So whilst elements are indexed by "(i * SUBVL + s)", predicate bits are -indexed by "(i)" - - function op_add(rd, rs1, rs2) # add not VADD! -  int i, id=0, irs1=0, irs2=0; -  predval = get_pred_val(FALSE, rd); -  rd = int_vec[rd ].isvector ? 
int_vec[rd ].regidx : rd; -  rs1 = int_vec[rs1].isvector ? int_vec[rs1].regidx : rs1; -  rs2 = int_vec[rs2].isvector ? int_vec[rs2].regidx : rs2; -  for (i = 0; i < VL; i++) - xSTATE.srcoffs = i # save context - for (s = 0; s < SUBVL; s++) - xSTATE.ssvoffs = s # save context - if (predval & 1< - -Branch operations use standard RV opcodes that are reinterpreted to -be "predicate variants" in the instance where either of the two src -registers are marked as vectors (active=1, vector=1). - -Note that the predication register to use (if one is enabled) is taken from -the *first* src register, and that this is used, just as with predicated -arithmetic operations, to mask whether the comparison operations take -place or not. If the second register is also marked as predicated, -that (scalar) predicate register is used as a **destination** to store -the results of all the comparisons. - -In instances where no vectorisation is detected on either src register -the operation is treated as an absolutely standard scalar branch operation. -Where vectorisation is present on either or both src registers, the -branch may still go ahead if and only if *all* tests succeed (i.e. excluding -those tests that are predicated out). - -Pseudo-code for branch: - - s1 = reg_is_vectorised(src1); - s2 = reg_is_vectorised(src2); - - if not s1 && not s2 - if cmp(rs1, rs2) # scalar compare - goto branch - return - - preg = int_pred_reg[rd] - reg = int_regfile - - ps = get_pred_val(I/F==INT, rs1); - rd = get_pred_val(I/F==INT, rs2); # this may not exist - - if not exists(rd) or zeroing: - result = 0 - else - result = preg[rd] - - for (int i = 0; i < VL; ++i) - if (zeroing) - if not (ps & (1< - -There is no MV instruction in RV however there is a C.MV instruction. -It is used for copying integer-to-integer registers (vectorised FMV -is used for copying floating-point). 
- -If either the source or the destination register is marked as a vector, -C.MV is reinterpreted to be a vectorised (multi-register) predicated -move operation. The actual instruction's format does not change. - -There are several different instructions from RVV that are covered by -this one opcode: - -[[!table data=""" -src | dest | predication | op | -scalar | vector | none | VSPLAT | -scalar | vector | destination | sparse VSPLAT | -scalar | vector | 1-bit dest | VINSERT | -vector | scalar | 1-bit? src | VEXTRACT | -vector | vector | none | VCOPY | -vector | vector | src | Vector Gather | -vector | vector | dest | Vector Scatter | -vector | vector | src & dest | Gather/Scatter | -vector | vector | src == dest | sparse VCOPY | -"""]] - -Also, VMERGE may be implemented as back-to-back (macro-op fused) C.MV -operations with zeroing off, and inversion on the src and dest -predication for one of the two C.MV operations. - -### FMV, FNEG and FABS Instructions - -These are identical in form to C.MV, except covering floating-point -register copying. The same double-predication rules also apply. -However when elwidth is not set to default the instruction is implicitly -and automatically converted to a (vectorised) floating-point type conversion -operation of the appropriate size covering the source and destination -register bitwidths. - -(Note that FMV, FNEG and FABS are all actually pseudo-instructions) - -### FCVT Instructions - -These are again identical in form to C.MV, except that they cover -floating-point to integer and integer to floating-point. When element -width in each vector is set to default, the instructions behave exactly -as they are defined for standard RV (scalar) operations, except vectorised -in exactly the same fashion as outlined in C.MV. 
- -However when the source or destination element width is not set to default, -the opcode's explicit element widths are *over-ridden* to new definitions, -and the opcode's element width is taken as indicative of the SIMD width -(if applicable i.e. if packed SIMD is requested) instead. - -## LOAD / STORE Instructions and LOAD-FP/STORE-FP - -In vectorised architectures there are usually at least two different modes -for LOAD/STORE: - -* Read (or write for STORE) from sequential locations, where one - register specifies the address, and the one address is incremented - by a fixed amount. This is usually known as "Unit Stride" mode. -* Read (or write) from multiple indirected addresses, where the - vector elements each specify separate and distinct addresses. - -To support these different addressing modes, the CSR Register "isvector" -bit is used. So, for a LOAD, when the src register is set to -scalar, the LOADs are sequentially incremented by the src register -element width, and when the src register is set to "vector", the -elements are treated as indirection addresses. Simplified -pseudo-code would look like this: - - function op_ld(rd, rs) # LD not VLD! -  rdv = int_csr[rd].active ? int_csr[rd].regidx : rd; -  rsv = int_csr[rs].active ? int_csr[rs].regidx : rs; -  ps = get_pred_val(FALSE, rs); # predication on src -  pd = get_pred_val(FALSE, rd); # ... AND on dest -  for (int i = 0, int j = 0; i < VL && j < VL;): - if (int_csr[rs].isvec) while (!(ps & 1< - -C.LWSP / C.SWSP and floating-point etc. are also source-dest twin-predicated, -where it is implicit in C.LWSP/FLWSP etc. that x2 is the source register. -It is therefore possible to use predicated C.LWSP to efficiently -pop registers off the stack (by predicating x2 as the source), cherry-picking -which registers to store to (by predicating the destination). Likewise -for C.SWSP. In this way, LOAD/STORE-Multiple is efficiently achieved. 
- -**Note**: it is still possible to redirect x2 to an alternative target -register. With care, this allows C.LWSP / C.SWSP (and C.FLWSP) to be used as -general-purpose LOAD/STORE operations. - -## Compressed LOAD / STORE Instructions - -Compressed LOAD and STORE are again exactly the same as scalar LOAD/STORE, -where the same rules apply and the same pseudo-code applies as for -non-compressed LOAD/STORE. Again: setting scalar or vector mode -on the src for LOAD and dest for STORE switches mode from "Unit Stride" -to "Multi-indirection", respectively. - -# Element bitwidth polymorphism - -Element bitwidth is best covered as its own special section, as it -is quite involved and applies uniformly across-the-board. SV restricts -bitwidth polymorphism to default, 8-bit, 16-bit and 32-bit. - -The effect of setting an element bitwidth is to re-cast each entry -in the register table, and for all memory operations involving -load/stores of certain specific sizes, to a completely different width. -Thus, in C-style terms, on an RV64 architecture, effectively each register -now looks like this: - - typedef union { - uint8_t actual_bytes[8]; // 8 for RV64, 4 for RV32, 16 for RV128 - uint8_t b[0]; // array of type uint8_t - uint16_t s[0]; - uint32_t i[0]; - uint64_t l[0]; - uint128_t d[0]; - } reg_t; - - reg_t int_regfile[128]; - -Implementors must ensure that over-runs of the register file throw -an exception. 
- -The pseudo-code is as follows, to demonstrate how the sign-extending -and width-extending works: - - typedef union { - uint8_t b; - uint16_t s; - uint32_t i; - uint64_t l; - } el_reg_t; - - bw(elwidth): - if elwidth == 0: return xlen - if elwidth == 1: return 8 - if elwidth == 2: return 16 - // elwidth == 3: - return 32 - - get_max_elwidth(rs1, rs2): - return max(bw(int_csr[rs1].elwidth), # default (XLEN) if not set - bw(int_csr[rs2].elwidth)) # again XLEN if no entry - - get_polymorphed_reg(reg, bitwidth, offset): - el_reg_t res; - res.l = 0; // TODO: going to need sign-extending / zero-extending - if bitwidth == 8: - reg.b = int_regfile[reg].b[offset] - elif bitwidth == 16: - reg.s = int_regfile[reg].s[offset] - elif bitwidth == 32: - reg.i = int_regfile[reg].i[offset] - elif bitwidth == 64: - reg.l = int_regfile[reg].l[offset] - return res - - set_polymorphed_reg(reg, bitwidth, offset, val): - if (!int_csr[reg].isvec): - # sign/zero-extend depending on opcode requirements, from - # the reg's bitwidth out to the full bitwidth of the regfile - val = sign_or_zero_extend(val, bitwidth, xlen) - int_regfile[reg].l[0] = val - elif bitwidth == 8: - int_regfile[reg].b[offset] = val - elif bitwidth == 16: - int_regfile[reg].s[offset] = val - elif bitwidth == 32: - int_regfile[reg].i[offset] = val - elif bitwidth == 64: - int_regfile[reg].l[offset] = val - - maxsrcwid = get_max_elwidth(rs1, rs2) # source element width(s) - destwid = int_csr[rs1].elwidth # destination element width -  for (i = 0; i < VL; i++) - if (predval & 1< - -Polymorphic element widths in vectorised form means that the data -being loaded (or stored) across multiple registers needs to be treated -(reinterpreted) as a contiguous stream of elwidth-wide items, where -the source register's element width is **independent** from the destination's. 
- -This makes for a slightly more complex algorithm when using indirection -on the "addressed" register (source for LOAD and destination for STORE), -particularly given that the LOAD/STORE instruction provides important -information about the width of the data to be reinterpreted. - -As LOAD/STORE may be twin-predicated, it is important to note that -the rules on twin predication still apply. Where in previous -pseudo-code (elwidth=default for both source and target) it was -the *registers* that the predication was applied to, it is now the -**elements** that the predication is applied to. - -The pseudocode for all LD operations may be written out -as follows: - - function LBU(rd, rs): - load_elwidthed(rd, rs, 8, true) - function LB(rd, rs): - load_elwidthed(rd, rs, 8, false) - function LH(rd, rs): - load_elwidthed(rd, rs, 16, false) - ... - ... - function LQ(rd, rs): - load_elwidthed(rd, rs, 128, false) - - # returns 1 byte of data when opwidth=8, 2 bytes when opwidth=16.. - function load_memory(rs, imm, i, opwidth): - elwidth = int_csr[rs].elwidth - bitwidth = bw(elwidth); - elsperblock = min(1, opwidth / bitwidth) - srcbase = ireg[rs+i/(elsperblock)]; - offs = i % elsperblock; - return mem[srcbase + imm + offs]; # 1/2/4/8/16 bytes - - function load_elwidthed(rd, rs, opwidth, unsigned): - destwid = int_csr[rd].elwidth # destination element width -  rd = int_csr[rd].active ? int_csr[rd].regidx : rd; -  rs = int_csr[rs].active ? int_csr[rs].regidx : rs; -  ps = get_pred_val(FALSE, rs); # predication on src -  pd = get_pred_val(FALSE, rd); # ... AND on dest -  for (int i = 0, int j = 0; i < VL && j < VL;): - if (int_csr[rs].isvec) while (!(ps & 1<