From ef7798e5c908b9968e2e23b52bef914a56f868b8 Mon Sep 17 00:00:00 2001 From: lkcl Date: Sat, 9 Jan 2021 22:15:23 +0000 Subject: [PATCH] --- simple_v_extension/remap.mdwn | 162 +--------------------------------- 1 file changed, 1 insertion(+), 161 deletions(-) diff --git a/simple_v_extension/remap.mdwn b/simple_v_extension/remap.mdwn index 907c9a189..4d1614307 100644 --- a/simple_v_extension/remap.mdwn +++ b/simple_v_extension/remap.mdwn @@ -2,164 +2,4 @@ # NOTE -This section is under revision (and is optional) - -# REMAP - -REMAP allows the usual vector loop `0..VL-1` to be "reshaped" (re-mapped) from a linear -form to a 2D or 3D transposed form, or "offset" to permit arbitrary -access to elements, independently on each Vector src or dest register. - -Their primary use is for Matrix Multiplication, reordering of sequential data in-place. Four CSRs are provided so that a single FMAC may be used in a single loop to perform 4x4 times 4x4 Matrix multiplication, generating 64 FMACs. Additional uses include regular "Structure Packing" such as RGB pixel data extraction and reforming. - -# SHAPE 1D/2D/3D vector-matrix remapping CSRs - -There are four "shape" CSRs, SHAPE0-3, 32-bits in each, -which have the same format. - -[[!inline raw="yes" pages="simple_v_extension/shape_table_format" ]] - -The algorithm below shows how REMAP works more clearly, and may be -executed as a python program: - - xdim = 3 - ydim = 4 - zdim = 1 - - lims = [xdim, ydim, zdim] - idxs = [0,0,0] # starting indices - order = [0,1,2] # experiment with different permutations, here - offset = 2 # experiment with different offset, here - VL = xdim * ydim * zdim # multiply (or add) to this to get "cycling" - applydim = 0 - invxyz = [0,0,0] - - # run for offset iterations before actually starting - for idx in range(offset): - for i in range(3): - idxs[order[i]] = idxs[order[i]] + 1 - if (idxs[order[i]] != lims[order[i]]): - break - idxs[order[i]] = 0 - - break_count = 0 - - for idx in range(VL): - ix = [0] * 3 - for i in range(3): - if i >= applydim: - ix[i] = idxs[i] - if invxyz[i]: - ix[i] = lims[i] - 1 - ix[i] - new_idx = ix[0] + ix[1] * xdim + ix[2] * xdim * ydim - print new_idx, - break_count += 1 - if break_count == lims[order[0]]: - print - break_count = 0 - for i in range(3): - idxs[order[i]] = idxs[order[i]] + 1 - if (idxs[order[i]] != lims[order[i]]): - break - idxs[order[i]] = 0 - -Here, it is assumed that this algorithm be run within all pseudo-code -throughout this document where a (parallelism) for-loop would normally -run from 0 to VL-1 to refer to contiguous register -elements; instead, where REMAP indicates to do so, the element index -is run through the above algorithm to work out the **actual** element -index, instead. Given that there are four possible SHAPE entries, up to -four separate registers in any given operation may be simultaneously -remapped: - - function op_add(rd, rs1, rs2) # add not VADD! - ... - ... -  for (i = 0; i < VL; i++) - xSTATE.srcoffs = i # save context - if (predval & 1<