(no commit message)

[libreriscv.git] / simple_v_extension / specification / mv.x.rst
diff --git a/simple_v_extension/specification/mv.x.rst b/simple_v_extension/specification/mv.x.rst

index b564eb5625da3ec4329508be8f93b6b935a9441e..7d3275447c4d035ddff8dbc1febcae3ab34c8fdc 100644 (file)
--- a/simple_v_extension/specification/mv.x.rst
+++ b/simple_v_extension/specification/mv.x.rst
@@ -1,7 +1,10 @@
+[[!tag standards]]
+
  MV.X and MV.swizzle
  ===================
  
-swizzle needs a MV.  see below for a potential way to use the funct7 to do a swizzle in rs2.
+Swizzle needs a MV instruction (there are two of them: swizzle and swizzle2).
+See below for a potential way to use the funct7 to do a swizzle in rs2.
  
  +---------------+-------------+-------+----------+----------+--------+----------+--------+--------+
  | Encoding      | 31:27       | 26:25 | 24:20    | 19:15    | 14:12  | 11:7     | 6:2    | 1:0    |
@@ -11,13 +14,12 @@ swizzle needs a MV.  see below for a potential way to use the funct7 to do a swi
  | RV32-I-type   + fn4[3:0]    + swizzle[7:0]     + rs1[4:0] + 0b000  | rd[4:0]  + OP-V   + 0b11   |
  +---------------+-------------+-------+----------+----------+--------+----------+--------+--------+
  
-* funct3 = MV
+* funct3 = MV: 0b000 for FP, 0b001 for INT
  * OP-V = 0b1010111
  * fn4 = 4 bit function.
-* fn4 = 0b0000 - INT MV-SWIZZLE ?
-* fn4 = 0b0001 - FP MV-SWIZZLE ?
-* fn4 = 0bNN10 - INT MV-X, NN=elwidth (default/8/16/32)
-* fn4 = 0bNN11 - FP MV-X NN=elwidth (default/8/16/32)
+* fn4 = 0b0000 - MV-SWIZZLE
+* fn4 = 0bNN01 - MV-X, NN=elwidth (default/8/16/32)
+* fn4 = 0bNN11 - MV-X.SUBVL, NN=elwidth (default/8/16/32)
  
  swizzle (only active on SV or P48/P64 when SUBVL!=0):
  
@@ -27,6 +29,24 @@ swizzle (only active on SV or P48/P64 when SUBVL!=0):
  |   w |   z |   y |   x |
  +-----+-----+-----+-----+
  
+MV.X has two modes: SUBVL mode and normal mode. SUBVL mode applies the element offsets only within a SUBVL inner loop, which can be used for transposition.
+
+::
+
+  for i in range(VL):
+     for j in range(SUBVL):
+        regs[rd] = regs[rd+regs[rs+j]]
+
+Normal mode will apply the element offsets incrementally, using a running offset index k (initially 0) that advances on every inner-loop iteration:
+
+::
+
+  for i in range(VL):
+     for j in range(SUBVL):
+        regs[rd] = regs[rd+regs[rs+k]]
+        k++
+
+
  Pseudocode for element width part of MV.X:
  
  ::
@@ -86,6 +106,13 @@ potential MV.X?  register-version of MV-swizzle?
  
  question: do we need a swizzle MV.X as well?
  
+MV.X with 3 operands
+====================
+
+regs[rd] = regs[rs1 + regs[rs2]]
+
+This is similar to LD/ST, with the same twin-predication rules.
+
  macro-op fusion
  ===============
  
@@ -139,3 +166,176 @@ output:
  | m10 m11 m12 m13 |
  | m20 m21 m22 m23 |
  | m30 m31 m32 m33 |
+
+<http://web.archive.org/web/20100111104515/http://www.randombit.net:80/bitbashing/programming/integer_matrix_transpose_in_sse2.html>
+
+
+::
+
+   __m128i T0 = _mm_unpacklo_epi32(I0, I1);
+   __m128i T1 = _mm_unpacklo_epi32(I2, I3);
+   __m128i T2 = _mm_unpackhi_epi32(I0, I1);
+   __m128i T3 = _mm_unpackhi_epi32(I2, I3);
+
+   /* Assigning transposed values back into I[0-3] */
+   I0 = _mm_unpacklo_epi64(T0, T1);
+   I1 = _mm_unpackhi_epi64(T0, T1);
+   I2 = _mm_unpacklo_epi64(T2, T3);
+   I3 = _mm_unpackhi_epi64(T2, T3);
+
+Transforms for DCT 
+==================
+
+<https://opencores.org/websvn/filedetails?repname=mpeg2fpga&path=%2Fmpeg2fpga%2Ftrunk%2Frtl%2Fmpeg2%2Fidct.v>
+
+Table to evaluate
+=================
+
+swizzle2 takes two vector arguments and interleaves their elements depending on a third argument (the swizzle selector).
+
++-----------+-------+-------+-------+-------+-------+------+
+|           | 31:27 | 26:25 | 24:20 | 19:15 | 14:12 | 11:7 |
++===========+=======+=======+=======+=======+=======+======+
+| swizzle2  | rs3   | 00    | rs2   | rs1   | 000   | rd   |
++-----------+-------+-------+-------+-------+-------+------+
+| fswizzle2 | rs3   | 01    | rs2   | rs1   | 000   | rd   |
++-----------+-------+-------+-------+-------+-------+------+
+| swizzle   | 0     | 10    | rs2   | rs1   | 000   | rd   |
++-----------+-------+-------+-------+-------+-------+------+
+| fswizzle  | 0     | 11    | rs2   | rs1   | 000   | rd   |
++-----------+-------+-------+-------+-------+-------+------+
+| swizzlei  | imm                   | rs1   | 001   | rd   |
++-----------+                       +-------+-------+------+
+| fswizzlei |                       | rs1   | 010   | rd   |
++-----------+-------+-------+-------+-------+-------+------+
+
+Matrix 4x4 Vector mul
+=====================
+
+::
+
+    pfscale,3 F2, F1, F10
+    pfscaleadd,2 F2, F1, F11, F2
+    pfscaleadd,1 F2, F1, F12, F2
+    pfscaleadd,0 F2, F1, F13, F2
+
+pfscale is a 4 vec mv.shuffle followed by a fmul. pfscaleadd is a 4 vec mv.shuffle followed by a fmac.
+
+In effect what this is doing is:
+
+::
+
+    fmul f2, f1.xxxx, f10
+    fmac f2, f1.yyyy, f11, f2
+    fmac f2, f1.zzzz, f12, f2
+    fmac f2, f1.wwww, f13, f2
+
+Here f2, f1 and f10-f13 are all vec4. The selected element of f1 (x, y, z or w) is held at a fixed index and copied to every lane, while the indices of the other vec4 operands progress as normal.
+
+Pseudocode
+==========
+
+Swizzle:
+
+::
+
+    pub trait SwizzleConstants: Copy + 'static { // element types that supply the constants table read by swizzle
+        const CONSTANTS: &'static [Self; 4]; // four constants; swizzle indexes this with the low 2 bits of a selector field whose bit 2 is set
+    }
+
+    impl SwizzleConstants for u8 { // constants table for u8 elements
+        const CONSTANTS: &'static [Self; 4] = &[0, 1, 0xFF, 0x7F]; // slots 0..3: 0, 1, all bits set (u8::MAX), 0x7F (127)
+    }
+
+    impl SwizzleConstants for u16 { // constants table for u16 elements
+        const CONSTANTS: &'static [Self; 4] = &[0, 1, 0xFFFF, 0x7FFF]; // slots 0..3: 0, 1, all bits set (u16::MAX), 0x7FFF (32767)
+    }
+
+    impl SwizzleConstants for f32 { // constants table for f32 elements
+        const CONSTANTS: &'static [Self; 4] = &[0.0, 1.0, -1.0, 0.5]; // NOTE(review): slots 2 and 3 are -1.0 and 0.5 here vs all-ones and 0x7F.. in the integer tables; presumably the same values under a signed/normalised reading - confirm
+    }
+
+    // impl for other types too...
+
+    pub fn swizzle<Elm, Selector>(
+        rd: &mut [Elm],
+        rs1: &[Elm],
+        rs2: &[Selector],
+        vl: usize,
+        destsubvl: usize,
+        srcsubvl: usize)
+    where
+        Elm: SwizzleConstants,
+        // Selector is a copyable type that can be converted into u64
+        Selector: Copy + Into<u64>,
+    {
+        const FIELD_SIZE: usize = 3;
+        const FIELD_MASK: u64 = 0b111;
+        for vindex in 0..vl {
+            let selector = rs2[vindex].into();
+            // selector's type is u64
+            if selector >> (FIELD_SIZE * destsubvl) != 0 {
+                // handle illegal instruction trap
+            }
+            for i in 0..destsubvl {
+                let mut sel_field = selector >> (FIELD_SIZE * i);
+                sel_field &= FIELD_MASK;
+                let src = if (sel_field & 0b100) == 0 {
+                    &rs1[(vindex * srcsubvl)..]
+                } else {
+                    SwizzleConstants::CONSTANTS
+                };
+                sel_field &= 0b11;
+                if sel_field as usize >= srcsubvl {
+                    // handle illegal instruction trap
+                }
+                let value = src[sel_field as usize];
+                rd[vindex * destsubvl + i] = value;
+            }
+        }
+    }
+
+Swizzle2:
+
+::
+
+    fn swizzle2<Elm, Selector>( // for each of vl groups, fills destsubvl elements of rd, each taken from rs1 or rs3 as chosen by rs2
+        rd: &mut [Elm], // destination; element i of group vindex is written at rd[vindex * destsubvl + i]
+        rs1: &[Elm], // source used when bit 2 of the selector field is set
+        rs2: &[Selector], // one packed selector per group, read as rs2[vindex]
+        rs3: &[Elm], // source used when bit 2 of the selector field is clear
+        vl: usize, // number of groups processed
+        destsubvl: usize, // elements written per group (and selector fields consumed)
+        srcsubvl: usize) // elements per source group; also the upper bound on the 2-bit index
+    where
+        // Elm is a copyable type
+        Elm: Copy,
+        // Selector is a copyable type that can be converted into u64
+        Selector: Copy + Into<u64>,
+    {
+        const FIELD_SIZE: usize = 3; // bits per selector field
+        const FIELD_MASK: u64 = 0b111; // isolates one selector field
+        for vindex in 0..vl {
+            let selector = rs2[vindex].into();
+            // selector's type is u64; it packs destsubvl fields of FIELD_SIZE bits each, field i starting at bit FIELD_SIZE * i
+            if selector >> (FIELD_SIZE * destsubvl) != 0 {
+                // handle illegal instruction trap (bits are set above the destsubvl fields in use)
+            }
+            for i in 0..destsubvl {
+                let mut sel_field = selector >> (FIELD_SIZE * i);
+                sel_field &= FIELD_MASK;
+                let src = if (sel_field & 0b100) != 0 { // bit 2 of the field picks the source vector
+                    rs1
+                } else {
+                    rs3
+                };
+                sel_field &= 0b11; // keep only the 2-bit element index
+                if sel_field as usize >= srcsubvl {
+                    // handle illegal instruction trap (index is outside the source group)
+                }
+                let value = src[vindex * srcsubvl + (sel_field as usize)]; // element sel_field of group vindex in the chosen source
+                rd[vindex * destsubvl + i] = value;
+            }
+        }
+    }
+