From ef6e968a5dd9ee03343ea251f1ea2525fb3d0688 Mon Sep 17 00:00:00 2001
From: lkcl <lkcl@web>
Date: Sat, 1 Apr 2023 17:56:04 +0100
Subject: [PATCH]

---
 openpower/sv/rfc/ls010.mdwn | 90 +++++++++++++++++++++++++++++++------
 1 file changed, 77 insertions(+), 13 deletions(-)

diff --git a/openpower/sv/rfc/ls010.mdwn b/openpower/sv/rfc/ls010.mdwn
index 1bf582569..bacec089a 100644
--- a/openpower/sv/rfc/ls010.mdwn
+++ b/openpower/sv/rfc/ls010.mdwn
@@ -144,15 +144,16 @@ and it is left to the reader to translate to MSB0 numbering.
 The Canonical specification for how element-sequential numbering and
 element-width overrides is defined is expressed in the following c
 structure, assuming a Little-Endian system, and naturally using LSB0
-numbering everywhere because the ANSI c specification is inherently LSB0:
+numbering everywhere because the ANSI c specification is inherently LSB0.
+Note the deliberate similarity to how VSX register elements are defined:
 
 ```
     #pragma pack
     typedef union {
-        uint8_t  b[]; // elwidth 8
-        uint16_t s[]; // elwidth 16
-        uint32_t i[]; // elwidth 32
-        uint64_t l[]; // elwidth 64
+        uint8_t  bytes[]; // elwidth 8
+        uint16_t hwords[]; // elwidth 16
+        uint32_t words[]; // elwidth 32
+        uint64_t dwords[]; // elwidth 64
         uint8_t actual_bytes[8];
     } el_reg_t;
 
@@ -160,18 +161,18 @@ numbering everywhere because the ANSI c specification is inherently LSB0:
 
     void get_register_element(el_reg_t* el, int gpr, int element, int width) {
         switch (width) {
-            case 64: el->l = int_regfile[gpr].l[element];
-            case 32: el->i = int_regfile[gpr].i[element];
-            case 16: el->s = int_regfile[gpr].s[element];
-            case 8 : el->b = int_regfile[gpr].b[element];
+            case 64: el->dwords = int_regfile[gpr].dwords[element];
+            case 32: el->words = int_regfile[gpr].words[element];
+            case 16: el->hwords = int_regfile[gpr].hwords[element];
+            case 8 : el->bytes = int_regfile[gpr].bytes[element];
         }
     }
     void set_register_element(el_reg_t* el, int gpr, int element, int width) {
         switch (width) {
-            case 64: int_regfile[gpr].l[element] = el->l;
-            case 32: int_regfile[gpr].i[element] = el->i;
-            case 16: int_regfile[gpr].s[element] = el->s;
-            case 8 : int_regfile[gpr].b[element] = el->b;
+            case 64: int_regfile[gpr].dwords[element] = el->dwords;
+            case 32: int_regfile[gpr].words[element] = el->words;
+            case 16: int_regfile[gpr].hwords[element] = el->hwords;
+            case 8 : int_regfile[gpr].bytes[element] = el->bytes;
         }
     }
 ```
@@ -206,6 +207,69 @@ write-enable line.  It is up to the Hardware Architect to then amortise
 as simultaneous non-overlapping Register File writes, to achieve High
 Performance designs.
 
+For a comparative data point the VSR Registers may be expressed in the
+same fashion. The c code below is a direct expression of Figure 97 in
+Power ISA Public v3.1 Book I Section 6.3 page 258, *after compensating for
+MSB0 numbering and adapting in full to LSB0 numbering and obeying LE
+ordering*.
+
+**Crucial to understanding why the subtraction from 1,3,7,15 is present
+is the fact that VSX Registers also number elements in MSB0 order**. SVP64
+very specifically numbers elements in **LSB0** order with the first
+element being at the **LSB** end of the register, where VSX places
+the numerically-lowest element at the **MSB** end of the register.
+
+```
+    #pragma pack
+    typedef union {
+        uint8_t  bytes[16]; // elwidth 8, QTY 16 FIXED total
+        uint16_t hwords[8]; // elwidth 16, QTY 8 FIXED total
+        uint32_t words[4]; // elwidth 32, QTY 4 FIXED total
+        uint64_t dwords[2]; // elwidth 64, QTY 2 FIXED total
+        uint8_t actual_bytes[16]; // totals 128-bit
+    } el_reg_t;
+
+    el_reg_t VSR_regfile[64];
+
+    static void check_num_elements(int elt, int width) {
+        switch (width) {
+            case 64: assert elt < 2;
+            case 32: assert elt < 4;
+            case 16: assert elt < 8;
+            case 8 : assert elt < 16;
+        }
+    }
+    void get_VSR_element(el_reg_t* el, int gpr, int elt, int width) {
+        check_num_elements(elt, width);
+        switch (width) {
+            case 64: el->dwords[1-elt] = VSR_regfile[gpr].dwords[1-elt];
+            case 32: el->words[3-elt] = VSR_regfile[gpr].words[3-elt];
+            case 16: el->hwords[7-elt] = VSR_regfile[gpr].hwords[7-elt];
+            case 8 : el->bytes[15-elt] = VSR_regfile[gpr].bytes[15-elt];
+        }
+    }
+    void set_VSR_element(el_reg_t* el, int gpr, int elt, int width) {
+        check_num_elements(elt, width);
+        switch (width) {
+            case 64: VSR_regfile[gpr].dwords[1-elt] = el->dwords[1-elt];
+            case 32: VSR_regfile[gpr].words[3-elt] = el->words[3-elt];
+            case 16: VSR_regfile[gpr].hwords[7-elt] = el->hwords[7-elt];
+            case 8 : VSR_regfile[gpr].bytes[15-elt] = el->bytes[15-elt];
+        }
+    }
+```
+
+For VSX Registers one key difference is that the overlay of different element
+widths is clearly a *bounded quantity*, whereas for Simple-V the elements are
+*unrestrained and permitted to flow into successive underlying Scalar registers*.
+This difference is absolutely critical to a full understanding of the entire
+Simple-V paradigm and why element-ordering, bit-numbering *and register numbering*
+are all so strictly defined.
+
+Implementations are not permitted to violate the Canonical definition: software
+will be critically relying on the wrapped (overflow) behaviour inherently
+implied from the unbounded c arrays.
+
 ## Scalar Identity Behaviour
 
 SVP64 is designed so that when the prefix is all zeros, and VL=1, no
-- 
2.30.2