From 1361c489ba642cbf1395097f10b730edc146b81d Mon Sep 17 00:00:00 2001
From: Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Date: Sat, 26 Jan 2019 02:17:33 +0000
Subject: [PATCH] add to notes

---
 3d_gpu/microarchitecture.mdwn | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)
diff --git a/3d_gpu/microarchitecture.mdwn b/3d_gpu/microarchitecture.mdwn
index ebc7a057f..d26d89484 100644
--- a/3d_gpu/microarchitecture.mdwn
+++ b/3d_gpu/microarchitecture.mdwn
@@ -508,6 +508,40 @@ Register Prefixes <a name="prefixes" />
 | xxxxxxxxxxxxxxxx | xxxxxxxxxxxbbb11 | XXXXXXXXXXXXXXXX | XXXXXXXXX0111111 |
 </pre>
 
+# MVX and other reg-shuffling
+
+<pre>
+> Crucial strategic op missing is MVX:
+> regs[rd]= regs[regs[rs1]]
+>
+we could modify the definition slightly:
+for i in 0..VL {
+    let offset = regs[rs1 + i];
+    // we could also limit on out-of-range
+    assert!(offset < VL); // trap on fail
+    regs[rd + i] = regs[rs2 + offset];
+}
+
+The dependency matrix would have the instruction depend on everything from
+rs2 to rs2 + VL and we let the execution unit figure it out. for
+simplicity, we could extend the dependencies to a power of 2 or something.
+
+We should add some constrained swizzle instructions for the more
+pipeline-friendly cases. One that will be important is:
+for i in (0..VL) {
+    let i = i * 4;
+    let s1: [0; 4];
+    for j in 0..4 {
+        s1[j] = regs[rs1 + i + j];
+    }
+    for j in 0..4 {
+        regs[rd + i + j] = s1[(imm >> j * 2) & 0x3];
+    }
+}
+Another is matrix transpose for (2-4)x(2-4) matrices which we can implement
+as similar to a strided ld/st except for registers.
+</pre>
+
 # TLBs / Virtual Memory <a name="tlb" />
 
 ----
-- 
2.30.2