From e2fd8dfd141a596242c7c71311c2daebae29d9cb Mon Sep 17 00:00:00 2001
From: Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Date: Wed, 26 Sep 2018 16:09:42 +0100
Subject: [PATCH] remembered that sv register offsets have to be
 loop-incremented separately

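the SV parallelism loop has to respect whether each *individual* register
is a vector or a scalar.  each operand (rd, rs1, rs2, rs3) therefore gets
its own loop offset, advanced by remap() each time that operand is
remapped as a vector, instead of all operands sharing one loop counter.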
---
 riscv/insn_template_sv.cc |  5 ++---
 riscv/sv.cc               | 10 ++++++++--
 riscv/sv_decode.h         | 21 +++++++++++++--------
 3 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/riscv/insn_template_sv.cc b/riscv/insn_template_sv.cc
index 222cde2..b8bad7a 100644
--- a/riscv/insn_template_sv.cc
+++ b/riscv/insn_template_sv.cc
@@ -11,12 +11,11 @@ reg_t FN(processor_t* p, insn_t s_insn, reg_t pc)
   // any registers that are marked as "vectorised"
   insn_bits_t bits = s_insn.bits();
 #ifndef USING_NOREGS
-  int voffs = 0;
   int vlen = 1;
   // need to know if register is used as float or int.
   // REGS_PATTERN is generated by id_regs.py (per opcode)
   unsigned int floatintmap = REGS_PATTERN;
-  sv_insn_t insn(bits, voffs, floatintmap);
+  sv_insn_t insn(bits, floatintmap);
   bool vectorop = false;
   reg_t predicate = 0;
   // identify which regs have had their CSR entries set as vectorised.
@@ -54,7 +53,7 @@ reg_t FN(processor_t* p, insn_t s_insn, reg_t pc)
   {
     // TODO: vlen = p->CSR(SIMPLEV_VL); // something like that...
   }
-  for (; voffs < vlen; voffs++)
+  for (int voffs=0; voffs < vlen; voffs++)
   {
       #include INCLUDEFILE
   }
diff --git a/riscv/sv.cc b/riscv/sv.cc
index e665f69..6be5872 100644
--- a/riscv/sv.cc
+++ b/riscv/sv.cc
@@ -45,7 +45,7 @@ bool sv_check_reg(bool intreg, uint64_t reg)
  * of SV.  it's "supposed" to "just" be a vectorisation API. it isn't:
  * it's quite a bit more.
  */
-uint64_t sv_insn_t::remap(uint64_t reg, bool intreg)
+uint64_t sv_insn_t::remap(uint64_t reg, bool intreg, int &voffs)
 {
   // okaay so first determine which map to use.  intreg is passed
   // in (ultimately) from id_regs.py's examination of the use of
@@ -83,6 +83,12 @@ uint64_t sv_insn_t::remap(uint64_t reg, bool intreg)
   // aaand now, as it's a "vector", FINALLY we can add on the loop-offset
   // which was passed in to the sv_insn_t constructor (by reference)
   // and, at last, we have "parallelism" a la contiguous registers.
-  return reg + this->voffs; // wheww :)
+  reg += voffs; // wheww :)
+
+  // however... before returning, we increment the loop-offset for
+  // this particular register, so that on the next loop the next
+  // contiguous register will be used.
+  voffs += 1;
+  return reg;
 }
 
diff --git a/riscv/sv_decode.h b/riscv/sv_decode.h
index 7faf48a..49a1678 100644
--- a/riscv/sv_decode.h
+++ b/riscv/sv_decode.h
@@ -14,18 +14,23 @@
 class sv_insn_t: public insn_t
 {
 public:
-  sv_insn_t(insn_bits_t bits, int& v, unsigned int f) :
-            insn_t(bits), voffs(v), fimap(f) {}
-  uint64_t rd () { return remap(insn_t::rd (), fimap & REG_RD); }
-  uint64_t rs1() { return remap(insn_t::rs1(), fimap & REG_RS1); }
-  uint64_t rs2() { return remap(insn_t::rs2(), fimap & REG_RS2); }
-  uint64_t rs3() { return remap(insn_t::rs3(), fimap & REG_RS3); }
+  sv_insn_t(insn_bits_t bits, unsigned int f) :
+            insn_t(bits), fimap(f),
+            offs_rd(0), offs_rs1(0),
+            offs_rs2(0), offs_rs3(0) {}
+  uint64_t rd () { return remap(insn_t::rd (), fimap & REG_RD , offs_rd); }
+  uint64_t rs1() { return remap(insn_t::rs1(), fimap & REG_RS1, offs_rs1); }
+  uint64_t rs2() { return remap(insn_t::rs2(), fimap & REG_RS2, offs_rs2); }
+  uint64_t rs3() { return remap(insn_t::rs3(), fimap & REG_RS3, offs_rs3); }
 private:
-  int &voffs;
   unsigned int fimap;
+  int offs_rd;
+  int offs_rs1;
+  int offs_rs2;
+  int offs_rs3;
   // remaps the register through the lookup table.
   // will need to take the current loop index/offset somehow
-  uint64_t remap(uint64_t reg, bool isint);
+  uint64_t remap(uint64_t reg, bool isint, int &offs);
 };
 
 #endif
-- 
2.30.2