From 1a920356dc0920b1b7476367ad3353a78a960fc0 Mon Sep 17 00:00:00 2001
From: Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Date: Sun, 30 Jun 2019 07:32:39 +0100
Subject: [PATCH] fail-on-first mode interacts with zeroing

---
 simple_v_extension/appendix.mdwn      | 58 ++++++++++++++++-----------
 simple_v_extension/specification.mdwn | 14 +++++--
 2 files changed, 44 insertions(+), 28 deletions(-)
diff --git a/simple_v_extension/appendix.mdwn b/simple_v_extension/appendix.mdwn
index 426ff2247..c0d767f9a 100644
--- a/simple_v_extension/appendix.mdwn
+++ b/simple_v_extension/appendix.mdwn
@@ -36,6 +36,9 @@ VL is set to include elements that did not take the trap *and* includes
 the elements that were predicated (masked) out (not tested up to the
 point where the trap occurred).
 
+Unlike conditional tests, "fail-on-first trap" instruction behaviour is
+unaltered by setting zeroing or non-zeroing predication mode.
+
 If SUBVL is being used (SUBVL!=1), the first *sub-group* of elements
 will cause a trap as normal (as if ffirst is not set); subsequently, the
 trap must not occur in the *sub-group* of elements.  SUBVL will **NOT**
@@ -54,6 +57,13 @@ result being zero (or other "fail" condition).  VL is set to the number
 of elements that were (sequentially) processed before the fail-condition
 was encountered.
 
+Unlike trap fail-on-first, fail-on-first conditional testing behaviour
+responds to changes in the zeroing or non-zeroing predication mode.  In
+non-zeroing mode, masked-out elements are simply not tested (and are
+thus considered "never to fail"), whereas in zeroing mode, masked-out
+elements may be viewed as *always* (unconditionally) failing.  This
+effectively turns VL into something akin to a software-controlled loop.
+
 Note that just as with traps, if SUBVL!=1, the first trap in the
 *sub-group* will cause the processing to end, and, even if there were
 elements within the *sub-group* that passed the test, that sub-group is
@@ -1511,24 +1521,24 @@ TODO evaluate strncpy and strlen
 
 RVV version: <a name="strncpy"></>
 
-    strncpy: 
-        mv a3, a0               # Copy dst 
-    loop: 
-        setvli x0, a2, vint8    # Vectors of bytes. 
-        vlbff.v v1, (a1)        # Get src bytes 
-        vseq.vi v0, v1, 0       # Flag zero bytes 
-        vmfirst a4, v0          # Zero found? 
+    strncpy:
+        mv a3, a0               # Copy dst
+    loop:
+        setvli x0, a2, vint8    # Vectors of bytes.
+        vlbff.v v1, (a1)        # Get src bytes
+        vseq.vi v0, v1, 0       # Flag zero bytes
+        vmfirst a4, v0          # Zero found?
         vmsif.v v0, v0          # Set mask up to and including zero byte.
-        vsb.v v1, (a3), v0.t    # Write out bytes 
-        bgez a4, exit           # Done 
-        csrr t1, vl             # Get number of bytes fetched 
-        add a1, a1, t1          # Bump src pointer 
-        sub a2, a2, t1          # Decrement count. 
-        add a3, a3, t1          # Bump dst pointer 
-        bnez a2, loop           # Anymore? 
+        vsb.v v1, (a3), v0.t    # Write out bytes
+        bgez a4, exit           # Done
+        csrr t1, vl             # Get number of bytes fetched
+        add a1, a1, t1          # Bump src pointer
+        sub a2, a2, t1          # Decrement count.
+        add a3, a3, t1          # Bump dst pointer
+        bnez a2, loop           # Anymore?
 
-    exit: 
-        ret 
+    exit:
+        ret
 
 SV version (WIP):
 
@@ -1552,10 +1562,10 @@ SV version (WIP):
     allnonzero:
         stb t0, (a3)    # VL legal range
         GETVL t4        # from bne tests
-        add a1, a1, t4  # Bump src pointer 
-        sub a2, a2, t4  # Decrement count. 
-        add a3, a3, t4  # Bump dst pointer 
-        bnez a2, loop   # Anymore? 
+        add a1, a1, t4  # Bump src pointer
+        sub a2, a2, t4  # Decrement count.
+        add a3, a3, t4  # Bump dst pointer
+        bnez a2, loop   # Anymore?
     exit:
         ret
 
@@ -1599,19 +1609,19 @@ Notes:
 
 RVV version:
 
-        mv a3, a0             # Save start 
-    loop: 
+        mv a3, a0             # Save start
+    loop:
         setvli a1, x0, vint8  # byte vec, x0 (Zero reg) => use max hardware len
         vldbff.v v1, (a3)     # Get bytes
         csrr a1, vl           # Get bytes actually read e.g. if fault
-        vseq.vi v0, v1, 0     # Set v0[i] where v1[i] = 0 
+        vseq.vi v0, v1, 0     # Set v0[i] where v1[i] = 0
         add a3, a3, a1        # Bump pointer
         vmfirst a2, v0        # Find first set bit in mask, returns -1 if none
         bltz a2, loop         # Not found?
         add a0, a0, a1        # Sum start + bump
         add a3, a3, a2        # Add index of zero byte
         sub a0, a3, a0        # Subtract start address+bump
-        ret 
+        ret
 
 ## DAXPY
 
diff --git a/simple_v_extension/specification.mdwn b/simple_v_extension/specification.mdwn
index 181b40a2e..5ac2c8bda 100644
--- a/simple_v_extension/specification.mdwn
+++ b/simple_v_extension/specification.mdwn
@@ -663,11 +663,17 @@ The other variant is comparisons such as FEQ (or the augmented behaviour
 of Branch), and any operation that returns a result of zero (whether
 integer or floating-point).  In the FP case, this includes negative-zero.
 
-Note that the execution order must "appear" to be sequential for ffirst
-mode to work correctly.  An in-order architecture must execute the element
+ffirst interacts with zeroing and non-zeroing predication.  In non-zeroing
+mode, masked-out operations are simply excluded from testing (can never
+fail).  However for fail-comparisons (not faults) in zeroing mode, the
+result of a masked-out element is zero: this *always* "fails", thus ffirst
+will always terminate on the very first masked-out element.
+
+Note that for ffirst mode to work correctly, the execution order must "appear"
+to be sequential (in "program order").  An in-order architecture must execute the element
 operations in sequence, whilst an out-of-order architecture must *commit*
-the element operations in sequence (giving the appearance of in-order
-execution).
+the element operations in sequence and cancel speculatively-executed
+ones (giving the appearance of in-order execution).
 
 Note also, that if ffirst mode is needed without predication, a special
 "always-on" Predicate Table Entry may be constructed by setting
-- 
2.30.2