whitespace cleanup

[libreriscv.git] / simple_v_extension / appendix.mdwn
diff --git a/simple_v_extension/appendix.mdwn b/simple_v_extension/appendix.mdwn

index 79b9383f5af131e5a1f1af1a794a072ef52b793e..1bd457b6b9e63e8b602833bc80368dcbc403847f 100644 (file)
--- a/simple_v_extension/appendix.mdwn
+++ b/simple_v_extension/appendix.mdwn
@@ -2,12 +2,12 @@
  
  * Copyright (C) 2017, 2018, 2019 Luke Kenneth Casson Leighton
  * Status: DRAFTv0.6
-* Last edited: 28 jun 2019
+* Last edited: 30 jun 2019
  * main spec [[specification]]
  
  [[!toc ]]
  
-# Fail-on-first modes
+# Fail-on-first modes <a name="ffirst"></a>
  
  Fail-on-first data dependency has different behaviour for traps than
  for conditional testing.  "Conditional" is taken to mean "anything
@@ -15,10 +15,10 @@ that is zero", however with traps, the first element has to
  be given the opportunity to throw the exact same trap that would
  be thrown if this were a scalar operation (when VL=1).
  
-Note that implementors are required to mutually exclusively choose one or
-the other modes: an instruction is **not** permitted to fail on a trap
-*and* fail a conditional test.  This advice to custom opcode writers as
-well as future extension writers.
+Note that implementors are required to choose, mutually exclusively,
+one mode or the other: an instruction is **not** permitted to fail on a
+trap *and* fail a conditional test at the same time.  This advice applies
+to custom opcode writers as well as future extension writers.
  
  ## Fail-on-first traps
  
@@ -36,6 +36,9 @@ VL is set to include elements that did not take the trap *and* includes
  the elements that were predicated (masked) out (not tested up to the
  point where the trap occurred).
  
+Unlike conditional tests, "fail-on-first trap" instruction behaviour is
+unaltered by setting zero or non-zero predication mode.
+
  If SUBVL is being used (SUBVL!=1), the first *sub-group* of elements
  will cause a trap as normal (as if ffirst is not set); subsequently, the
  trap must not occur in the *sub-group* of elements.  SUBVL will **NOT**
@@ -54,6 +57,13 @@ result being zero (or other "fail" condition).  VL is set to the number
  of elements that were (sequentially) processed before the fail-condition
  was encountered.
  
+Unlike trap fail-on-first, fail-on-first conditional testing behaviour
+responds to changes in the zero or non-zero predication mode.  Whilst
+in non-zeroing mode, masked-out elements are simply not tested (and
+thus considered "never to fail"), in zeroing mode, masked-out elements
+may be viewed as *always* (unconditionally) failing.  This effectively
+turns VL into something akin to a software-controlled loop.
+
  Note that just as with traps, if SUBVL!=1, the first trap in the
  *sub-group* will cause the processing to end, and, even if there were
  elements within the *sub-group* that passed the test, that sub-group is
@@ -252,7 +262,7 @@ complex), this becomes:
      rd = get_pred_val(I/F==INT, rs2); # this may not exist
  
      if not exists(rd) or zeroing:
-        result = 0
+        result = (1<<VL)-1 # all 1s
      else
          result = preg[rd]
  
@@ -1507,40 +1517,39 @@ of total length 128 bit given that XLEN is now 128.
  TODO evaluate strncpy and strlen
  <https://groups.google.com/forum/m/#!msg/comp.arch/bGBeaNjAKvc/_vbqyxTUAQAJ>
  
-## strncpy
+## strncpy <a name="strncpy"></a>
  
-RVV version: <a name="strncpy"></>
+RVV version:
  
-    strncpy: 
-        mv a3, a0               # Copy dst 
-    loop: 
-        setvli x0, a2, vint8    # Vectors of bytes. 
-        vlbff.v v1, (a1)        # Get src bytes 
-        vseq.vi v0, v1, 0       # Flag zero bytes 
-        vmfirst a4, v0          # Zero found? 
+    strncpy:
+        mv a3, a0               # Copy dst
+    loop:
+        setvli x0, a2, vint8    # Vectors of bytes.
+        vlbff.v v1, (a1)        # Get src bytes
+        vseq.vi v0, v1, 0       # Flag zero bytes
+        vmfirst a4, v0          # Zero found?
          vmsif.v v0, v0          # Set mask up to and including zero byte.
-        vsb.v v1, (a3), v0.t    # Write out bytes 
-        bgez a4, exit           # Done 
-        csrr t1, vl             # Get number of bytes fetched 
-        add a1, a1, t1          # Bump src pointer 
-        sub a2, a2, t1          # Decrement count. 
-        add a3, a3, t1          # Bump dst pointer 
-        bnez a2, loop           # Anymore? 
+        vsb.v v1, (a3), v0.t    # Write out bytes
+        bgez a4, exit           # Done
+        csrr t1, vl             # Get number of bytes fetched
+        add a1, a1, t1          # Bump src pointer
+        sub a2, a2, t1          # Decrement count.
+        add a3, a3, t1          # Bump dst pointer
+        bnez a2, loop           # Anymore?
  
-    exit: 
-        ret 
+    exit:
+        ret
  
  SV version (WIP):
  
      strncpy:
          mv a3, a0
-        SETMVLI 8 # set max vector to 8
          RegCSR[a3] = 8bit, a3, scalar
          RegCSR[a1] = 8bit, a1, scalar
          RegCSR[t0] = 8bit, t0, vector
          PredTb[t0] = ffirst, x0, inv
      loop:
-        SETVLI a2, t4 # t4 and VL now 1..8
+        SETVLI a2, t4, 8 # t4 and VL now 1..8 (MVL=8)
          ldb t0, (a1) # t0 fail first mode
          bne t0, x0, allnonzero # still ff
          # VL points to last nonzero
@@ -1552,10 +1561,10 @@ SV version (WIP):
      allnonzero:
          stb t0, (a3)    # VL legal range
          GETVL t4        # from bne tests
-        add a1, a1, t4  # Bump src pointer 
-        sub a2, a2, t4  # Decrement count. 
-        add a3, a3, t4  # Bump dst pointer 
-        bnez a2, loop   # Anymore? 
+        add a1, a1, t4  # Bump src pointer
+        sub a2, a2, t4  # Decrement count.
+        add a3, a3, t4  # Bump dst pointer
+        bnez a2, loop   # Anymore?
      exit:
          ret
  
@@ -1583,7 +1592,7 @@ Notes:
    into t0 (could contain zeros).
  * bne t0 x0 tests up to the NEW VL for nonzero, vector t0 against
    scalar x0
-* however as t0 is in ffirst mode, the first fail wil ALSO stop the
+* however as t0 is in ffirst mode, the first fail will ALSO stop the
    compares, and reduce VL as well
  * the branch only goes to allnonzero if all tests succeed
  * if it did not, we can safely increment VL by 1 (using a4) to include
@@ -1599,20 +1608,32 @@ Notes:
  
  RVV version:
  
-        mv a3, a0             # Save start 
-    loop: 
+        mv a3, a0             # Save start
+    loop:
          setvli a1, x0, vint8  # byte vec, x0 (Zero reg) => use max hardware len
          vldbff.v v1, (a3)     # Get bytes
          csrr a1, vl           # Get bytes actually read e.g. if fault
-        vseq.vi v0, v1, 0     # Set v0[i] where v1[i] = 0 
+        vseq.vi v0, v1, 0     # Set v0[i] where v1[i] = 0
          add a3, a3, a1        # Bump pointer
          vmfirst a2, v0        # Find first set bit in mask, returns -1 if none
          bltz a2, loop         # Not found?
          add a0, a0, a1        # Sum start + bump
          add a3, a3, a2        # Add index of zero byte
          sub a0, a3, a0        # Subtract start address+bump
-        ret 
+        ret
  
  ## DAXPY
  
  [[!inline raw="yes" pages="simple_v_extension/daxpy_example" ]]
+
+Notes:
+
+* Setting MVL to 4 is just an example.  With enough space between the
+  FP regs, MVL may be set to larger values
+* VBLOCK header takes 16 bits; 8-bit mode may be used on the registers,
+  taking only another 16 bits; VBLOCK.SETVL requires 16 bits.  Total
+  overhead for use of VBLOCK: 48 bits (3 16-bit words).
+* All instructions except fmadd may use Compressed variants.  Total
+  number of 16-bit instruction words: 11.
+* Total: 14 16-bit words.  By contrast, RVV requires around 18 16-bit words.
+