more notes about scalar reduction

[libreriscv.git] / openpower / sv / 16_bit_compressed.mdwn
diff --git a/openpower/sv/16_bit_compressed.mdwn b/openpower/sv/16_bit_compressed.mdwn

index 741ed14d8478bc7ce44723cc025d6284c254d8c9..2a302ea5d7d9929ad1e50f3cc37bf47622964086 100644 (file)
--- a/openpower/sv/16_bit_compressed.mdwn
+++ b/openpower/sv/16_bit_compressed.mdwn
@@ -1,3 +1,5 @@
+[[!tag standards]]
+
  # 16 bit Compressed
  
  Similar to VLE (but without immediate-prefixing) this encoding is designed
@@ -111,7 +113,9 @@ to cross into or out of a function call.
  Thus it is the mandatory responsibility of the compiler to ensure that
  context returns to "v3.0B Standard" prior to entering a function call
  (responsibility of caller) and prior to exit from a function call
-(responsibility of callee).
+(responsibility of callee) by setting appropriate M and N bits.
+
+If, however, it is known to the compiler that certain static leaf node functions and their immediate callers will never, under any circumstances, be called by external ABI-compliant code, then of course the compiler may choose to write such static functions as it sees fit.
  
  Trap Handlers also take responsibility for saving and restoring of
  Compressed Mode state, just as they already take responsibility for
@@ -201,7 +205,7 @@ Major Opcodes)
      | 0 | 1234 | 567  8 | 9  | a b | c  | d e | f | enc
      | N | immf | Cmaj.m | fld1     | fld2     | M | 16b
      | 1 | immf | Cmaj.m | fld1     | imm      | 1 | 16b imm
-    | fd3      | 001.1  | S1 | fd1 | S2 | fd2 | M | 16b sub
+    | N | fd3  | 001.1  | S1 | fd1 | S2 | fd2 | M | 16b sub
      | N | fd4  | 111.m  | fld1     | fld2     | M | 16b LDST
  
  Notes:
@@ -238,7 +242,8 @@ instruction counts from objdump on /bin/bash:
      | 1 | 1  | 0 | sh2 | | 001.0 | RA   | sh  | 1 | srawi.
      | 1 | 1  | 1 |     | | 001.0 | 000  | imm | 1 | TBD
      | 1 | 1  | 1 | i2  | | 001.0 | RA!=0| imm | 1 | addis
-    | 1 |              | | 010.0 | 000  |     | 1 | TBD
+    | 1 | 0  | i2      | | 010.0 | 000  | imm | 1 | setvli
+    | 1 | 1  | i2      | | 010.0 | 000  | imm | 1 | setmvli
      | 1 | i2           | | 010.0 | RA!=0| imm | 1 | addi
      | 1 | 0  | i2      | | 010.1 | RA   | imm | 1 | cmpdi
      | 1 | 1  | i2      | | 010.1 | RA   | imm | 1 | cmpwi
@@ -294,10 +299,11 @@ is "nop"
  
  16 bit mode only:
  
+    | 0 | 1 | 234 | | 567.8  | 9  ab | c   de | f |
      | - | - | --- | | -----  | ----- | ------ | - |
      | 1 | 0   000 | | 000.0  | 0  00 | 0   00 | 0 | nop
-    | 1 | 1   000 | | 000.0  | 0  00 | 0   00 | 0 | attn
-    | 1 | nonzero | | 000.0  | 0  00 | 0   00 | 0 | TBD
+    | 1 | 0   000 | | 000.0  | 0  00 | 0   00 | 1 | nop
+    | N | 1   000 | | 000.0  | 0  00 | 0   00 | M | attn
  
  Notes:
  
@@ -321,10 +327,13 @@ In essence the 2 nops are needed due to there being 2 different C forms:
  
  ### Branch
  
+TODO: document that branching whilst using mode-switching bits (M/N) is perfectly well permitted, the caveat being: it is specifically and wholly the compiler/assembler writer's responsibility to obey ABI rules and ensure that, even with branches and returns, at no time is an incorrect mode entered or left that could result in any instruction being misinterpreted.
+
      | 16-bit mode | | 10-bit mode                 |
      | 0 | 1 | 234 | | 567.8  | 9  ab | c   de | f |
      | - | - | --- | | -----  | ----- | ------ | - |
      | N | offs2   | | 000.LK | offs!=0        | M | b, bl
+    | N |         | | 000.1  | 0  00 | 0   00 | M | TBD
      | 1 | offs2   | | 000.LK | BI    | BO1 oo | 1 | bc, bcl
      | N | BO3 BI3 | | 001.0  | LK BI | BO     | M | bclr, bclrl
  
@@ -355,21 +364,19 @@ In essence the 2 nops are needed due to there being 2 different C forms:
  
  Note: for 10-bit, ignore bits 0-4 (used by EXTNNN=Compressed)
  
-    | 16-bit mode    | | 10-bit mode             |
-    | 0   | 1  | 234 | | 567.8 | 9 a b | c d e | f |
-    | --- | -- | --- | | ----- | ----- | ----- | - |
-    | RA2 | SZ |  RB | | 001.1 | 1  RA | 0  RT | M | st
-    | RA2 | SZ |  RB | | 001.1 | 1  RA | 1  RT | M | fst
-    | N   | SZ |  RT | | 111.0 |  RA   |  RB   | M | ld
-    | N   | SZ |  RT | | 111.1 |  RA   |  RB   | M | fld
+    | 16-bit mode  | | 10-bit mode               |
+    | 0 | 1  | 234 | | 567.8 | 9 a b | c d e | f |
+    | - | -- | --- | | ----- | ----- | ----- | - |
+    | N | SZ |  RB | | 001.1 | 1  RA | 0  RT | M | st
+    | N | SZ |  RB | | 001.1 | 1  RA | 1  RT | M | fst
+    | N | SZ |  RT | | 111.0 |  RA   |  RB   | M | ld
+    | N | SZ |  RT | | 111.1 |  RA   |  RB   | M | fld
  
  * elwidth overrides can set different widths
  
  16 bit mode:
  
  * SZ=1 is 64 bit, SZ=0 is 32 bit
-* RA2 extends RA to 3 bits (MSB)
-* RT2 extends RT to 3 bits (MSB)
  
  10 bit mode:
  
@@ -427,9 +434,9 @@ Notes:
      | N | 0 |  RT | | 100.1 | RB  | RA!=0 | M | nand
      | N | 0 |  RT | | 101.0 | RB  | RA!=0 | M | or
      | N | 0 |  RT | | 101.1 | RB  | RA!=0 | M | nor/mr
-    | N | 0 |  RT | | 100.0 | RB  | 0 0 0 | M | extsw
+    | N | 0 |  RT | | 100.0 | RB  | 0 0 0 | M | popcnt
      | N | 0 |  RT | | 100.1 | RB  | 0 0 0 | M | cntlz
-    | N | 0 |  RT | | 101.0 | RB  | 0 0 0 | M | popcnt
+    | N | 0 |  RT | | 101.0 | RB  | 0 0 0 | M | extsw
      | N | 0 |  RT | | 101.1 | RB  | 0 0 0 | M | not
  
  16-bit mode only (note that bit 1 == 1):
@@ -440,9 +447,9 @@ Notes:
      | N | 1 |  RT | | 100.1 | RB  | RA!=0 | M | TBD
      | N | 1 |  RT | | 101.0 | RB  | RA!=0 | M | xor
      | N | 1 |  RT | | 101.1 | RB  | RA!=0 | M | eqv (xnor)
-    | N | 1 |  RT | | 100.0 | RB  | 0 0 0 | M | extsb
+    | N | 1 |  RT | | 100.0 | RB  | 0 0 0 | M | setvl.
      | N | 1 |  RT | | 100.1 | RB  | 0 0 0 | M | cnttz
-    | N | 1 |  RT | | 101.0 | RB  | 0 0 0 | M | TBD
+    | N | 1 |  RT | | 101.0 | RB  | 0 0 0 | M | extsb
      | N | 1 |  RT | | 101.1 | RB  | 0 0 0 | M | extsh
  
  10 bit mode:
@@ -487,10 +494,10 @@ Note here that elwidth overrides (SV Prefix) can be used to select FP16/32/64
  
  16 bit only, FP to INT convert (using C 0b001.1 subencoding)
  
-    | 0123 | 4 | | 567.8 | 9 ab | cde  | f |
-    | ---- | - | | ----- | ---- | ---- | - |
-    | 0010 | X | | 001.1 | 0 RA | Y RT | M | fp2int
-    | 0011 | X | | 001.1 | 0 RA | Y RT | M | int2fp
+    | 0 | 123 | 4 | | 567.8 | 9 ab | cde  | f |
+    | - | --- | - | | ----- | ---- | ---- | - |
+    | N | 101 | X | | 001.1 | 0 RA | Y RT | M | fp2int
+    | N | 110 | X | | 001.1 | 0 RA | Y RT | M | int2fp
  
  * X: signed=1, unsigned=0
  * Y: FP32=0, FP64=1
@@ -510,22 +517,22 @@ Note here that elwidth overrides (SV Prefix) can be used to select FP16/32/64
  10-bit or 16 bit:
  
      | 16-bit mode| | 10-bit mode            |
-    | 0123 | 4   | | 567.8 | 9 ab | cde | f |
-    | ---- | --- | | ----- | ---- | --- | - |
-    | 0000 | BF2 | | 001.1 | 0 BF | BFA | M | mcrf
+    | 0 | 123 | 4   | | 567.8 | 9 ab | cde | f |
+    | - | --- | --- | | ----- | ---- | --- | - |
+    | N | 000 | BF2 | | 001.1 | 0 BF | BFA | M | mcrf
  
  16-bit only:
  
-    | 0123 | 4   | | 567.8 | 9 ab | cde | f |
-    | ---- | --- | | ----- | ---- | --- | - |
-    | 0001 | BA2 | | 001.1 | 0 BA | BB  | M | crnor
-    | 0100 | BA2 | | 001.1 | 0 BA | BB  | M | crandc
-    | 0110 | BA2 | | 001.1 | 0 BA | BB  | M | crxor
-    | 0111 | BA2 | | 001.1 | 0 BA | BB  | M | crnand
-    | 1000 | BA2 | | 001.1 | 0 BA | BB  | M | crand
-    | 1001 | BA2 | | 001.1 | 0 BA | BB  | M | creqv
-    | 1101 | BA2 | | 001.1 | 0 BA | BB  | M | crorc
-    | 1110 | BA2 | | 001.1 | 0 BA | BB  | M | cror
+    | 0 | 1234 | | 567.8 | 9 ab | cde | f |
+    | - | ---- | | ----- | ---- | --- | - |
+    | N | 0010 | | 001.1 | 0 BA | BB  | M | crnor
+    | N | 0011 | | 001.1 | 0 BA | BB  | M | crandc
+    | N | 0100 | | 001.1 | 0 BA | BB  | M | crxor
+    | N | 0101 | | 001.1 | 0 BA | BB  | M | crnand
+    | N | 0110 | | 001.1 | 0 BA | BB  | M | crand
+    | N | 0111 | | 001.1 | 0 BA | BB  | M | creqv
+    | N | 1000 | | 001.1 | 0 BA | BB  | M | crorc
+    | N | 1001 | | 001.1 | 0 BA | BB  | M | cror
  
  Notes
  
@@ -558,28 +565,27 @@ space (when RA==0)
  
  **not available** in 10-bit mode, **only** in 16-bit mode:
  
-    | 0123 | 4 | | 567.8 | 9 ab | cde  | f |
-    | ---- | - | | ----- | ---- | ---- | - |
-    | 1111 | 0 | | 001.1 | 0 00 |  RT  | M | mtlr
-    | 1111 | 0 | | 001.1 | 0 01 |  RT  | M | mtctr
-    | 1111 | 0 | | 001.1 | 0 11 |  RT  | M | mtcr
-    | 1111 | 1 | | 001.1 | 0 00 |  RA  | M | mflr
-    | 1111 | 1 | | 001.1 | 0 01 |  RA  | M | mfctr
-    | 1111 | 1 | | 001.1 | 0 11 |  RA  | M | mfcr
+    | 0 | 1 | 234 | | 567.8 | 9 ab | cde  | f |
+    | - | ------- | | ----- | ---- | ---- | - |
+    | N | 1 | 111 | | 001.1 | 0 00 |  RT  | M | mtlr
+    | N | 1 | 111 | | 001.1 | 0 01 |  RT  | M | mtctr
+    | N | 1 | 111 | | 001.1 | 0 00 |  RA  | M | mflr
+    | N | 1 | 111 | | 001.1 | 0 01 |  RA  | M | mfctr
+    | N | 0 RA!=0 | | 000.0 | 0 00 |  000 | M | mtcr
+    | N | 1 RT!=0 | | 000.0 | 0 00 |  000 | M | mfcr
  
  ### Unallocated
  
-    | 0123 | 4 | | 567.8 | 9 ab | cde  | f |
-    | ---- | - | | ----- | ---- | ---- | - |
-    | 0101 |   | | 001.1 | 0    |      | M |
-    | 1010 |   | | 001.1 | 0    |      | M |
-    | 1011 |   | | 001.1 | 0    |      | M |
-    | 1100 |   | | 001.1 | 0    |      | M |
-    | 1111 |   | | 001.1 | 0 10 |      | M |
+16-bit only:
  
-## Other ideas (Attempt 2)
+    | 0 | 1 | 234 | | 567.8 | 9 ab | cde  | f |
+    | - | - | --- | | ----- | ---- | ---- | - |
+    | N | 1 | 111 | | 001.1 | 0 10 |      | M |
+    | N | 1 | 111 | | 001.1 | 0 11 |      | M |
  
-### 8-bit mode-switching instructions, odd addresses for C mode
+# Other ideas (Attempt 2)
+
+## 8-bit mode-switching instructions, odd addresses for C mode
  
  Drop the complexity of the 16-bit encoding further reduced to 10-bit,
  and use a single byte instead of two to switch between modes.  This
@@ -622,6 +628,14 @@ Tables explaining encoding:
      | .. bit | 16 bit          | 8nop   |
      | v3.0B standard 32 bit instruction |
  
+# Other ideas (v3)
+
+FSM state switching and mode switching deemed too complex.  Instead cut back to:
+
+1. 10bit only (actually, 11 bit)
+2. SV-Prefixed 16bit only (aka SV-C32)
+
+Each will be entirely different, which is a huge amount of work.
  
  # TODO 
  
@@ -633,6 +647,7 @@ Tables explaining encoding:
    objdump raw parsing
  * finally do full opcode allocation
  * rerun objdump compression ratio estimates
+* check in FSM, for "return to v3.0B then 16bit", whether it is ok for the v3.0B instruction to be a 10bit Compressed.  Should this be ignored and carry on? Should a trap occur?
  
  ### Use 2- rather than 3-register opcodes
  
@@ -742,7 +757,7 @@ By eliminating such 16+16 (actually, 32bit conflation) tricks outlined in (2), C
  
  ## Compressed Decoder Phases
  
-Phase 1 (stage 1 of a 2-stage pipelined decoder) is defined as the minimum necessary FSM required to determine instruction length and mode.  This is implemented with the absolute bare minimum of gates and is based on the 6 encodings involving N, M and EXTNNN
+Phase 1 (stage 1 of a 2-stage pipelined decoder) is defined as the minimum necessary FSM required to determine instruction length and mode.  This is implemented with the absolute bare minimum of gates and is based on the 6 encodings involving N, M and EXTNNN (see table, below)
  
  Phase 2 (stage 2 of a 2-stage pipelined decoder) is defined as the "full decoder" that includes taking into account the length and mode from Phase 1.  Given a 2-stage pipelined decoder it is categorically **impossible** for Phase 2 to go backwards in time and affect the decisions made in Phase 1.
  
@@ -764,8 +779,8 @@ Table: Reminder of the 6 16-bit encodings:
  The Phase 1 length/mode identification takes into account only 3 pieces of information:
  
  * extc_id: insn[0:4] == EXTNNN (Compressed)
-* M: insn[0]
-* N: insn[15]
+* N: insn[0]
+* M: insn[15]
  
  The Phase 1 length/mode produces the following lengths/modes:
  
@@ -793,7 +808,7 @@ Pseudocode:
  
      elif previ.mode == 10bit:
           # previous was 10bit, move to v3.0B or 16bit?
-        if N == 0:
+        if M == 0:
               next.length = 32
               nexti.mode = v3.0B
           else:
@@ -803,10 +818,10 @@ Pseudocode:
  
      elif previ.mode == 16bit:
            # previous was 16bit, stay there or move?
-          if N == 0:
+          if M == 0:
               # back to v3.0B
               next.length = 32
-             if M == 1:
+             if N == 1:
                    # ... but only for 1 insn
                    nexti.mode = v3.0B_then_16bit
               else: