Allow the formal engine to perform a same-cycle result in the ALU

[soc.git] / src / soc / fu / logical / main_stage.py
diff --git a/src/soc/fu/logical/main_stage.py b/src/soc/fu/logical/main_stage.py

index 8c3703e6f3b0ca872e45958c88a31756058ce6db..6a90395783e798165bd55576c769c9bf73144952 100644 (file)
--- a/src/soc/fu/logical/main_stage.py
+++ b/src/soc/fu/logical/main_stage.py
@@ -5,25 +5,21 @@
  # This module however should not gate the carry or overflow, that's up
  # to the output stage
  
+# Copyright (C) 2020 Michael Nolan <mtnolan2640@gmail.com>
+# Copyright (C) 2020, 2021 Luke Kenneth Casson Leighton <lkcl@lkcl.net>
+
  from nmigen import (Module, Signal, Cat, Repl, Mux, Const, Array)
  from nmutil.pipemodbase import PipeModBase
  from nmutil.clz import CLZ
  from soc.fu.logical.pipe_data import LogicalInputData
  from soc.fu.logical.bpermd import Bpermd
-from soc.fu.alu.pipe_data import ALUOutputData
-from ieee754.part.partsig import PartitionedSignal
-from soc.decoder.power_enums import InternalOp
-
-from soc.decoder.power_fields import DecodeFields
-from soc.decoder.power_fieldsn import SignalBitRange
+from soc.fu.logical.popcount import Popcount
+from soc.fu.logical.pipe_data import LogicalOutputData
+from ieee754.part.partsig import SimdSignal
+from openpower.decoder.power_enums import MicrOp
  
-
-def array_of(count, bitwidth):
-    res = []
-    for i in range(count):
-        res.append(Signal(bitwidth, reset_less=True,
-                          name=f"pop_{bitwidth}_{i}"))
-    return res
+from openpower.decoder.power_fields import DecodeFields
+from openpower.decoder.power_fieldsn import SignalBitRange
  
  
  class LogicalMainStage(PipeModBase):
@@ -36,86 +32,79 @@ class LogicalMainStage(PipeModBase):
          return LogicalInputData(self.pspec)
  
      def ospec(self):
-        return ALUOutputData(self.pspec)  # TODO: ALUIntermediateData
+        return LogicalOutputData(self.pspec)
  
      def elaborate(self, platform):
+        XLEN = self.pspec.XLEN
          m = Module()
          comb = m.d.comb
          op, a, b, o = self.i.ctx.op, self.i.a, self.i.b, self.o.o
  
+        comb += o.ok.eq(1) # overridden if no op activates
+
+        m.submodules.bpermd = bpermd = Bpermd(XLEN)
+        m.submodules.popcount = popcount = Popcount(XLEN)
+
          ##########################
          # main switch for logic ops: AND, OR, XOR, cmpb, popcount, parity, cntz, bpermd
  
          with m.Switch(op.insn_type):
  
-            ###### AND, OR, XOR #######
-            with m.Case(InternalOp.OP_AND):
-                comb += o.eq(a & b)
-            with m.Case(InternalOp.OP_OR):
-                comb += o.eq(a | b)
-            with m.Case(InternalOp.OP_XOR):
-                comb += o.eq(a ^ b)
+            ###################
+            ###### AND, OR, XOR  v3.0B p92-95
+
+            with m.Case(MicrOp.OP_AND):
+                comb += o.data.eq(a & b)
+            with m.Case(MicrOp.OP_OR):
+                comb += o.data.eq(a | b)
+            with m.Case(MicrOp.OP_XOR):
+                comb += o.data.eq(a ^ b)
+
+            ###################
+            ###### cmpb  v3.0B p97
  
-            ###### cmpb #######
-            with m.Case(InternalOp.OP_CMPB):
+            with m.Case(MicrOp.OP_CMPB):
                  l = []
                  for i in range(8):
                      slc = slice(i*8, (i+1)*8)
                      l.append(Repl(a[slc] == b[slc], 8))
-                comb += o.eq(Cat(*l))
-
-            ###### popcount #######
-            with m.Case(InternalOp.OP_POPCNT):
-                # starting from a, perform successive addition-reductions
-                # creating arrays big enough to store the sum, each time
-                pc = [a]
-                # QTY32 2-bit (to take 2x 1-bit sums) etc.
-                work = [(32, 2), (16, 3), (8, 4), (4, 5), (2, 6), (1, 7)]
-                for l, b in work:
-                    pc.append(array_of(l, b))
-                pc8 = pc[3]     # array of 8 8-bit counts (popcntb)
-                pc32 = pc[5]    # array of 2 32-bit counts (popcntw)
-                popcnt = pc[-1]  # array of 1 64-bit count (popcntd)
-                # cascade-tree of adds
-                for idx, (l, b) in enumerate(work):
-                    for i in range(l):
-                        stt, end = i*2, i*2+1
-                        src, dst = pc[idx], pc[idx+1]
-                        comb += dst[i].eq(Cat(src[stt], Const(0, 1)) +
-                                          Cat(src[end], Const(0, 1)))
-                # decode operation length
-                with m.If(op.data_len == 1):
-                    # popcntb - pack 8x 4-bit answers into output
-                    for i in range(8):
-                        comb += o[i*8:(i+1)*8].eq(pc8[i])
-                with m.Elif(op.data_len == 4):
-                    # popcntw - pack 2x 5-bit answers into output
-                    for i in range(2):
-                        comb += o[i*32:(i+1)*32].eq(pc32[i])
-                with m.Else():
-                    # popcntd - put 1x 6-bit answer into output
-                    comb += o.eq(popcnt[0])
+                comb += o.data.eq(Cat(*l))
+
+            ###################
+            ###### popcount v3.0B p97, p98
+
+            with m.Case(MicrOp.OP_POPCNT):
+                comb += popcount.a.eq(a)
+                comb += popcount.b.eq(b)
+                comb += popcount.data_len.eq(op.data_len)
+                comb += o.data.eq(popcount.o)
+
+            ###################
+            ###### parity v3.0B p98
  
-            ###### parity #######
-            with m.Case(InternalOp.OP_PRTY):
+            with m.Case(MicrOp.OP_PRTY):
                  # strange instruction which XORs together the LSBs of each byte
                  par0 = Signal(reset_less=True)
                  par1 = Signal(reset_less=True)
                  comb += par0.eq(Cat(a[0], a[8], a[16], a[24]).xor())
-                comb += par1.eq(Cat(a[32], a[40], a[48], a[56]).xor())
+                if XLEN == 64:
+                    comb += par1.eq(Cat(a[32], a[40], a[48], a[56]).xor())
                  with m.If(op.data_len[3] == 1):
-                    comb += o.eq(par0 ^ par1)
+                    comb += o.data.eq(par0 ^ par1)
                  with m.Else():
                      comb += o[0].eq(par0)
-                    comb += o[32].eq(par1)
+                    if XLEN == 64:
+                        comb += o[32].eq(par1)
  
-            ###### cntlz #######
-            with m.Case(InternalOp.OP_CNTZ):
+            ###################
+            ###### cntlz / cnttz (count leading / trailing zeros) v3.0B p99
+
+            with m.Case(MicrOp.OP_CNTZ):
                  XO = self.fields.FormX.XO[0:-1]
                  count_right = Signal(reset_less=True)
                  comb += count_right.eq(XO[-1])
  
-                cntz_i = Signal(64, reset_less=True)
+                cntz_i = Signal(XLEN, reset_less=True)
                  a32 = Signal(32, reset_less=True)
                  comb += a32.eq(a[0:32])
  
@@ -124,16 +113,20 @@ class LogicalMainStage(PipeModBase):
                  with m.Else():
                      comb += cntz_i.eq(Mux(count_right, a[::-1], a))
  
-                m.submodules.clz = clz = CLZ(64)
+                m.submodules.clz = clz = CLZ(XLEN)
                  comb += clz.sig_in.eq(cntz_i)
-                comb += o.eq(Mux(op.is_32bit, clz.lz-32, clz.lz))
+                comb += o.data.eq(Mux(op.is_32bit, clz.lz-32, clz.lz))
+
+            ###################
+            ###### bpermd v3.0B p100
  
-            ###### bpermd #######
-            with m.Case(InternalOp.OP_BPERM):
-                m.submodules.bpermd = bpermd = Bpermd(64)
+            with m.Case(MicrOp.OP_BPERM):
                  comb += bpermd.rs.eq(a)
                  comb += bpermd.rb.eq(b)
-                comb += o.eq(bpermd.ra)
+                comb += o.data.eq(bpermd.ra)
+
+            with m.Default():
+                comb += o.ok.eq(0)
  
          ###### sticky overflow and context, both pass-through #####