Keep the sequencer in the "done" state until ready_i is asserted

[soc.git] / src / soc / experiment / alu_hier.py
diff --git a/src/soc/experiment/alu_hier.py b/src/soc/experiment/alu_hier.py

index 0172625817333fa225df0311b8eb0aa794ba5c43..99ff39e47f287ff4125ea025c249ddfe4804c207 100644 (file)
--- a/src/soc/experiment/alu_hier.py
+++ b/src/soc/experiment/alu_hier.py
@@ -9,96 +9,28 @@ A "real" integer ALU would place the answers onto the output bus after
  only one cycle (sync)
  """
  
-from nmigen import Elaboratable, Signal, Module, Const, Mux
+from nmigen import Elaboratable, Signal, Module, Const, Mux, Array
  from nmigen.hdl.rec import Record, Layout
  from nmigen.cli import main
  from nmigen.cli import verilog, rtlil
  from nmigen.compat.sim import run_simulation
  
-from soc.decoder.power_enums import InternalOp, CryIn
+from soc.decoder.power_enums import InternalOp, Function, CryIn
+
+from soc.fu.alu.alu_input_record import CompALUOpSubset
+from soc.fu.cr.cr_input_record import CompCROpSubset
  
  import operator
  
  
-class CompALUOpSubset(Record):
-    """CompALUOpSubset
-
-    a copy of the relevant subset information from Decode2Execute1Type
-    needed for ALU operations.  use with eq_from_execute1 (below) to
-    grab subsets.
-    """
-    def __init__(self):
-        layout = (('insn_type', InternalOp),
-                  ('nia', 64),
-                  ('imm_data', Layout((("imm", 64), ("imm_ok", 1)))),
-                    #'cr = Signal(32, reset_less=True) # NO: this is from the CR SPR
-                    #'xerc = XerBits() # NO: this is from the XER SPR
-                  ('lk', 1),
-                  ('rc', Layout((("rc", 1), ("rc_ok", 1)))),
-                  ('oe', Layout((("oe", 1), ("oe_ok", 1)))),
-                  ('invert_a', 1),
-                  ('invert_out', 1),
-                  ('input_carry', CryIn),
-                  ('output_carry', 1),
-                  ('input_cr', 1),
-                  ('output_cr', 1),
-                  ('is_32bit', 1),
-                  ('is_signed', 1),
-                  ('byte_reverse', 1),
-                  ('sign_extend', 1))
-
-        Record.__init__(self, Layout(layout))
-
-        # grrr.  Record does not have kwargs
-        self.insn_type.reset_less = True
-        self.nia.reset_less = True
-        #self.cr = Signal(32, reset_less = True
-        #self.xerc = XerBits(
-        self.lk.reset_less = True
-        self.invert_a.reset_less = True
-        self.invert_out.reset_less = True
-        self.input_carry.reset_less = True
-        self.output_carry.reset_less = True
-        self.input_cr.reset_less = True
-        self.output_cr.reset_less = True
-        self.is_32bit.reset_less = True
-        self.is_signed.reset_less = True
-        self.byte_reverse.reset_less = True
-        self.sign_extend.reset_less = True
-
-    def eq_from_execute1(self, other):
-        """ use this to copy in from Decode2Execute1Type
-        """
-        res = []
-        for fname, sig in self.fields.items():
-            eqfrom = other.fields[fname]
-            res.append(sig.eq(eqfrom)
-        return res
  
-    def ports(self):
-        return [self.insn_type,
-                self.nia,
-                #self.cr,
-                #self.xerc,
-                self.lk,
-                self.invert_a,
-                self.invert_out,
-                self.input_carry,
-                self.output_carry,
-                self.input_cr,
-                self.output_cr,
-                self.is_32bit,
-                self.is_signed,
-                self.byte_reverse,
-                self.sign_extend,
-        ]
  
  class Adder(Elaboratable):
      def __init__(self, width):
          self.invert_a = Signal()
          self.a   = Signal(width)
          self.b   = Signal(width)
-        self.o   = Signal(width)
+        self.o   = Signal(width, name="add_o")
  
      def elaborate(self, platform):
          m = Module()
@@ -113,7 +45,7 @@ class Subtractor(Elaboratable):
      def __init__(self, width):
          self.a   = Signal(width)
          self.b   = Signal(width)
-        self.o   = Signal(width)
+        self.o   = Signal(width, name="sub_o")
  
      def elaborate(self, platform):
          m = Module()
@@ -125,7 +57,7 @@ class Multiplier(Elaboratable):
      def __init__(self, width):
          self.a   = Signal(width)
          self.b   = Signal(width)
-        self.o   = Signal(width)
+        self.o   = Signal(width, name="mul_o")
  
      def elaborate(self, platform):
          m = Module()
@@ -138,7 +70,7 @@ class Shifter(Elaboratable):
          self.width = width
          self.a   = Signal(width)
          self.b   = Signal(width)
-        self.o   = Signal(width)
+        self.o   = Signal(width, name="shf_o")
  
      def elaborate(self, platform):
          m = Module()
@@ -147,32 +79,127 @@ class Shifter(Elaboratable):
          m.d.comb += self.o.eq(self.a >> btrunc)
          return m
  
+class Dummy:
+    pass
+
+
+class DummyALU(Elaboratable):
+    def __init__(self, width):
+        self.p = Dummy() # make look like nmutil pipeline API
+        self.p.data_i = Dummy()
+        self.p.data_i.ctx = Dummy()
+        self.n = Dummy() # make look like nmutil pipeline API
+        self.n.data_o = Dummy()
+        self.p.valid_i = Signal()
+        self.p.ready_o = Signal()
+        self.n.ready_i = Signal()
+        self.n.valid_o = Signal()
+        self.counter   = Signal(4)
+        self.op  = CompCROpSubset()
+        i = []
+        i.append(Signal(width, name="i1"))
+        i.append(Signal(width, name="i2"))
+        i.append(Signal(width, name="i3"))
+        self.i = Array(i)
+        self.a, self.b, self.c = i[0], i[1], i[2]
+        self.out = Array([Signal(width, name="alu_o")])
+        self.o = self.out[0]
+        self.width = width
+        # more "look like nmutil pipeline API"
+        self.p.data_i.ctx.op = self.op
+        self.p.data_i.a = self.a
+        self.p.data_i.b = self.b
+        self.p.data_i.c = self.c
+        self.n.data_o.o = self.o
+
+    def elaborate(self, platform):
+        m = Module()
+
+        go_now = Signal(reset_less=True) # testing no-delay ALU
+
+        with m.If(self.p.valid_i):
+            # input is valid. next check, if we already said "ready" or not
+            with m.If(~self.p.ready_o):
+                # we didn't say "ready" yet, so say so and initialise
+                m.d.sync += self.p.ready_o.eq(1)
+
+                m.d.sync += self.o.eq(self.a)
+                m.d.comb += go_now.eq(1)
+                m.d.sync += self.counter.eq(1)
+
+        with m.Else():
+            # input says no longer valid, so drop ready as well.
+            # a "proper" ALU would have had to sync in the opcode and a/b ops
+            m.d.sync += self.p.ready_o.eq(0)
+
+        # ok so the counter's running: when it gets to 1, fire the output
+        with m.If((self.counter == 1) | go_now):
+            # set the output as valid if the recipient is ready for it
+            m.d.sync += self.n.valid_o.eq(1)
+        with m.If(self.n.ready_i & self.n.valid_o):
+            m.d.sync += self.n.valid_o.eq(0)
+            # recipient said it was ready: reset back to known-good.
+            m.d.sync += self.counter.eq(0) # reset the counter
+            m.d.sync += self.o.eq(0) # clear the output for tidiness sake
+
+        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
+        with m.If(self.counter > 1):
+            m.d.sync += self.counter.eq(self.counter - 1)
+
+        return m
+
+    def __iter__(self):
+        yield from self.op.ports()
+        yield self.a
+        yield self.b
+        yield self.c
+        yield self.o
+
+    def ports(self):
+        return list(self)
+
  
  class ALU(Elaboratable):
      def __init__(self, width):
-        self.p_valid_i = Signal()
-        self.p_ready_o = Signal()
-        self.n_ready_i = Signal()
-        self.n_valid_o = Signal()
+        self.p = Dummy() # make look like nmutil pipeline API
+        self.p.data_i = Dummy()
+        self.p.data_i.ctx = Dummy()
+        self.n = Dummy() # make look like nmutil pipeline API
+        self.n.data_o = Dummy()
+        self.p.valid_i = Signal()
+        self.p.ready_o = Signal()
+        self.n.ready_i = Signal()
+        self.n.valid_o = Signal()
          self.counter   = Signal(4)
-        self.op  = CompALUOpSubset()
-        self.a   = Signal(width)
-        self.b   = Signal(width)
-        self.o   = Signal(width)
+        self.op = CompALUOpSubset(name="op")
+        i = []
+        i.append(Signal(width, name="i1"))
+        i.append(Signal(width, name="i2"))
+        self.i = Array(i)
+        self.a, self.b = i[0], i[1]
+        self.out = Array([Signal(width, name="alu_o")])
+        self.o = self.out[0]
          self.width = width
+        # more "look like nmutil pipeline API"
+        self.p.data_i.ctx.op = self.op
+        self.p.data_i.a = self.a
+        self.p.data_i.b = self.b
+        self.n.data_o.o = self.o
  
      def elaborate(self, platform):
          m = Module()
          add = Adder(self.width)
          mul = Multiplier(self.width)
          shf = Shifter(self.width)
+        sub = Subtractor(self.width)
  
          m.submodules.add = add
          m.submodules.mul = mul
          m.submodules.shf = shf
+        m.submodules.sub = sub
  
          # really should not activate absolutely all ALU inputs like this
-        for mod in [add, mul, shf]:
+        for mod in [add, mul, shf, sub]:
              m.d.comb += [
                  mod.a.eq(self.a),
                  mod.b.eq(self.b),
@@ -183,56 +210,67 @@ class ALU(Elaboratable):
  
          go_now = Signal(reset_less=True) # testing no-delay ALU
  
-        with m.If(self.p_valid_i):
-            # input is valid. next check, if we already said "ready" or not
-            with m.If(~self.p_ready_o):
-                # we didn't say "ready" yet, so say so and initialise
-                m.d.sync += self.p_ready_o.eq(1)
+        # ALU sequencer is idle when the count is zero
+        alu_idle = Signal(reset_less=True)
+        m.d.comb += alu_idle.eq(self.counter == 0)
+
+        # ALU sequencer is done when the count is one
+        alu_done = Signal(reset_less=True)
+        m.d.comb += alu_done.eq(self.counter == 1)
+
+        # select handshake handling according to ALU type
+        with m.If(go_now):
+            # with a combinatorial, no-delay ALU, just pass through
+            # the handshake signals to the other side
+            m.d.comb += self.p.ready_o.eq(self.n.ready_i)
+            m.d.comb += self.n.valid_o.eq(self.p.valid_i)
+        with m.Else():
+            # sequential ALU handshake:
+            # ready_o responds to valid_i, but only if the ALU is idle
+            m.d.comb += self.p.ready_o.eq(self.p.valid_i & alu_idle)
+            # select the internally generated valid_o, above
+            m.d.comb += self.n.valid_o.eq(alu_done)
+
+        # hold the ALU result until ready_o is asserted
+        alu_r = Signal(self.width)
+
+        with m.If(alu_idle):
+            with m.If(self.p.valid_i):
  
                  # as this is a "fake" pipeline, just grab the output right now
                  with m.If(self.op.insn_type == InternalOp.OP_ADD):
-                    m.d.sync += self.o.eq(add.o)
+                    m.d.sync += alu_r.eq(add.o)
                  with m.Elif(self.op.insn_type == InternalOp.OP_MUL_L64):
-                    m.d.sync += self.o.eq(mul.o)
+                    m.d.sync += alu_r.eq(mul.o)
                  with m.Elif(self.op.insn_type == InternalOp.OP_SHR):
-                    m.d.sync += self.o.eq(shf.o)
-                # TODO: SUB
+                    m.d.sync += alu_r.eq(shf.o)
+                # SUB is zero-delay, no need to register
  
                  # NOTE: all of these are fake, just something to test
  
                  # MUL, to take 5 instructions
                  with m.If(self.op.insn_type == InternalOp.OP_MUL_L64):
                      m.d.sync += self.counter.eq(5)
-                # SHIFT to take 7
+                # SHIFT to take 1, straight away
                  with m.Elif(self.op.insn_type == InternalOp.OP_SHR):
-                    m.d.sync += self.counter.eq(7)
-                # ADD/SUB to take 2, straight away
-                with m.If(self.op.insn_type == InternalOp.OP_ADD):
+                    m.d.sync += self.counter.eq(1)
+                # ADD/SUB to take 3
+                with m.Elif(self.op.insn_type == InternalOp.OP_ADD):
                      m.d.sync += self.counter.eq(3)
-                # others to take 1, straight away
+                # others to take no delay
                  with m.Else():
                      m.d.comb += go_now.eq(1)
-                    m.d.sync += self.counter.eq(1)
-
-        with m.Else():
-            # input says no longer valid, so drop ready as well.
-            # a "proper" ALU would have had to sync in the opcode and a/b ops
-            m.d.sync += self.p_ready_o.eq(0)
-
-        # ok so the counter's running: when it gets to 1, fire the output
-        with m.If((self.counter == 1) | go_now):
-            # set the output as valid if the recipient is ready for it
-            m.d.sync += self.n_valid_o.eq(1)
-        with m.If(self.n_ready_i & self.n_valid_o):
-            m.d.sync += self.n_valid_o.eq(0)
-            # recipient said it was ready: reset back to known-good.
-            m.d.sync += self.counter.eq(0) # reset the counter
-            m.d.sync += self.o.eq(0) # clear the output for tidiness sake
  
-        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
-        with m.If(self.counter > 1):
+        with m.Elif(~alu_done | self.n.ready_i):
+            # decrement the counter while the ALU is neither idle nor finished
              m.d.sync += self.counter.eq(self.counter - 1)
  
+        # choose between zero-delay output, or registered
+        with m.If(go_now):
+            m.d.comb += self.o.eq(sub.o)
+        with m.Else():
+            m.d.comb += self.o.eq(alu_r)
+
          return m
  
      def __iter__(self):
@@ -260,15 +298,24 @@ class BranchOp(Elaboratable):
  
  class BranchALU(Elaboratable):
      def __init__(self, width):
-        self.p_valid_i = Signal()
-        self.p_ready_o = Signal()
-        self.n_ready_i = Signal()
-        self.n_valid_o = Signal()
+        self.p = Dummy() # make look like nmutil pipeline API
+        self.p.data_i = Dummy()
+        self.p.data_i.ctx = Dummy()
+        self.n = Dummy() # make look like nmutil pipeline API
+        self.n.data_o = Dummy()
+        self.p.valid_i = Signal()
+        self.p.ready_o = Signal()
+        self.n.ready_i = Signal()
+        self.n.valid_o = Signal()
          self.counter   = Signal(4)
          self.op  = Signal(2)
-        self.a   = Signal(width)
-        self.b   = Signal(width)
-        self.o   = Signal(width)
+        i = []
+        i.append(Signal(width, name="i1"))
+        i.append(Signal(width, name="i2"))
+        self.i = Array(i)
+        self.a, self.b = i[0], i[1]
+        self.out = Array([Signal(width)])
+        self.o = self.out[0]
          self.width = width
  
      def elaborate(self, platform):
@@ -289,11 +336,11 @@ class BranchALU(Elaboratable):
              ]
  
          go_now = Signal(reset_less=True) # testing no-delay ALU
-        with m.If(self.p_valid_i):
+        with m.If(self.p.valid_i):
              # input is valid. next check, if we already said "ready" or not
-            with m.If(~self.p_ready_o):
+            with m.If(~self.p.ready_o):
                  # we didn't say "ready" yet, so say so and initialise
-                m.d.sync += self.p_ready_o.eq(1)
+                m.d.sync += self.p.ready_o.eq(1)
  
                  # as this is a "fake" pipeline, just grab the output right now
                  with m.Switch(self.op):
@@ -305,14 +352,14 @@ class BranchALU(Elaboratable):
          with m.Else():
              # input says no longer valid, so drop ready as well.
              # a "proper" ALU would have had to sync in the opcode and a/b ops
-            m.d.sync += self.p_ready_o.eq(0)
+            m.d.sync += self.p.ready_o.eq(0)
  
          # ok so the counter's running: when it gets to 1, fire the output
          with m.If((self.counter == 1) | go_now):
              # set the output as valid if the recipient is ready for it
-            m.d.sync += self.n_valid_o.eq(1)
-        with m.If(self.n_ready_i & self.n_valid_o):
-            m.d.sync += self.n_valid_o.eq(0)
+            m.d.sync += self.n.valid_o.eq(1)
+        with m.If(self.n.ready_i & self.n.valid_o):
+            m.d.sync += self.n.valid_o.eq(0)
              # recipient said it was ready: reset back to known-good.
              m.d.sync += self.counter.eq(0) # reset the counter
              m.d.sync += self.o.eq(0) # clear the output for tidiness sake
@@ -333,23 +380,55 @@ class BranchALU(Elaboratable):
          return list(self)
  
  def run_op(dut, a, b, op, inv_a=0):
+    from nmigen.back.pysim import Settle
      yield dut.a.eq(a)
      yield dut.b.eq(b)
      yield dut.op.insn_type.eq(op)
      yield dut.op.invert_a.eq(inv_a)
-    yield dut.n_ready_i.eq(0)
-    yield dut.p_valid_i.eq(1)
+    yield dut.n.ready_i.eq(0)
+    yield dut.p.valid_i.eq(1)
+
+    # if valid_o rose on the very first cycle, it is a
+    # zero-delay ALU
+    yield Settle()
+    vld = yield dut.n.valid_o
+    if vld:
+        # special case for zero-delay ALU
+        # we must raise ready_i first, since the combinatorial ALU doesn't
+        # have any storage, and doesn't dare to assert ready_o back to us
+        # until we accepted the output data
+        yield dut.n.ready_i.eq(1)
+        result = yield dut.o
+        yield
+        yield dut.p.valid_i.eq(0)
+        yield dut.n.ready_i.eq(0)
+        yield
+        return result
+
      yield
+
+    # wait for the ALU to accept our input data
      while True:
+        rdy = yield dut.p.ready_o
+        if rdy:
+            break
          yield
-        n_valid_o = yield dut.n_valid_o
-        if n_valid_o:
+
+    yield dut.p.valid_i.eq(0)
+
+    # wait for the ALU to present the output data
+    while True:
+        yield Settle()
+        vld = yield dut.n.valid_o
+        if vld:
              break
-    yield
+        yield
  
+    # latch the result and lower read_i
+    yield dut.n.ready_i.eq(1)
      result = yield dut.o
-    yield dut.p_valid_i.eq(0)
-    yield dut.n_ready_i.eq(0)
+    yield
+    yield dut.n.ready_i.eq(0)
      yield
  
      return result
@@ -368,10 +447,20 @@ def alu_sim(dut):
      print ("alu_sim add-inv", result)
      assert (result == 65533)
  
+    # test zero-delay ALU
+    # don't have OP_SUB, so use any other
+    result = yield from run_op(dut, 5, 3, InternalOp.OP_NOP)
+    print ("alu_sim sub", result)
+    assert (result == 2)
+
+    result = yield from run_op(dut, 13, 2, InternalOp.OP_SHR)
+    print ("alu_sim shr", result)
+    assert (result == 3)
+
  
  def test_alu():
      alu = ALU(width=16)
-    run_simulation(alu, alu_sim(alu), vcd_name='test_alusim.vcd')
+    run_simulation(alu, {"sync": alu_sim(alu)}, vcd_name='test_alusim.vcd')
  
      vl = rtlil.convert(alu, ports=alu.ports())
      with open("test_alu.il", "w") as f:
@@ -381,8 +470,8 @@ def test_alu():
  if __name__ == "__main__":
      test_alu()
  
-    alu = BranchALU(width=16)
-    vl = rtlil.convert(alu, ports=alu.ports())
-    with open("test_branch_alu.il", "w") as f:
-        f.write(vl)
+    # alu = BranchALU(width=16)
+    # vl = rtlil.convert(alu, ports=alu.ports())
+    # with open("test_branch_alu.il", "w") as f:
+    #     f.write(vl)