Parameterize the issuer on the number of operands and results

[soc.git] / src / soc / experiment / alu_hier.py
diff --git a/src/soc/experiment/alu_hier.py b/src/soc/experiment/alu_hier.py

index 5b1968c2a19a085a478b179380fad3f205f60cce..ca372aa0d1b3095851587b762a42e0fcc7eb80ce 100644 (file)
--- a/src/soc/experiment/alu_hier.py
+++ b/src/soc/experiment/alu_hier.py
@@ -14,8 +14,14 @@ from nmigen.hdl.rec import Record, Layout
  from nmigen.cli import main
  from nmigen.cli import verilog, rtlil
  from nmigen.compat.sim import run_simulation
+from nmutil.gtkw import write_gtkw
  
-from soc.decoder.power_enums import InternalOp, Function, CryIn
+# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
+# Also, check out the cxxsim nmigen branch, and latest yosys from git
+from nmutil.sim_tmp_alternative import (Simulator, nmigen_sim_top_module,
+                                        is_engine_pysim)
+
+from soc.decoder.power_enums import MicrOp, Function, CryIn
  
  from soc.fu.alu.alu_input_record import CompALUOpSubset
  from soc.fu.cr.cr_input_record import CompCROpSubset
@@ -23,18 +29,16 @@ from soc.fu.cr.cr_input_record import CompCROpSubset
  import operator
  
  
-
-
  class Adder(Elaboratable):
      def __init__(self, width):
-        self.invert_a = Signal()
-        self.a   = Signal(width)
-        self.b   = Signal(width)
-        self.o   = Signal(width, name="add_o")
+        self.invert_in = Signal()
+        self.a = Signal(width)
+        self.b = Signal(width)
+        self.o = Signal(width, name="add_o")
  
      def elaborate(self, platform):
          m = Module()
-        with m.If(self.invert_a):
+        with m.If(self.invert_in):
              m.d.comb += self.o.eq((~self.a) + self.b)
          with m.Else():
              m.d.comb += self.o.eq(self.a + self.b)
@@ -43,9 +47,9 @@ class Adder(Elaboratable):
  
  class Subtractor(Elaboratable):
      def __init__(self, width):
-        self.a   = Signal(width)
-        self.b   = Signal(width)
-        self.o   = Signal(width, name="sub_o")
+        self.a = Signal(width)
+        self.b = Signal(width)
+        self.o = Signal(width, name="sub_o")
  
      def elaborate(self, platform):
          m = Module()
@@ -55,9 +59,9 @@ class Subtractor(Elaboratable):
  
  class Multiplier(Elaboratable):
      def __init__(self, width):
-        self.a   = Signal(width)
-        self.b   = Signal(width)
-        self.o   = Signal(width, name="mul_o")
+        self.a = Signal(width)
+        self.b = Signal(width)
+        self.o = Signal(width, name="mul_o")
  
      def elaborate(self, platform):
          m = Module()
@@ -68,34 +72,35 @@ class Multiplier(Elaboratable):
  class Shifter(Elaboratable):
      def __init__(self, width):
          self.width = width
-        self.a   = Signal(width)
-        self.b   = Signal(width)
-        self.o   = Signal(width, name="shf_o")
+        self.a = Signal(width)
+        self.b = Signal(width)
+        self.o = Signal(width, name="shf_o")
  
      def elaborate(self, platform):
          m = Module()
          btrunc = Signal(self.width)
-        m.d.comb += btrunc.eq(self.b & Const((1<<self.width)-1))
+        m.d.comb += btrunc.eq(self.b & Const((1 << self.width)-1))
          m.d.comb += self.o.eq(self.a >> btrunc)
          return m
  
+
  class Dummy:
      pass
  
  
  class DummyALU(Elaboratable):
      def __init__(self, width):
-        self.p = Dummy() # make look like nmutil pipeline API
+        self.p = Dummy()  # make look like nmutil pipeline API
          self.p.data_i = Dummy()
          self.p.data_i.ctx = Dummy()
-        self.n = Dummy() # make look like nmutil pipeline API
+        self.n = Dummy()  # make look like nmutil pipeline API
          self.n.data_o = Dummy()
          self.p.valid_i = Signal()
          self.p.ready_o = Signal()
          self.n.ready_i = Signal()
          self.n.valid_o = Signal()
-        self.counter   = Signal(4)
-        self.op  = CompCROpSubset()
+        self.counter = Signal(4)
+        self.op = CompCROpSubset()
          i = []
          i.append(Signal(width, name="i1"))
          i.append(Signal(width, name="i2"))
@@ -115,7 +120,7 @@ class DummyALU(Elaboratable):
      def elaborate(self, platform):
          m = Module()
  
-        go_now = Signal(reset_less=True) # testing no-delay ALU
+        go_now = Signal(reset_less=True)  # testing no-delay ALU
  
          with m.If(self.p.valid_i):
              # input is valid. next check, if we already said "ready" or not
@@ -139,8 +144,8 @@ class DummyALU(Elaboratable):
          with m.If(self.n.ready_i & self.n.valid_o):
              m.d.sync += self.n.valid_o.eq(0)
              # recipient said it was ready: reset back to known-good.
-            m.d.sync += self.counter.eq(0) # reset the counter
-            m.d.sync += self.o.eq(0) # clear the output for tidiness sake
+            m.d.sync += self.counter.eq(0)  # reset the counter
+            m.d.sync += self.o.eq(0)  # clear the output for tidiness sake
  
          # countdown to 1 (transition from 1 to 0 only on acknowledgement)
          with m.If(self.counter > 1):
@@ -161,16 +166,16 @@ class DummyALU(Elaboratable):
  
  class ALU(Elaboratable):
      def __init__(self, width):
-        self.p = Dummy() # make look like nmutil pipeline API
+        self.p = Dummy()  # make look like nmutil pipeline API
          self.p.data_i = Dummy()
          self.p.data_i.ctx = Dummy()
-        self.n = Dummy() # make look like nmutil pipeline API
+        self.n = Dummy()  # make look like nmutil pipeline API
          self.n.data_o = Dummy()
          self.p.valid_i = Signal()
          self.p.ready_o = Signal()
          self.n.ready_i = Signal()
          self.n.valid_o = Signal()
-        self.counter   = Signal(4)
+        self.counter = Signal(4)
          self.op = CompALUOpSubset(name="op")
          i = []
          i.append(Signal(width, name="i1"))
@@ -206,9 +211,9 @@ class ALU(Elaboratable):
              ]
  
          # pass invert (and carry later)
-        m.d.comb += add.invert_a.eq(self.op.invert_a)
+        m.d.comb += add.invert_in.eq(self.op.invert_in)
  
-        go_now = Signal(reset_less=True) # testing no-delay ALU
+        go_now = Signal(reset_less=True)  # testing no-delay ALU
  
          # ALU sequencer is idle when the count is zero
          alu_idle = Signal(reset_less=True)
@@ -227,7 +232,7 @@ class ALU(Elaboratable):
          with m.Else():
              # sequential ALU handshake:
              # ready_o responds to valid_i, but only if the ALU is idle
-            m.d.comb += self.p.ready_o.eq(self.p.valid_i & alu_idle)
+            m.d.comb += self.p.ready_o.eq(alu_idle)
              # select the internally generated valid_o, above
              m.d.comb += self.n.valid_o.eq(alu_done)
  
@@ -238,24 +243,24 @@ class ALU(Elaboratable):
              with m.If(self.p.valid_i):
  
                  # as this is a "fake" pipeline, just grab the output right now
-                with m.If(self.op.insn_type == InternalOp.OP_ADD):
+                with m.If(self.op.insn_type == MicrOp.OP_ADD):
                      m.d.sync += alu_r.eq(add.o)
-                with m.Elif(self.op.insn_type == InternalOp.OP_MUL_L64):
+                with m.Elif(self.op.insn_type == MicrOp.OP_MUL_L64):
                      m.d.sync += alu_r.eq(mul.o)
-                with m.Elif(self.op.insn_type == InternalOp.OP_SHR):
+                with m.Elif(self.op.insn_type == MicrOp.OP_SHR):
                      m.d.sync += alu_r.eq(shf.o)
                  # SUB is zero-delay, no need to register
  
                  # NOTE: all of these are fake, just something to test
  
                  # MUL, to take 5 instructions
-                with m.If(self.op.insn_type == InternalOp.OP_MUL_L64):
+                with m.If(self.op.insn_type == MicrOp.OP_MUL_L64):
                      m.d.sync += self.counter.eq(5)
                  # SHIFT to take 1, straight away
-                with m.Elif(self.op.insn_type == InternalOp.OP_SHR):
+                with m.Elif(self.op.insn_type == MicrOp.OP_SHR):
                      m.d.sync += self.counter.eq(1)
                  # ADD/SUB to take 3
-                with m.Elif(self.op.insn_type == InternalOp.OP_ADD):
+                with m.Elif(self.op.insn_type == MicrOp.OP_ADD):
                      m.d.sync += self.counter.eq(3)
                  # others to take no delay
                  with m.Else():
@@ -268,7 +273,8 @@ class ALU(Elaboratable):
          # choose between zero-delay output, or registered
          with m.If(go_now):
              m.d.comb += self.o.eq(sub.o)
-        with m.Else():
+        # only present the result at the last computation cycle
+        with m.Elif(alu_done):
              m.d.comb += self.o.eq(alu_r)
  
          return m
@@ -289,9 +295,9 @@ class ALU(Elaboratable):
  
  class BranchOp(Elaboratable):
      def __init__(self, width, op):
-        self.a   = Signal(width)
-        self.b   = Signal(width)
-        self.o   = Signal(width)
+        self.a = Signal(width)
+        self.b = Signal(width)
+        self.o = Signal(width)
          self.op = op
  
      def elaborate(self, platform):
@@ -302,17 +308,17 @@ class BranchOp(Elaboratable):
  
  class BranchALU(Elaboratable):
      def __init__(self, width):
-        self.p = Dummy() # make look like nmutil pipeline API
+        self.p = Dummy()  # make look like nmutil pipeline API
          self.p.data_i = Dummy()
          self.p.data_i.ctx = Dummy()
-        self.n = Dummy() # make look like nmutil pipeline API
+        self.n = Dummy()  # make look like nmutil pipeline API
          self.n.data_o = Dummy()
          self.p.valid_i = Signal()
          self.p.ready_o = Signal()
          self.n.ready_i = Signal()
          self.n.valid_o = Signal()
-        self.counter   = Signal(4)
-        self.op  = Signal(2)
+        self.counter = Signal(4)
+        self.op = Signal(2)
          i = []
          i.append(Signal(width, name="i1"))
          i.append(Signal(width, name="i2"))
@@ -339,7 +345,7 @@ class BranchALU(Elaboratable):
                  mod.b.eq(self.b),
              ]
  
-        go_now = Signal(reset_less=True) # testing no-delay ALU
+        go_now = Signal(reset_less=True)  # testing no-delay ALU
          with m.If(self.p.valid_i):
              # input is valid. next check, if we already said "ready" or not
              with m.If(~self.p.ready_o):
@@ -351,7 +357,8 @@ class BranchALU(Elaboratable):
                      for i, mod in enumerate([bgt, blt, beq, bne]):
                          with m.Case(i):
                              m.d.sync += self.o.eq(mod.o)
-                m.d.sync += self.counter.eq(5) # branch to take 5 cycles (fake)
+                # branch to take 5 cycles (fake)
+                m.d.sync += self.counter.eq(5)
                  #m.d.comb += go_now.eq(1)
          with m.Else():
              # input says no longer valid, so drop ready as well.
@@ -365,8 +372,8 @@ class BranchALU(Elaboratable):
          with m.If(self.n.ready_i & self.n.valid_o):
              m.d.sync += self.n.valid_o.eq(0)
              # recipient said it was ready: reset back to known-good.
-            m.d.sync += self.counter.eq(0) # reset the counter
-            m.d.sync += self.o.eq(0) # clear the output for tidiness sake
+            m.d.sync += self.counter.eq(0)  # reset the counter
+            m.d.sync += self.o.eq(0)  # clear the output for tidiness sake
  
          # countdown to 1 (transition from 1 to 0 only on acknowledgement)
          with m.If(self.counter > 1):
@@ -383,87 +390,65 @@ class BranchALU(Elaboratable):
      def ports(self):
          return list(self)
  
+
  def run_op(dut, a, b, op, inv_a=0):
-    from nmigen.back.pysim import Settle
      yield dut.a.eq(a)
      yield dut.b.eq(b)
      yield dut.op.insn_type.eq(op)
-    yield dut.op.invert_a.eq(inv_a)
+    yield dut.op.invert_in.eq(inv_a)
      yield dut.n.ready_i.eq(0)
      yield dut.p.valid_i.eq(1)
-
-    # if valid_o rose on the very first cycle, it is a
-    # zero-delay ALU
-    yield Settle()
-    vld = yield dut.n.valid_o
-    if vld:
-        # special case for zero-delay ALU
-        # we must raise ready_i first, since the combinatorial ALU doesn't
-        # have any storage, and doesn't dare to assert ready_o back to us
-        # until we accepted the output data
-        yield dut.n.ready_i.eq(1)
-        result = yield dut.o
-        yield
-        yield dut.p.valid_i.eq(0)
-        yield dut.n.ready_i.eq(0)
-        yield
-        return result
-
+    yield dut.n.ready_i.eq(1)
      yield
  
      # wait for the ALU to accept our input data
-    while True:
-        rdy = yield dut.p.ready_o
-        if rdy:
-            break
+    while not (yield dut.p.ready_o):
          yield
  
      yield dut.p.valid_i.eq(0)
+    yield dut.a.eq(0)
+    yield dut.b.eq(0)
+    yield dut.op.insn_type.eq(0)
+    yield dut.op.invert_in.eq(0)
  
      # wait for the ALU to present the output data
-    while True:
-        yield Settle()
-        vld = yield dut.n.valid_o
-        if vld:
-            break
+    while not (yield dut.n.valid_o):
          yield
  
      # latch the result and lower read_i
-    yield dut.n.ready_i.eq(1)
      result = yield dut.o
-    yield
      yield dut.n.ready_i.eq(0)
-    yield
  
      return result
  
  
  def alu_sim(dut):
-    result = yield from run_op(dut, 5, 3, InternalOp.OP_ADD)
-    print ("alu_sim add", result)
+    result = yield from run_op(dut, 5, 3, MicrOp.OP_ADD)
+    print("alu_sim add", result)
      assert (result == 8)
  
-    result = yield from run_op(dut, 2, 3, InternalOp.OP_MUL_L64)
-    print ("alu_sim mul", result)
+    result = yield from run_op(dut, 2, 3, MicrOp.OP_MUL_L64)
+    print("alu_sim mul", result)
      assert (result == 6)
  
-    result = yield from run_op(dut, 5, 3, InternalOp.OP_ADD, inv_a=1)
-    print ("alu_sim add-inv", result)
+    result = yield from run_op(dut, 5, 3, MicrOp.OP_ADD, inv_a=1)
+    print("alu_sim add-inv", result)
      assert (result == 65533)
  
      # test zero-delay ALU
      # don't have OP_SUB, so use any other
-    result = yield from run_op(dut, 5, 3, InternalOp.OP_NOP)
-    print ("alu_sim sub", result)
+    result = yield from run_op(dut, 5, 3, MicrOp.OP_NOP)
+    print("alu_sim sub", result)
      assert (result == 2)
  
-    result = yield from run_op(dut, 13, 2, InternalOp.OP_SHR)
-    print ("alu_sim shr", result)
+    result = yield from run_op(dut, 13, 2, MicrOp.OP_SHR)
+    print("alu_sim shr", result)
      assert (result == 3)
  
  
  def test_alu():
      alu = ALU(width=16)
+    write_alu_gtkw("test_alusim.gtkw", clk_period=10e-9)
      run_simulation(alu, {"sync": alu_sim(alu)}, vcd_name='test_alusim.vcd')
  
      vl = rtlil.convert(alu, ports=alu.ports())
@@ -471,11 +456,136 @@ def test_alu():
          f.write(vl)
  
  
+def test_alu_parallel():
+    # Compare with the sequential test implementation, above.
+    m = Module()
+    m.submodules.alu = dut = ALU(width=16)
+    write_alu_gtkw("test_alu_parallel.gtkw", sub_module='alu',
+                   pysim=is_engine_pysim())
+
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    def send(a, b, op, inv_a=0):
+        # present input data and assert valid_i
+        yield dut.a.eq(a)
+        yield dut.b.eq(b)
+        yield dut.op.insn_type.eq(op)
+        yield dut.op.invert_in.eq(inv_a)
+        yield dut.p.valid_i.eq(1)
+        yield
+        # wait for ready_o to be asserted
+        while not (yield dut.p.ready_o):
+            yield
+        # clear input data and negate valid_i
+        # if send is called again immediately afterwards, there will be no
+        # visible transition (they will not be negated, after all)
+        yield dut.p.valid_i.eq(0)
+        yield dut.a.eq(0)
+        yield dut.b.eq(0)
+        yield dut.op.insn_type.eq(0)
+        yield dut.op.invert_in.eq(0)
+
+    def receive():
+        # signal readiness to receive data
+        yield dut.n.ready_i.eq(1)
+        yield
+        # wait for valid_o to be asserted
+        while not (yield dut.n.valid_o):
+            yield
+        # read result
+        result = yield dut.o
+        # negate ready_i
+        # if receive is called again immediately afterwards, there will be no
+        # visible transition (it will not be negated, after all)
+        yield dut.n.ready_i.eq(0)
+        return result
+
+    def producer():
+        # send a few test cases, interspersed with wait states
+        # note that, for this test, we do not wait for the result to be ready,
+        # before presenting the next input
+        # 5 + 3
+        yield from send(5, 3, MicrOp.OP_ADD)
+        yield
+        yield
+        # 2 * 3
+        yield from send(2, 3, MicrOp.OP_MUL_L64)
+        # (~5) + 3 (invert_in is a bitwise NOT of operand a, not a negation)
+        yield from send(5, 3, MicrOp.OP_ADD, inv_a=1)
+        yield
+        # 5 - 3
+        # note that this is a zero-delay operation
+        yield from send(5, 3, MicrOp.OP_NOP)
+        yield
+        yield
+        # 13 >> 2
+        yield from send(13, 2, MicrOp.OP_SHR)
+
+    def consumer():
+        # receive and check results, interspersed with wait states
+        # the consumer is not in step with the producer, but the
+        # order of the results is preserved
+        yield
+        # 5 + 3 = 8
+        result = yield from receive()
+        assert (result == 8)
+        # 2 * 3 = 6
+        result = yield from receive()
+        assert (result == 6)
+        yield
+        yield
+        # (~5) + 3 = -3
+        result = yield from receive()
+        assert (result == 65533)  # unsigned equivalent to -3
+        # 5 - 3 = 2
+        # note that this is a zero-delay operation
+        # this, and the previous result, will be received back-to-back
+        # (check the output waveform to see this)
+        result = yield from receive()
+        assert (result == 2)
+        yield
+        yield
+        # 13 >> 2 = 3
+        result = yield from receive()
+        assert (result == 3)
+
+    sim.add_sync_process(producer)
+    sim.add_sync_process(consumer)
+    sim_writer = sim.write_vcd("test_alu_parallel.vcd")
+    with sim_writer:
+        sim.run()
+
+
+def write_alu_gtkw(gtkw_name, clk_period=1e-6, sub_module=None,
+                   pysim=True):
+    """Common function to write the GTKWave documents for this module"""
+    gtkwave_desc = [
+        'clk',
+        'i1[15:0]',
+        'i2[15:0]',
+        'op__insn_type' if pysim else 'op__insn_type[6:0]',
+        'op__invert_in',
+        'valid_i',
+        'ready_o',
+        'valid_o',
+        'ready_i',
+        'alu_o[15:0]',
+    ]
+    # determine the module name of the DUT
+    module = 'top'
+    if sub_module is not None:
+        module = nmigen_sim_top_module + sub_module
+    vcd_name = gtkw_name.replace('.gtkw', '.vcd')
+    write_gtkw(gtkw_name, vcd_name, gtkwave_desc, module=module,
+               loc=__file__, clk_period=clk_period, base='signed')
+
+
  if __name__ == "__main__":
      test_alu()
+    test_alu_parallel()
  
      # alu = BranchALU(width=16)
      # vl = rtlil.convert(alu, ports=alu.ports())
      # with open("test_branch_alu.il", "w") as f:
      #     f.write(vl)
-