X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fsoc%2Fexperiment%2Falu_hier.py;h=9c8115ce69da2c035f0d674ac3768fceb415020d;hb=78183662a64e0be018ad22de14f323c28e611f76;hp=99ff39e47f287ff4125ea025c249ddfe4804c207;hpb=689a040bdb4a79dfb89ec6742a7285dba0fda0af;p=soc.git diff --git a/src/soc/experiment/alu_hier.py b/src/soc/experiment/alu_hier.py index 99ff39e4..9c8115ce 100644 --- a/src/soc/experiment/alu_hier.py +++ b/src/soc/experiment/alu_hier.py @@ -14,8 +14,9 @@ from nmigen.hdl.rec import Record, Layout from nmigen.cli import main from nmigen.cli import verilog, rtlil from nmigen.compat.sim import run_simulation +from nmigen.back.pysim import Simulator, Settle -from soc.decoder.power_enums import InternalOp, Function, CryIn +from soc.decoder.power_enums import MicrOp, Function, CryIn from soc.fu.alu.alu_input_record import CompALUOpSubset from soc.fu.cr.cr_input_record import CompCROpSubset @@ -23,18 +24,16 @@ from soc.fu.cr.cr_input_record import CompCROpSubset import operator - - class Adder(Elaboratable): def __init__(self, width): - self.invert_a = Signal() - self.a = Signal(width) - self.b = Signal(width) - self.o = Signal(width, name="add_o") + self.invert_in = Signal() + self.a = Signal(width) + self.b = Signal(width) + self.o = Signal(width, name="add_o") def elaborate(self, platform): m = Module() - with m.If(self.invert_a): + with m.If(self.invert_in): m.d.comb += self.o.eq((~self.a) + self.b) with m.Else(): m.d.comb += self.o.eq(self.a + self.b) @@ -43,9 +42,9 @@ class Adder(Elaboratable): class Subtractor(Elaboratable): def __init__(self, width): - self.a = Signal(width) - self.b = Signal(width) - self.o = Signal(width, name="sub_o") + self.a = Signal(width) + self.b = Signal(width) + self.o = Signal(width, name="sub_o") def elaborate(self, platform): m = Module() @@ -55,9 +54,9 @@ class Subtractor(Elaboratable): class Multiplier(Elaboratable): def __init__(self, width): - self.a = Signal(width) - self.b = Signal(width) - self.o = Signal(width, name="mul_o") + self.a = Signal(width) + self.b = Signal(width) + self.o = Signal(width, name="mul_o") def elaborate(self, platform): m = Module() @@ -68,34 +67,35 @@ class Multiplier(Elaboratable): class Shifter(Elaboratable): def __init__(self, width): self.width = width - self.a = Signal(width) - self.b = Signal(width) - self.o = Signal(width, name="shf_o") + self.a = Signal(width) + self.b = Signal(width) + self.o = Signal(width, name="shf_o") def elaborate(self, platform): m = Module() btrunc = Signal(self.width) - m.d.comb += btrunc.eq(self.b & Const((1<> btrunc) return m + class Dummy: pass class DummyALU(Elaboratable): def __init__(self, width): - self.p = Dummy() # make look like nmutil pipeline API + self.p = Dummy() # make look like nmutil pipeline API self.p.data_i = Dummy() self.p.data_i.ctx = Dummy() - self.n = Dummy() # make look like nmutil pipeline API + self.n = Dummy() # make look like nmutil pipeline API self.n.data_o = Dummy() self.p.valid_i = Signal() self.p.ready_o = Signal() self.n.ready_i = Signal() self.n.valid_o = Signal() - self.counter = Signal(4) - self.op = CompCROpSubset() + self.counter = Signal(4) + self.op = CompCROpSubset() i = [] i.append(Signal(width, name="i1")) i.append(Signal(width, name="i2")) @@ -115,7 +115,7 @@ class DummyALU(Elaboratable): def elaborate(self, platform): m = Module() - go_now = Signal(reset_less=True) # testing no-delay ALU + go_now = Signal(reset_less=True) # testing no-delay ALU with m.If(self.p.valid_i): # input is valid. next check, if we already said "ready" or not @@ -139,8 +139,8 @@ class DummyALU(Elaboratable): with m.If(self.n.ready_i & self.n.valid_o): m.d.sync += self.n.valid_o.eq(0) # recipient said it was ready: reset back to known-good. - m.d.sync += self.counter.eq(0) # reset the counter - m.d.sync += self.o.eq(0) # clear the output for tidiness sake + m.d.sync += self.counter.eq(0) # reset the counter + m.d.sync += self.o.eq(0) # clear the output for tidiness sake # countdown to 1 (transition from 1 to 0 only on acknowledgement) with m.If(self.counter > 1): @@ -161,16 +161,16 @@ class DummyALU(Elaboratable): class ALU(Elaboratable): def __init__(self, width): - self.p = Dummy() # make look like nmutil pipeline API + self.p = Dummy() # make look like nmutil pipeline API self.p.data_i = Dummy() self.p.data_i.ctx = Dummy() - self.n = Dummy() # make look like nmutil pipeline API + self.n = Dummy() # make look like nmutil pipeline API self.n.data_o = Dummy() self.p.valid_i = Signal() self.p.ready_o = Signal() self.n.ready_i = Signal() self.n.valid_o = Signal() - self.counter = Signal(4) + self.counter = Signal(4) self.op = CompALUOpSubset(name="op") i = [] i.append(Signal(width, name="i1")) @@ -206,9 +206,9 @@ class ALU(Elaboratable): ] # pass invert (and carry later) - m.d.comb += add.invert_a.eq(self.op.invert_a) + m.d.comb += add.invert_in.eq(self.op.invert_in) - go_now = Signal(reset_less=True) # testing no-delay ALU + go_now = Signal(reset_less=True) # testing no-delay ALU # ALU sequencer is idle when the count is zero alu_idle = Signal(reset_less=True) @@ -227,7 +227,7 @@ class ALU(Elaboratable): with m.Else(): # sequential ALU handshake: # ready_o responds to valid_i, but only if the ALU is idle - m.d.comb += self.p.ready_o.eq(self.p.valid_i & alu_idle) + m.d.comb += self.p.ready_o.eq(alu_idle) # select the internally generated valid_o, above m.d.comb += self.n.valid_o.eq(alu_done) @@ -238,24 +238,24 @@ class ALU(Elaboratable): with m.If(self.p.valid_i): # as this is a "fake" pipeline, just grab the output right now - with m.If(self.op.insn_type == InternalOp.OP_ADD): + with m.If(self.op.insn_type == MicrOp.OP_ADD): m.d.sync += alu_r.eq(add.o) - with m.Elif(self.op.insn_type == InternalOp.OP_MUL_L64): + with m.Elif(self.op.insn_type == MicrOp.OP_MUL_L64): m.d.sync += alu_r.eq(mul.o) - with m.Elif(self.op.insn_type == InternalOp.OP_SHR): + with m.Elif(self.op.insn_type == MicrOp.OP_SHR): m.d.sync += alu_r.eq(shf.o) # SUB is zero-delay, no need to register # NOTE: all of these are fake, just something to test # MUL, to take 5 instructions - with m.If(self.op.insn_type == InternalOp.OP_MUL_L64): + with m.If(self.op.insn_type == MicrOp.OP_MUL_L64): m.d.sync += self.counter.eq(5) # SHIFT to take 1, straight away - with m.Elif(self.op.insn_type == InternalOp.OP_SHR): + with m.Elif(self.op.insn_type == MicrOp.OP_SHR): m.d.sync += self.counter.eq(1) # ADD/SUB to take 3 - with m.Elif(self.op.insn_type == InternalOp.OP_ADD): + with m.Elif(self.op.insn_type == MicrOp.OP_ADD): m.d.sync += self.counter.eq(3) # others to take no delay with m.Else(): @@ -268,7 +268,8 @@ class ALU(Elaboratable): # choose between zero-delay output, or registered with m.If(go_now): m.d.comb += self.o.eq(sub.o) - with m.Else(): + # only present the result at the last computation cycle + with m.Elif(alu_done): m.d.comb += self.o.eq(alu_r) return m @@ -278,6 +279,10 @@ class ALU(Elaboratable): yield self.a yield self.b yield self.o + yield self.p.valid_i + yield self.p.ready_o + yield self.n.valid_o + yield self.n.ready_i def ports(self): return list(self) @@ -285,9 +290,9 @@ class ALU(Elaboratable): class BranchOp(Elaboratable): def __init__(self, width, op): - self.a = Signal(width) - self.b = Signal(width) - self.o = Signal(width) + self.a = Signal(width) + self.b = Signal(width) + self.o = Signal(width) self.op = op def elaborate(self, platform): @@ -298,17 +303,17 @@ class BranchOp(Elaboratable): class BranchALU(Elaboratable): def __init__(self, width): - self.p = Dummy() # make look like nmutil pipeline API + self.p = Dummy() # make look like nmutil pipeline API self.p.data_i = Dummy() self.p.data_i.ctx = Dummy() - self.n = Dummy() # make look like nmutil pipeline API + self.n = Dummy() # make look like nmutil pipeline API self.n.data_o = Dummy() self.p.valid_i = Signal() self.p.ready_o = Signal() self.n.ready_i = Signal() self.n.valid_o = Signal() - self.counter = Signal(4) - self.op = Signal(2) + self.counter = Signal(4) + self.op = Signal(2) i = [] i.append(Signal(width, name="i1")) i.append(Signal(width, name="i2")) @@ -335,7 +340,7 @@ class BranchALU(Elaboratable): mod.b.eq(self.b), ] - go_now = Signal(reset_less=True) # testing no-delay ALU + go_now = Signal(reset_less=True) # testing no-delay ALU with m.If(self.p.valid_i): # input is valid. next check, if we already said "ready" or not with m.If(~self.p.ready_o): @@ -347,7 +352,8 @@ class BranchALU(Elaboratable): for i, mod in enumerate([bgt, blt, beq, bne]): with m.Case(i): m.d.sync += self.o.eq(mod.o) - m.d.sync += self.counter.eq(5) # branch to take 5 cycles (fake) + # branch to take 5 cycles (fake) + m.d.sync += self.counter.eq(5) #m.d.comb += go_now.eq(1) with m.Else(): # input says no longer valid, so drop ready as well. @@ -361,8 +367,8 @@ class BranchALU(Elaboratable): with m.If(self.n.ready_i & self.n.valid_o): m.d.sync += self.n.valid_o.eq(0) # recipient said it was ready: reset back to known-good. - m.d.sync += self.counter.eq(0) # reset the counter - m.d.sync += self.o.eq(0) # clear the output for tidiness sake + m.d.sync += self.counter.eq(0) # reset the counter + m.d.sync += self.o.eq(0) # clear the output for tidiness sake # countdown to 1 (transition from 1 to 0 only on acknowledgement) with m.If(self.counter > 1): @@ -379,82 +385,59 @@ class BranchALU(Elaboratable): def ports(self): return list(self) + def run_op(dut, a, b, op, inv_a=0): - from nmigen.back.pysim import Settle yield dut.a.eq(a) yield dut.b.eq(b) yield dut.op.insn_type.eq(op) - yield dut.op.invert_a.eq(inv_a) + yield dut.op.invert_in.eq(inv_a) yield dut.n.ready_i.eq(0) yield dut.p.valid_i.eq(1) - - # if valid_o rose on the very first cycle, it is a - # zero-delay ALU - yield Settle() - vld = yield dut.n.valid_o - if vld: - # special case for zero-delay ALU - # we must raise ready_i first, since the combinatorial ALU doesn't - # have any storage, and doesn't dare to assert ready_o back to us - # until we accepted the output data - yield dut.n.ready_i.eq(1) - result = yield dut.o - yield - yield dut.p.valid_i.eq(0) - yield dut.n.ready_i.eq(0) - yield - return result - + yield dut.n.ready_i.eq(1) yield # wait for the ALU to accept our input data - while True: - rdy = yield dut.p.ready_o - if rdy: - break + while not (yield dut.p.ready_o): yield yield dut.p.valid_i.eq(0) + yield dut.a.eq(0) + yield dut.b.eq(0) + yield dut.op.insn_type.eq(0) + yield dut.op.invert_in.eq(0) # wait for the ALU to present the output data - while True: - yield Settle() - vld = yield dut.n.valid_o - if vld: - break + while not (yield dut.n.valid_o): yield # latch the result and lower read_i - yield dut.n.ready_i.eq(1) result = yield dut.o - yield yield dut.n.ready_i.eq(0) - yield return result def alu_sim(dut): - result = yield from run_op(dut, 5, 3, InternalOp.OP_ADD) - print ("alu_sim add", result) + result = yield from run_op(dut, 5, 3, MicrOp.OP_ADD) + print("alu_sim add", result) assert (result == 8) - result = yield from run_op(dut, 2, 3, InternalOp.OP_MUL_L64) - print ("alu_sim mul", result) + result = yield from run_op(dut, 2, 3, MicrOp.OP_MUL_L64) + print("alu_sim mul", result) assert (result == 6) - result = yield from run_op(dut, 5, 3, InternalOp.OP_ADD, inv_a=1) - print ("alu_sim add-inv", result) + result = yield from run_op(dut, 5, 3, MicrOp.OP_ADD, inv_a=1) + print("alu_sim add-inv", result) assert (result == 65533) # test zero-delay ALU # don't have OP_SUB, so use any other - result = yield from run_op(dut, 5, 3, InternalOp.OP_NOP) - print ("alu_sim sub", result) + result = yield from run_op(dut, 5, 3, MicrOp.OP_NOP) + print("alu_sim sub", result) assert (result == 2) - result = yield from run_op(dut, 13, 2, InternalOp.OP_SHR) - print ("alu_sim shr", result) + result = yield from run_op(dut, 13, 2, MicrOp.OP_SHR) + print("alu_sim shr", result) assert (result == 3) @@ -467,11 +450,113 @@ def test_alu(): f.write(vl) +def test_alu_parallel(): + # Compare with the sequential test implementation, above. + m = Module() + m.submodules.alu = dut = ALU(width=16) + sim = Simulator(m) + sim.add_clock(1e-6) + + def send(a, b, op, inv_a=0): + # present input data and assert valid_i + yield dut.a.eq(a) + yield dut.b.eq(b) + yield dut.op.insn_type.eq(op) + yield dut.op.invert_in.eq(inv_a) + yield dut.p.valid_i.eq(1) + yield + # wait for ready_o to be asserted + while not (yield dut.p.ready_o): + yield + # clear input data and negate valid_i + # if send is called again immediately afterwards, there will be no + # visible transition (they will not be negated, after all) + yield dut.p.valid_i.eq(0) + yield dut.a.eq(0) + yield dut.b.eq(0) + yield dut.op.insn_type.eq(0) + yield dut.op.invert_in.eq(0) + + def receive(): + # signal readiness to receive data + yield dut.n.ready_i.eq(1) + yield + # wait for valid_o to be asserted + while not (yield dut.n.valid_o): + yield + # read result + result = yield dut.o + # negate ready_i + # if receive is called again immediately afterwards, there will be no + # visible transition (it will not be negated, after all) + yield dut.n.ready_i.eq(0) + return result + + def producer(): + # send a few test cases, interspersed with wait states + # note that, for this test, we do not wait for the result to be ready, + # before presenting the next input + # 5 + 3 + yield from send(5, 3, MicrOp.OP_ADD) + yield + yield + # 2 * 3 + yield from send(2, 3, MicrOp.OP_MUL_L64) + # (-5) + 3 + yield from send(5, 3, MicrOp.OP_ADD, inv_a=1) + yield + # 5 - 3 + # note that this is a zero-delay operation + yield from send(5, 3, MicrOp.OP_NOP) + yield + yield + # 13 >> 2 + yield from send(13, 2, MicrOp.OP_SHR) + + def consumer(): + # receive and check results, interspersed with wait states + # the consumer is not in step with the producer, but the + # order of the results are preserved + yield + # 5 + 3 = 8 + result = yield from receive() + assert (result == 8) + # 2 * 3 = 6 + result = yield from receive() + assert (result == 6) + yield + yield + # (-5) + 3 = -2 + result = yield from receive() + assert (result == 65533) # unsigned equivalent to -2 + # 5 - 3 = 2 + # note that this is a zero-delay operation + # this, and the previous result, will be received back-to-back + # (check the output waveform to see this) + result = yield from receive() + assert (result == 2) + yield + yield + # 13 >> 2 = 3 + result = yield from receive() + assert (result == 3) + + sim.add_sync_process(producer) + sim.add_sync_process(consumer) + sim_writer = sim.write_vcd( + "test_alu_parallel.vcd", + "test_alu_parallel.gtkw", + traces=dut.ports() + ) + with sim_writer: + sim.run() + + if __name__ == "__main__": test_alu() + test_alu_parallel() # alu = BranchALU(width=16) # vl = rtlil.convert(alu, ports=alu.ports()) # with open("test_branch_alu.il", "w") as f: # f.write(vl) -