X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fsoc%2Fexperiment%2Falu_hier.py;h=99ff39e47f287ff4125ea025c249ddfe4804c207;hb=9d69e91e8baa66366a9f36093b738a25dc126659;hp=af373ca3c9905bfac4b87dd686619f0aa714e3e5;hpb=943a10d56aa05f6888d39c668074f886a6736bd1;p=soc.git diff --git a/src/soc/experiment/alu_hier.py b/src/soc/experiment/alu_hier.py index af373ca3..99ff39e4 100644 --- a/src/soc/experiment/alu_hier.py +++ b/src/soc/experiment/alu_hier.py @@ -18,6 +18,7 @@ from nmigen.compat.sim import run_simulation from soc.decoder.power_enums import InternalOp, Function, CryIn from soc.fu.alu.alu_input_record import CompALUOpSubset +from soc.fu.cr.cr_input_record import CompCROpSubset import operator @@ -29,7 +30,7 @@ class Adder(Elaboratable): self.invert_a = Signal() self.a = Signal(width) self.b = Signal(width) - self.o = Signal(width) + self.o = Signal(width, name="add_o") def elaborate(self, platform): m = Module() @@ -44,7 +45,7 @@ class Subtractor(Elaboratable): def __init__(self, width): self.a = Signal(width) self.b = Signal(width) - self.o = Signal(width) + self.o = Signal(width, name="sub_o") def elaborate(self, platform): m = Module() @@ -56,7 +57,7 @@ class Multiplier(Elaboratable): def __init__(self, width): self.a = Signal(width) self.b = Signal(width) - self.o = Signal(width) + self.o = Signal(width, name="mul_o") def elaborate(self, platform): m = Module() @@ -69,7 +70,7 @@ class Shifter(Elaboratable): self.width = width self.a = Signal(width) self.b = Signal(width) - self.o = Signal(width) + self.o = Signal(width, name="shf_o") def elaborate(self, platform): m = Module() @@ -81,37 +82,124 @@ class Shifter(Elaboratable): class Dummy: pass + +class DummyALU(Elaboratable): + def __init__(self, width): + self.p = Dummy() # make look like nmutil pipeline API + self.p.data_i = Dummy() + self.p.data_i.ctx = Dummy() + self.n = Dummy() # make look like nmutil pipeline API + self.n.data_o = Dummy() + self.p.valid_i = Signal() + self.p.ready_o = Signal() + self.n.ready_i = Signal() + self.n.valid_o = Signal() + self.counter = Signal(4) + self.op = CompCROpSubset() + i = [] + i.append(Signal(width, name="i1")) + i.append(Signal(width, name="i2")) + i.append(Signal(width, name="i3")) + self.i = Array(i) + self.a, self.b, self.c = i[0], i[1], i[2] + self.out = Array([Signal(width, name="alu_o")]) + self.o = self.out[0] + self.width = width + # more "look like nmutil pipeline API" + self.p.data_i.ctx.op = self.op + self.p.data_i.a = self.a + self.p.data_i.b = self.b + self.p.data_i.c = self.c + self.n.data_o.o = self.o + + def elaborate(self, platform): + m = Module() + + go_now = Signal(reset_less=True) # testing no-delay ALU + + with m.If(self.p.valid_i): + # input is valid. next check, if we already said "ready" or not + with m.If(~self.p.ready_o): + # we didn't say "ready" yet, so say so and initialise + m.d.sync += self.p.ready_o.eq(1) + + m.d.sync += self.o.eq(self.a) + m.d.comb += go_now.eq(1) + m.d.sync += self.counter.eq(1) + + with m.Else(): + # input says no longer valid, so drop ready as well. + # a "proper" ALU would have had to sync in the opcode and a/b ops + m.d.sync += self.p.ready_o.eq(0) + + # ok so the counter's running: when it gets to 1, fire the output + with m.If((self.counter == 1) | go_now): + # set the output as valid if the recipient is ready for it + m.d.sync += self.n.valid_o.eq(1) + with m.If(self.n.ready_i & self.n.valid_o): + m.d.sync += self.n.valid_o.eq(0) + # recipient said it was ready: reset back to known-good. + m.d.sync += self.counter.eq(0) # reset the counter + m.d.sync += self.o.eq(0) # clear the output for tidiness sake + + # countdown to 1 (transition from 1 to 0 only on acknowledgement) + with m.If(self.counter > 1): + m.d.sync += self.counter.eq(self.counter - 1) + + return m + + def __iter__(self): + yield from self.op.ports() + yield self.a + yield self.b + yield self.c + yield self.o + + def ports(self): + return list(self) + + class ALU(Elaboratable): def __init__(self, width): self.p = Dummy() # make look like nmutil pipeline API + self.p.data_i = Dummy() + self.p.data_i.ctx = Dummy() self.n = Dummy() # make look like nmutil pipeline API + self.n.data_o = Dummy() self.p.valid_i = Signal() self.p.ready_o = Signal() self.n.ready_i = Signal() self.n.valid_o = Signal() self.counter = Signal(4) - self.op = CompALUOpSubset() + self.op = CompALUOpSubset(name="op") i = [] i.append(Signal(width, name="i1")) i.append(Signal(width, name="i2")) self.i = Array(i) self.a, self.b = i[0], i[1] - self.out = Array([Signal(width)]) + self.out = Array([Signal(width, name="alu_o")]) self.o = self.out[0] self.width = width + # more "look like nmutil pipeline API" + self.p.data_i.ctx.op = self.op + self.p.data_i.a = self.a + self.p.data_i.b = self.b + self.n.data_o.o = self.o def elaborate(self, platform): m = Module() add = Adder(self.width) mul = Multiplier(self.width) shf = Shifter(self.width) + sub = Subtractor(self.width) m.submodules.add = add m.submodules.mul = mul m.submodules.shf = shf + m.submodules.sub = sub # really should not activate absolutely all ALU inputs like this - for mod in [add, mul, shf]: + for mod in [add, mul, shf, sub]: m.d.comb += [ mod.a.eq(self.a), mod.b.eq(self.b), @@ -122,56 +210,67 @@ class ALU(Elaboratable): go_now = Signal(reset_less=True) # testing no-delay ALU - with m.If(self.p.valid_i): - # input is valid. next check, if we already said "ready" or not - with m.If(~self.p.ready_o): - # we didn't say "ready" yet, so say so and initialise - m.d.sync += self.p.ready_o.eq(1) + # ALU sequencer is idle when the count is zero + alu_idle = Signal(reset_less=True) + m.d.comb += alu_idle.eq(self.counter == 0) + + # ALU sequencer is done when the count is one + alu_done = Signal(reset_less=True) + m.d.comb += alu_done.eq(self.counter == 1) + + # select handshake handling according to ALU type + with m.If(go_now): + # with a combinatorial, no-delay ALU, just pass through + # the handshake signals to the other side + m.d.comb += self.p.ready_o.eq(self.n.ready_i) + m.d.comb += self.n.valid_o.eq(self.p.valid_i) + with m.Else(): + # sequential ALU handshake: + # ready_o responds to valid_i, but only if the ALU is idle + m.d.comb += self.p.ready_o.eq(self.p.valid_i & alu_idle) + # select the internally generated valid_o, above + m.d.comb += self.n.valid_o.eq(alu_done) + + # hold the ALU result until ready_o is asserted + alu_r = Signal(self.width) + + with m.If(alu_idle): + with m.If(self.p.valid_i): # as this is a "fake" pipeline, just grab the output right now with m.If(self.op.insn_type == InternalOp.OP_ADD): - m.d.sync += self.o.eq(add.o) + m.d.sync += alu_r.eq(add.o) with m.Elif(self.op.insn_type == InternalOp.OP_MUL_L64): - m.d.sync += self.o.eq(mul.o) + m.d.sync += alu_r.eq(mul.o) with m.Elif(self.op.insn_type == InternalOp.OP_SHR): - m.d.sync += self.o.eq(shf.o) - # TODO: SUB + m.d.sync += alu_r.eq(shf.o) + # SUB is zero-delay, no need to register # NOTE: all of these are fake, just something to test # MUL, to take 5 instructions with m.If(self.op.insn_type == InternalOp.OP_MUL_L64): m.d.sync += self.counter.eq(5) - # SHIFT to take 7 + # SHIFT to take 1, straight away with m.Elif(self.op.insn_type == InternalOp.OP_SHR): - m.d.sync += self.counter.eq(7) - # ADD/SUB to take 2, straight away - with m.If(self.op.insn_type == InternalOp.OP_ADD): + m.d.sync += self.counter.eq(1) + # ADD/SUB to take 3 + with m.Elif(self.op.insn_type == InternalOp.OP_ADD): m.d.sync += self.counter.eq(3) - # others to take 1, straight away + # others to take no delay with m.Else(): m.d.comb += go_now.eq(1) - m.d.sync += self.counter.eq(1) - - with m.Else(): - # input says no longer valid, so drop ready as well. - # a "proper" ALU would have had to sync in the opcode and a/b ops - m.d.sync += self.p.ready_o.eq(0) - - # ok so the counter's running: when it gets to 1, fire the output - with m.If((self.counter == 1) | go_now): - # set the output as valid if the recipient is ready for it - m.d.sync += self.n.valid_o.eq(1) - with m.If(self.n.ready_i & self.n.valid_o): - m.d.sync += self.n.valid_o.eq(0) - # recipient said it was ready: reset back to known-good. - m.d.sync += self.counter.eq(0) # reset the counter - m.d.sync += self.o.eq(0) # clear the output for tidiness sake - # countdown to 1 (transition from 1 to 0 only on acknowledgement) - with m.If(self.counter > 1): + with m.Elif(~alu_done | self.n.ready_i): + # decrement the counter while the ALU is neither idle nor finished m.d.sync += self.counter.eq(self.counter - 1) + # choose between zero-delay output, or registered + with m.If(go_now): + m.d.comb += self.o.eq(sub.o) + with m.Else(): + m.d.comb += self.o.eq(alu_r) + return m def __iter__(self): @@ -199,6 +298,11 @@ class BranchOp(Elaboratable): class BranchALU(Elaboratable): def __init__(self, width): + self.p = Dummy() # make look like nmutil pipeline API + self.p.data_i = Dummy() + self.p.data_i.ctx = Dummy() + self.n = Dummy() # make look like nmutil pipeline API + self.n.data_o = Dummy() self.p.valid_i = Signal() self.p.ready_o = Signal() self.n.ready_i = Signal() @@ -276,22 +380,54 @@ class BranchALU(Elaboratable): return list(self) def run_op(dut, a, b, op, inv_a=0): + from nmigen.back.pysim import Settle yield dut.a.eq(a) yield dut.b.eq(b) yield dut.op.insn_type.eq(op) yield dut.op.invert_a.eq(inv_a) yield dut.n.ready_i.eq(0) yield dut.p.valid_i.eq(1) + + # if valid_o rose on the very first cycle, it is a + # zero-delay ALU + yield Settle() + vld = yield dut.n.valid_o + if vld: + # special case for zero-delay ALU + # we must raise ready_i first, since the combinatorial ALU doesn't + # have any storage, and doesn't dare to assert ready_o back to us + # until we accepted the output data + yield dut.n.ready_i.eq(1) + result = yield dut.o + yield + yield dut.p.valid_i.eq(0) + yield dut.n.ready_i.eq(0) + yield + return result + yield + + # wait for the ALU to accept our input data while True: + rdy = yield dut.p.ready_o + if rdy: + break yield - n.valid_o = yield dut.n.valid_o - if n.valid_o: + + yield dut.p.valid_i.eq(0) + + # wait for the ALU to present the output data + while True: + yield Settle() + vld = yield dut.n.valid_o + if vld: break - yield + yield + # latch the result and lower read_i + yield dut.n.ready_i.eq(1) result = yield dut.o - yield dut.p.valid_i.eq(0) + yield yield dut.n.ready_i.eq(0) yield @@ -311,10 +447,20 @@ def alu_sim(dut): print ("alu_sim add-inv", result) assert (result == 65533) + # test zero-delay ALU + # don't have OP_SUB, so use any other + result = yield from run_op(dut, 5, 3, InternalOp.OP_NOP) + print ("alu_sim sub", result) + assert (result == 2) + + result = yield from run_op(dut, 13, 2, InternalOp.OP_SHR) + print ("alu_sim shr", result) + assert (result == 3) + def test_alu(): alu = ALU(width=16) - run_simulation(alu, alu_sim(alu), vcd_name='test_alusim.vcd') + run_simulation(alu, {"sync": alu_sim(alu)}, vcd_name='test_alusim.vcd') vl = rtlil.convert(alu, ports=alu.ports()) with open("test_alu.il", "w") as f: @@ -324,8 +470,8 @@ def test_alu(): if __name__ == "__main__": test_alu() - alu = BranchALU(width=16) - vl = rtlil.convert(alu, ports=alu.ports()) - with open("test_branch_alu.il", "w") as f: - f.write(vl) + # alu = BranchALU(width=16) + # vl = rtlil.convert(alu, ports=alu.ports()) + # with open("test_branch_alu.il", "w") as f: + # f.write(vl)