X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fsoc%2Fexperiment%2Falu_hier.py;h=9c8115ce69da2c035f0d674ac3768fceb415020d;hb=65f1492b3d3531687ba90c5c537453cde0e6e5fd;hp=6d95290dd8a7ecae487a76c9f1d5125b15f3c43a;hpb=5affaf43c9747d28a0a6957de6213fb035a86098;p=soc.git diff --git a/src/soc/experiment/alu_hier.py b/src/soc/experiment/alu_hier.py index 6d95290d..9c8115ce 100644 --- a/src/soc/experiment/alu_hier.py +++ b/src/soc/experiment/alu_hier.py @@ -9,90 +9,31 @@ A "real" integer ALU would place the answers onto the output bus after only one cycle (sync) """ -from nmigen import Elaboratable, Signal, Module, Const, Mux +from nmigen import Elaboratable, Signal, Module, Const, Mux, Array from nmigen.hdl.rec import Record, Layout from nmigen.cli import main from nmigen.cli import verilog, rtlil from nmigen.compat.sim import run_simulation +from nmigen.back.pysim import Simulator, Settle -from soc.decoder.power_enums import InternalOp, CryIn - -import operator +from soc.decoder.power_enums import MicrOp, Function, CryIn +from soc.fu.alu.alu_input_record import CompALUOpSubset +from soc.fu.cr.cr_input_record import CompCROpSubset -class CompALUOpSubset(Record): - """CompALUOpSubset - - a copy of the relevant subset information from Decode2Execute1Type - needed for ALU operations. - """ - def __init__(self): - layout = (('insn_type', InternalOp), - ('nia', 64), - ('imm_data', Layout((("imm", 64), ("imm_ok", 1)))), - #'cr = Signal(32, reset_less=True) # NO: this is from the CR SPR - #'xerc = XerBits() # NO: this is from the XER SPR - ('lk', 1), - ('rc', Layout((("rc", 1), ("rc_ok", 1)))), - ('oe', Layout((("oe", 1), ("oe_ok", 1)))), - ('invert_a', 1), - ('invert_out', 1), - ('input_carry', CryIn), - ('output_carry', 1), - ('input_cr', 1), - ('output_cr', 1), - ('is_32bit', 1), - ('is_signed', 1), - ('byte_reverse', 1), - ('sign_extend', 1)) - - Record.__init__(self, Layout(layout)) - - # grrr. Record does not have kwargs - self.insn_type.reset_less = True - self.nia.reset_less = True - #self.cr = Signal(32, reset_less = True - #self.xerc = XerBits( - self.lk.reset_less = True - self.invert_a.reset_less = True - self.invert_out.reset_less = True - self.input_carry.reset_less = True - self.output_carry.reset_less = True - self.input_cr.reset_less = True - self.output_cr.reset_less = True - self.is_32bit.reset_less = True - self.is_signed.reset_less = True - self.byte_reverse.reset_less = True - self.sign_extend.reset_less = True +import operator - def ports(self): - return [self.insn_type, - self.nia, - #self.cr, - #self.xerc, - self.lk, - self.invert_a, - self.invert_out, - self.input_carry, - self.output_carry, - self.input_cr, - self.output_cr, - self.is_32bit, - self.is_signed, - self.byte_reverse, - self.sign_extend, - ] class Adder(Elaboratable): def __init__(self, width): - self.invert_a = Signal() - self.a = Signal(width) - self.b = Signal(width) - self.o = Signal(width) + self.invert_in = Signal() + self.a = Signal(width) + self.b = Signal(width) + self.o = Signal(width, name="add_o") def elaborate(self, platform): m = Module() - with m.If(self.invert_a): + with m.If(self.invert_in): m.d.comb += self.o.eq((~self.a) + self.b) with m.Else(): m.d.comb += self.o.eq(self.a + self.b) @@ -101,9 +42,9 @@ class Adder(Elaboratable): class Subtractor(Elaboratable): def __init__(self, width): - self.a = Signal(width) - self.b = Signal(width) - self.o = Signal(width) + self.a = Signal(width) + self.b = Signal(width) + self.o = Signal(width, name="sub_o") def elaborate(self, platform): m = Module() @@ -113,9 +54,9 @@ class Subtractor(Elaboratable): class Multiplier(Elaboratable): def __init__(self, width): - self.a = Signal(width) - self.b = Signal(width) - self.o = Signal(width) + self.a = Signal(width) + self.b = Signal(width) + self.o = Signal(width, name="mul_o") def elaborate(self, platform): m = Module() @@ -126,103 +67,211 @@ class Multiplier(Elaboratable): class Shifter(Elaboratable): def __init__(self, width): self.width = width - self.a = Signal(width) - self.b = Signal(width) - self.o = Signal(width) + self.a = Signal(width) + self.b = Signal(width) + self.o = Signal(width, name="shf_o") def elaborate(self, platform): m = Module() btrunc = Signal(self.width) - m.d.comb += btrunc.eq(self.b & Const((1<> btrunc) return m +class Dummy: + pass + + +class DummyALU(Elaboratable): + def __init__(self, width): + self.p = Dummy() # make look like nmutil pipeline API + self.p.data_i = Dummy() + self.p.data_i.ctx = Dummy() + self.n = Dummy() # make look like nmutil pipeline API + self.n.data_o = Dummy() + self.p.valid_i = Signal() + self.p.ready_o = Signal() + self.n.ready_i = Signal() + self.n.valid_o = Signal() + self.counter = Signal(4) + self.op = CompCROpSubset() + i = [] + i.append(Signal(width, name="i1")) + i.append(Signal(width, name="i2")) + i.append(Signal(width, name="i3")) + self.i = Array(i) + self.a, self.b, self.c = i[0], i[1], i[2] + self.out = Array([Signal(width, name="alu_o")]) + self.o = self.out[0] + self.width = width + # more "look like nmutil pipeline API" + self.p.data_i.ctx.op = self.op + self.p.data_i.a = self.a + self.p.data_i.b = self.b + self.p.data_i.c = self.c + self.n.data_o.o = self.o + + def elaborate(self, platform): + m = Module() + + go_now = Signal(reset_less=True) # testing no-delay ALU + + with m.If(self.p.valid_i): + # input is valid. next check, if we already said "ready" or not + with m.If(~self.p.ready_o): + # we didn't say "ready" yet, so say so and initialise + m.d.sync += self.p.ready_o.eq(1) + + m.d.sync += self.o.eq(self.a) + m.d.comb += go_now.eq(1) + m.d.sync += self.counter.eq(1) + + with m.Else(): + # input says no longer valid, so drop ready as well. + # a "proper" ALU would have had to sync in the opcode and a/b ops + m.d.sync += self.p.ready_o.eq(0) + + # ok so the counter's running: when it gets to 1, fire the output + with m.If((self.counter == 1) | go_now): + # set the output as valid if the recipient is ready for it + m.d.sync += self.n.valid_o.eq(1) + with m.If(self.n.ready_i & self.n.valid_o): + m.d.sync += self.n.valid_o.eq(0) + # recipient said it was ready: reset back to known-good. + m.d.sync += self.counter.eq(0) # reset the counter + m.d.sync += self.o.eq(0) # clear the output for tidiness sake + + # countdown to 1 (transition from 1 to 0 only on acknowledgement) + with m.If(self.counter > 1): + m.d.sync += self.counter.eq(self.counter - 1) + + return m + + def __iter__(self): + yield from self.op.ports() + yield self.a + yield self.b + yield self.c + yield self.o + + def ports(self): + return list(self) + + class ALU(Elaboratable): def __init__(self, width): - self.p_valid_i = Signal() - self.p_ready_o = Signal() - self.n_ready_i = Signal() - self.n_valid_o = Signal() - self.counter = Signal(4) - self.op = CompALUOpSubset() - self.a = Signal(width) - self.b = Signal(width) - self.o = Signal(width) + self.p = Dummy() # make look like nmutil pipeline API + self.p.data_i = Dummy() + self.p.data_i.ctx = Dummy() + self.n = Dummy() # make look like nmutil pipeline API + self.n.data_o = Dummy() + self.p.valid_i = Signal() + self.p.ready_o = Signal() + self.n.ready_i = Signal() + self.n.valid_o = Signal() + self.counter = Signal(4) + self.op = CompALUOpSubset(name="op") + i = [] + i.append(Signal(width, name="i1")) + i.append(Signal(width, name="i2")) + self.i = Array(i) + self.a, self.b = i[0], i[1] + self.out = Array([Signal(width, name="alu_o")]) + self.o = self.out[0] self.width = width + # more "look like nmutil pipeline API" + self.p.data_i.ctx.op = self.op + self.p.data_i.a = self.a + self.p.data_i.b = self.b + self.n.data_o.o = self.o def elaborate(self, platform): m = Module() add = Adder(self.width) mul = Multiplier(self.width) shf = Shifter(self.width) + sub = Subtractor(self.width) m.submodules.add = add m.submodules.mul = mul m.submodules.shf = shf + m.submodules.sub = sub # really should not activate absolutely all ALU inputs like this - for mod in [add, mul, shf]: + for mod in [add, mul, shf, sub]: m.d.comb += [ mod.a.eq(self.a), mod.b.eq(self.b), ] # pass invert (and carry later) - m.d.comb += add.invert_a.eq(self.op.invert_a) + m.d.comb += add.invert_in.eq(self.op.invert_in) - go_now = Signal(reset_less=True) # testing no-delay ALU + go_now = Signal(reset_less=True) # testing no-delay ALU - with m.If(self.p_valid_i): - # input is valid. next check, if we already said "ready" or not - with m.If(~self.p_ready_o): - # we didn't say "ready" yet, so say so and initialise - m.d.sync += self.p_ready_o.eq(1) + # ALU sequencer is idle when the count is zero + alu_idle = Signal(reset_less=True) + m.d.comb += alu_idle.eq(self.counter == 0) + + # ALU sequencer is done when the count is one + alu_done = Signal(reset_less=True) + m.d.comb += alu_done.eq(self.counter == 1) + + # select handshake handling according to ALU type + with m.If(go_now): + # with a combinatorial, no-delay ALU, just pass through + # the handshake signals to the other side + m.d.comb += self.p.ready_o.eq(self.n.ready_i) + m.d.comb += self.n.valid_o.eq(self.p.valid_i) + with m.Else(): + # sequential ALU handshake: + # ready_o responds to valid_i, but only if the ALU is idle + m.d.comb += self.p.ready_o.eq(alu_idle) + # select the internally generated valid_o, above + m.d.comb += self.n.valid_o.eq(alu_done) + + # hold the ALU result until ready_o is asserted + alu_r = Signal(self.width) + + with m.If(alu_idle): + with m.If(self.p.valid_i): # as this is a "fake" pipeline, just grab the output right now - with m.If(self.op.insn_type == InternalOp.OP_ADD): - m.d.sync += self.o.eq(add.o) - with m.Elif(self.op.insn_type == InternalOp.OP_MUL_L64): - m.d.sync += self.o.eq(mul.o) - with m.Elif(self.op.insn_type == InternalOp.OP_SHR): - m.d.sync += self.o.eq(shf.o) - # TODO: SUB + with m.If(self.op.insn_type == MicrOp.OP_ADD): + m.d.sync += alu_r.eq(add.o) + with m.Elif(self.op.insn_type == MicrOp.OP_MUL_L64): + m.d.sync += alu_r.eq(mul.o) + with m.Elif(self.op.insn_type == MicrOp.OP_SHR): + m.d.sync += alu_r.eq(shf.o) + # SUB is zero-delay, no need to register # NOTE: all of these are fake, just something to test # MUL, to take 5 instructions - with m.If(self.op.insn_type == InternalOp.OP_MUL_L64): + with m.If(self.op.insn_type == MicrOp.OP_MUL_L64): m.d.sync += self.counter.eq(5) - # SHIFT to take 7 - with m.Elif(self.op.insn_type == InternalOp.OP_SHR): - m.d.sync += self.counter.eq(7) - # ADD/SUB to take 2, straight away - with m.If(self.op.insn_type == InternalOp.OP_ADD): + # SHIFT to take 1, straight away + with m.Elif(self.op.insn_type == MicrOp.OP_SHR): + m.d.sync += self.counter.eq(1) + # ADD/SUB to take 3 + with m.Elif(self.op.insn_type == MicrOp.OP_ADD): m.d.sync += self.counter.eq(3) - # others to take 1, straight away + # others to take no delay with m.Else(): m.d.comb += go_now.eq(1) - m.d.sync += self.counter.eq(1) - - with m.Else(): - # input says no longer valid, so drop ready as well. - # a "proper" ALU would have had to sync in the opcode and a/b ops - m.d.sync += self.p_ready_o.eq(0) - # ok so the counter's running: when it gets to 1, fire the output - with m.If((self.counter == 1) | go_now): - # set the output as valid if the recipient is ready for it - m.d.sync += self.n_valid_o.eq(1) - with m.If(self.n_ready_i & self.n_valid_o): - m.d.sync += self.n_valid_o.eq(0) - # recipient said it was ready: reset back to known-good. - m.d.sync += self.counter.eq(0) # reset the counter - m.d.sync += self.o.eq(0) # clear the output for tidiness sake - - # countdown to 1 (transition from 1 to 0 only on acknowledgement) - with m.If(self.counter > 1): + with m.Elif(~alu_done | self.n.ready_i): + # decrement the counter while the ALU is neither idle nor finished m.d.sync += self.counter.eq(self.counter - 1) + # choose between zero-delay output, or registered + with m.If(go_now): + m.d.comb += self.o.eq(sub.o) + # only present the result at the last computation cycle + with m.Elif(alu_done): + m.d.comb += self.o.eq(alu_r) + return m def __iter__(self): @@ -230,6 +279,10 @@ class ALU(Elaboratable): yield self.a yield self.b yield self.o + yield self.p.valid_i + yield self.p.ready_o + yield self.n.valid_o + yield self.n.ready_i def ports(self): return list(self) @@ -237,9 +290,9 @@ class ALU(Elaboratable): class BranchOp(Elaboratable): def __init__(self, width, op): - self.a = Signal(width) - self.b = Signal(width) - self.o = Signal(width) + self.a = Signal(width) + self.b = Signal(width) + self.o = Signal(width) self.op = op def elaborate(self, platform): @@ -250,15 +303,24 @@ class BranchOp(Elaboratable): class BranchALU(Elaboratable): def __init__(self, width): - self.p_valid_i = Signal() - self.p_ready_o = Signal() - self.n_ready_i = Signal() - self.n_valid_o = Signal() - self.counter = Signal(4) - self.op = Signal(2) - self.a = Signal(width) - self.b = Signal(width) - self.o = Signal(width) + self.p = Dummy() # make look like nmutil pipeline API + self.p.data_i = Dummy() + self.p.data_i.ctx = Dummy() + self.n = Dummy() # make look like nmutil pipeline API + self.n.data_o = Dummy() + self.p.valid_i = Signal() + self.p.ready_o = Signal() + self.n.ready_i = Signal() + self.n.valid_o = Signal() + self.counter = Signal(4) + self.op = Signal(2) + i = [] + i.append(Signal(width, name="i1")) + i.append(Signal(width, name="i2")) + self.i = Array(i) + self.a, self.b = i[0], i[1] + self.out = Array([Signal(width)]) + self.o = self.out[0] self.width = width def elaborate(self, platform): @@ -278,34 +340,35 @@ class BranchALU(Elaboratable): mod.b.eq(self.b), ] - go_now = Signal(reset_less=True) # testing no-delay ALU - with m.If(self.p_valid_i): + go_now = Signal(reset_less=True) # testing no-delay ALU + with m.If(self.p.valid_i): # input is valid. next check, if we already said "ready" or not - with m.If(~self.p_ready_o): + with m.If(~self.p.ready_o): # we didn't say "ready" yet, so say so and initialise - m.d.sync += self.p_ready_o.eq(1) + m.d.sync += self.p.ready_o.eq(1) # as this is a "fake" pipeline, just grab the output right now with m.Switch(self.op): for i, mod in enumerate([bgt, blt, beq, bne]): with m.Case(i): m.d.sync += self.o.eq(mod.o) - m.d.sync += self.counter.eq(5) # branch to take 5 cycles (fake) + # branch to take 5 cycles (fake) + m.d.sync += self.counter.eq(5) #m.d.comb += go_now.eq(1) with m.Else(): # input says no longer valid, so drop ready as well. # a "proper" ALU would have had to sync in the opcode and a/b ops - m.d.sync += self.p_ready_o.eq(0) + m.d.sync += self.p.ready_o.eq(0) # ok so the counter's running: when it gets to 1, fire the output with m.If((self.counter == 1) | go_now): # set the output as valid if the recipient is ready for it - m.d.sync += self.n_valid_o.eq(1) - with m.If(self.n_ready_i & self.n_valid_o): - m.d.sync += self.n_valid_o.eq(0) + m.d.sync += self.n.valid_o.eq(1) + with m.If(self.n.ready_i & self.n.valid_o): + m.d.sync += self.n.valid_o.eq(0) # recipient said it was ready: reset back to known-good. - m.d.sync += self.counter.eq(0) # reset the counter - m.d.sync += self.o.eq(0) # clear the output for tidiness sake + m.d.sync += self.counter.eq(0) # reset the counter + m.d.sync += self.o.eq(0) # clear the output for tidiness sake # countdown to 1 (transition from 1 to 0 only on acknowledgement) with m.If(self.counter > 1): @@ -322,57 +385,178 @@ class BranchALU(Elaboratable): def ports(self): return list(self) + def run_op(dut, a, b, op, inv_a=0): yield dut.a.eq(a) yield dut.b.eq(b) yield dut.op.insn_type.eq(op) - yield dut.op.invert_a.eq(inv_a) - yield dut.n_ready_i.eq(0) - yield dut.p_valid_i.eq(1) + yield dut.op.invert_in.eq(inv_a) + yield dut.n.ready_i.eq(0) + yield dut.p.valid_i.eq(1) + yield dut.n.ready_i.eq(1) yield - while True: + + # wait for the ALU to accept our input data + while not (yield dut.p.ready_o): + yield + + yield dut.p.valid_i.eq(0) + yield dut.a.eq(0) + yield dut.b.eq(0) + yield dut.op.insn_type.eq(0) + yield dut.op.invert_in.eq(0) + + # wait for the ALU to present the output data + while not (yield dut.n.valid_o): yield - n_valid_o = yield dut.n_valid_o - if n_valid_o: - break - yield + # latch the result and lower read_i result = yield dut.o - yield dut.p_valid_i.eq(0) - yield dut.n_ready_i.eq(0) - yield + yield dut.n.ready_i.eq(0) return result def alu_sim(dut): - result = yield from run_op(dut, 5, 3, InternalOp.OP_ADD) - print ("alu_sim add", result) + result = yield from run_op(dut, 5, 3, MicrOp.OP_ADD) + print("alu_sim add", result) assert (result == 8) - result = yield from run_op(dut, 2, 3, InternalOp.OP_MUL_L64) - print ("alu_sim mul", result) + result = yield from run_op(dut, 2, 3, MicrOp.OP_MUL_L64) + print("alu_sim mul", result) assert (result == 6) - result = yield from run_op(dut, 5, 3, InternalOp.OP_ADD, inv_a=1) - print ("alu_sim add-inv", result) + result = yield from run_op(dut, 5, 3, MicrOp.OP_ADD, inv_a=1) + print("alu_sim add-inv", result) assert (result == 65533) + # test zero-delay ALU + # don't have OP_SUB, so use any other + result = yield from run_op(dut, 5, 3, MicrOp.OP_NOP) + print("alu_sim sub", result) + assert (result == 2) + + result = yield from run_op(dut, 13, 2, MicrOp.OP_SHR) + print("alu_sim shr", result) + assert (result == 3) + def test_alu(): alu = ALU(width=16) - run_simulation(alu, alu_sim(alu), vcd_name='test_alusim.vcd') + run_simulation(alu, {"sync": alu_sim(alu)}, vcd_name='test_alusim.vcd') vl = rtlil.convert(alu, ports=alu.ports()) with open("test_alu.il", "w") as f: f.write(vl) +def test_alu_parallel(): + # Compare with the sequential test implementation, above. + m = Module() + m.submodules.alu = dut = ALU(width=16) + sim = Simulator(m) + sim.add_clock(1e-6) + + def send(a, b, op, inv_a=0): + # present input data and assert valid_i + yield dut.a.eq(a) + yield dut.b.eq(b) + yield dut.op.insn_type.eq(op) + yield dut.op.invert_in.eq(inv_a) + yield dut.p.valid_i.eq(1) + yield + # wait for ready_o to be asserted + while not (yield dut.p.ready_o): + yield + # clear input data and negate valid_i + # if send is called again immediately afterwards, there will be no + # visible transition (they will not be negated, after all) + yield dut.p.valid_i.eq(0) + yield dut.a.eq(0) + yield dut.b.eq(0) + yield dut.op.insn_type.eq(0) + yield dut.op.invert_in.eq(0) + + def receive(): + # signal readiness to receive data + yield dut.n.ready_i.eq(1) + yield + # wait for valid_o to be asserted + while not (yield dut.n.valid_o): + yield + # read result + result = yield dut.o + # negate ready_i + # if receive is called again immediately afterwards, there will be no + # visible transition (it will not be negated, after all) + yield dut.n.ready_i.eq(0) + return result + + def producer(): + # send a few test cases, interspersed with wait states + # note that, for this test, we do not wait for the result to be ready, + # before presenting the next input + # 5 + 3 + yield from send(5, 3, MicrOp.OP_ADD) + yield + yield + # 2 * 3 + yield from send(2, 3, MicrOp.OP_MUL_L64) + # (-5) + 3 + yield from send(5, 3, MicrOp.OP_ADD, inv_a=1) + yield + # 5 - 3 + # note that this is a zero-delay operation + yield from send(5, 3, MicrOp.OP_NOP) + yield + yield + # 13 >> 2 + yield from send(13, 2, MicrOp.OP_SHR) + + def consumer(): + # receive and check results, interspersed with wait states + # the consumer is not in step with the producer, but the + # order of the results are preserved + yield + # 5 + 3 = 8 + result = yield from receive() + assert (result == 8) + # 2 * 3 = 6 + result = yield from receive() + assert (result == 6) + yield + yield + # (-5) + 3 = -2 + result = yield from receive() + assert (result == 65533) # unsigned equivalent to -2 + # 5 - 3 = 2 + # note that this is a zero-delay operation + # this, and the previous result, will be received back-to-back + # (check the output waveform to see this) + result = yield from receive() + assert (result == 2) + yield + yield + # 13 >> 2 = 3 + result = yield from receive() + assert (result == 3) + + sim.add_sync_process(producer) + sim.add_sync_process(consumer) + sim_writer = sim.write_vcd( + "test_alu_parallel.vcd", + "test_alu_parallel.gtkw", + traces=dut.ports() + ) + with sim_writer: + sim.run() + + if __name__ == "__main__": test_alu() + test_alu_parallel() - alu = BranchALU(width=16) - vl = rtlil.convert(alu, ports=alu.ports()) - with open("test_branch_alu.il", "w") as f: - f.write(vl) - + # alu = BranchALU(width=16) + # vl = rtlil.convert(alu, ports=alu.ports()) + # with open("test_branch_alu.il", "w") as f: + # f.write(vl)