src/soc/experiment/alu_hier.py

   1 """*Experimental* ALU: based on nmigen alu_hier.py, includes branch-compare ALU
   2
This ALU is *deliberately* designed to insert (unnecessary) delays into
different operations, so as to be able to test the 6600-style matrices
and the CompUnits.  Countdown timers wait for (defined) periods before
indicating that the output is valid.
   7
   8 A "real" integer ALU would place the answers onto the output bus after
   9 only one cycle (sync)
  10 """
  11
  12 from nmigen import Elaboratable, Signal, Module, Const, Mux, Array
  13 from nmigen.hdl.rec import Record, Layout
  14 from nmigen.cli import main
  15 from nmigen.cli import verilog, rtlil
  16 from nmigen.compat.sim import run_simulation
  17
  18 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
  19 # Also, check out the cxxsim nmigen branch, and latest yosys from git
  20 from nmutil.sim_tmp_alternative import Simulator
  21
  22 from soc.decoder.power_enums import MicrOp, Function, CryIn
  23
  24 from soc.fu.alu.alu_input_record import CompALUOpSubset
  25 from soc.fu.cr.cr_input_record import CompCROpSubset
  26
  27 import operator
  28
  29
  30 class Adder(Elaboratable):
  31     def __init__(self, width):
  32         self.invert_in = Signal()
  33         self.a = Signal(width)
  34         self.b = Signal(width)
  35         self.o = Signal(width, name="add_o")
  36
  37     def elaborate(self, platform):
  38         m = Module()
  39         with m.If(self.invert_in):
  40             m.d.comb += self.o.eq((~self.a) + self.b)
  41         with m.Else():
  42             m.d.comb += self.o.eq(self.a + self.b)
  43         return m
  44
  45
  46 class Subtractor(Elaboratable):
  47     def __init__(self, width):
  48         self.a = Signal(width)
  49         self.b = Signal(width)
  50         self.o = Signal(width, name="sub_o")
  51
  52     def elaborate(self, platform):
  53         m = Module()
  54         m.d.comb += self.o.eq(self.a - self.b)
  55         return m
  56
  57
  58 class Multiplier(Elaboratable):
  59     def __init__(self, width):
  60         self.a = Signal(width)
  61         self.b = Signal(width)
  62         self.o = Signal(width, name="mul_o")
  63
  64     def elaborate(self, platform):
  65         m = Module()
  66         m.d.comb += self.o.eq(self.a * self.b)
  67         return m
  68
  69
  70 class Shifter(Elaboratable):
  71     def __init__(self, width):
  72         self.width = width
  73         self.a = Signal(width)
  74         self.b = Signal(width)
  75         self.o = Signal(width, name="shf_o")
  76
  77     def elaborate(self, platform):
  78         m = Module()
  79         btrunc = Signal(self.width)
  80         m.d.comb += btrunc.eq(self.b & Const((1 << self.width)-1))
  81         m.d.comb += self.o.eq(self.a >> btrunc)
  82         return m
  83
  84
  85 class Dummy:
  86     pass
  87
  88
class DummyALU(Elaboratable):
    """Placeholder ALU that simply copies input ``a`` to the output.

    Exposes the same attribute layout as the other ALUs in this file
    (``p``/``n`` handshake objects, ``op``, ``i``/``out`` arrays) but
    performs no computation: ``op``, ``b`` and ``c`` are only listed as
    ports and are not read by ``elaborate``.
    """

    def __init__(self, width):
        self.p = Dummy()  # make look like nmutil pipeline API
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()  # make look like nmutil pipeline API
        self.n.data_o = Dummy()
        # input-side handshake
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        # output-side handshake
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        self.counter = Signal(4)
        self.op = CompCROpSubset()
        # three operand inputs, also accessible by index through self.i
        i = []
        i.append(Signal(width, name="i1"))
        i.append(Signal(width, name="i2"))
        i.append(Signal(width, name="i3"))
        self.i = Array(i)
        self.a, self.b, self.c = i[0], i[1], i[2]
        # single output, also accessible by index through self.out
        self.out = Array([Signal(width, name="alu_o")])
        self.o = self.out[0]
        self.width = width
        # more "look like nmutil pipeline API"
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.p.data_i.c = self.c
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        """Build the handshake logic.

        NOTE: several blocks below assign the same registers; the order
        of the statements decides which assignment wins when more than
        one condition is true, so do not reorder them casually.
        """
        m = Module()

        go_now = Signal(reset_less=True)  # testing no-delay ALU

        with m.If(self.p.valid_i):
            # input is valid. next check, if we already said "ready" or not
            with m.If(~self.p.ready_o):
                # we didn't say "ready" yet, so say so and initialise
                m.d.sync += self.p.ready_o.eq(1)

                # the "result" is just a registered copy of input a
                m.d.sync += self.o.eq(self.a)
                m.d.comb += go_now.eq(1)
                m.d.sync += self.counter.eq(1)

        with m.Else():
            # input says no longer valid, so drop ready as well.
            # a "proper" ALU would have had to sync in the opcode and a/b ops
            m.d.sync += self.p.ready_o.eq(0)

        # ok so the counter's running: when it gets to 1, fire the output
        with m.If((self.counter == 1) | go_now):
            # flag the output as valid (this does not look at ready_i)
            m.d.sync += self.n.valid_o.eq(1)
        with m.If(self.n.ready_i & self.n.valid_o):
            m.d.sync += self.n.valid_o.eq(0)
            # recipient said it was ready: reset back to known-good.
            m.d.sync += self.counter.eq(0)  # reset the counter
            m.d.sync += self.o.eq(0)  # clear the output for tidiness sake

        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
        with m.If(self.counter > 1):
            m.d.sync += self.counter.eq(self.counter - 1)

        return m

    def __iter__(self):
        """Yield the operation ports, then the operands and the output."""
        yield from self.op.ports()
        yield self.a
        yield self.b
        yield self.c
        yield self.o

    def ports(self):
        """Return the signals produced by ``__iter__`` as a list."""
        return list(self)
 163
 164
class ALU(Elaboratable):
    """Test ALU with artificial, per-operation delays.

    Wraps an Adder, Multiplier, Shifter and Subtractor.  ADD, MUL_L64 and
    SHR results are registered and presented when a countdown reaches 1;
    any other operation is treated as a zero-delay subtract whose result
    and handshake are passed straight through combinatorially.
    """

    def __init__(self, width):
        self.p = Dummy()  # make look like nmutil pipeline API
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()  # make look like nmutil pipeline API
        self.n.data_o = Dummy()
        # input-side handshake
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        # output-side handshake
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        # sequencer state: 0 = idle, 1 = result being presented
        self.counter = Signal(4)
        self.op = CompALUOpSubset(name="op")
        # two operand inputs, also accessible by index through self.i
        i = []
        i.append(Signal(width, name="i1"))
        i.append(Signal(width, name="i2"))
        self.i = Array(i)
        self.a, self.b = i[0], i[1]
        # single output, also accessible by index through self.out
        self.out = Array([Signal(width, name="alu_o")])
        self.o = self.out[0]
        self.width = width
        # more "look like nmutil pipeline API"
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        """Instantiate the arithmetic submodules and the delay sequencer."""
        m = Module()
        add = Adder(self.width)
        mul = Multiplier(self.width)
        shf = Shifter(self.width)
        sub = Subtractor(self.width)

        m.submodules.add = add
        m.submodules.mul = mul
        m.submodules.shf = shf
        m.submodules.sub = sub

        # really should not activate absolutely all ALU inputs like this
        for mod in [add, mul, shf, sub]:
            m.d.comb += [
                mod.a.eq(self.a),
                mod.b.eq(self.b),
            ]

        # pass invert (and carry later)
        m.d.comb += add.invert_in.eq(self.op.invert_in)

        go_now = Signal(reset_less=True)  # testing no-delay ALU

        # ALU sequencer is idle when the count is zero
        alu_idle = Signal(reset_less=True)
        m.d.comb += alu_idle.eq(self.counter == 0)

        # ALU sequencer is done when the count is one
        alu_done = Signal(reset_less=True)
        m.d.comb += alu_done.eq(self.counter == 1)

        # select handshake handling according to ALU type
        with m.If(go_now):
            # with a combinatorial, no-delay ALU, just pass through
            # the handshake signals to the other side
            m.d.comb += self.p.ready_o.eq(self.n.ready_i)
            m.d.comb += self.n.valid_o.eq(self.p.valid_i)
        with m.Else():
            # sequential ALU handshake:
            # ready_o is asserted whenever the ALU is idle (count is zero)
            m.d.comb += self.p.ready_o.eq(alu_idle)
            # valid_o is asserted while the ALU is done (count is one)
            m.d.comb += self.n.valid_o.eq(alu_done)

        # register holding the result from the cycle the operation is
        # accepted until it is presented on the output (see end of method)
        alu_r = Signal(self.width)

        with m.If(alu_idle):
            with m.If(self.p.valid_i):

                # as this is a "fake" pipeline, just grab the output right now
                with m.If(self.op.insn_type == MicrOp.OP_ADD):
                    m.d.sync += alu_r.eq(add.o)
                with m.Elif(self.op.insn_type == MicrOp.OP_MUL_L64):
                    m.d.sync += alu_r.eq(mul.o)
                with m.Elif(self.op.insn_type == MicrOp.OP_SHR):
                    m.d.sync += alu_r.eq(shf.o)
                # SUB is zero-delay, no need to register

                # NOTE: all of these are fake, just something to test

                # MUL: load the countdown with 5
                with m.If(self.op.insn_type == MicrOp.OP_MUL_L64):
                    m.d.sync += self.counter.eq(5)
                # SHIFT: load the countdown with 1, i.e. done straight away
                with m.Elif(self.op.insn_type == MicrOp.OP_SHR):
                    m.d.sync += self.counter.eq(1)
                # ADD: load the countdown with 3
                with m.Elif(self.op.insn_type == MicrOp.OP_ADD):
                    m.d.sync += self.counter.eq(3)
                # others to take no delay
                with m.Else():
                    m.d.comb += go_now.eq(1)

        with m.Elif(~alu_done | self.n.ready_i):
            # not idle: keep counting down.  once done (count is one), the
            # final step back to idle only happens when ready_i is asserted
            m.d.sync += self.counter.eq(self.counter - 1)

        # choose between zero-delay output, or registered
        with m.If(go_now):
            # every zero-delay operation yields the subtractor result
            m.d.comb += self.o.eq(sub.o)
        # only present the result at the last computation cycle
        with m.Elif(alu_done):
            m.d.comb += self.o.eq(alu_r)

        return m

    def __iter__(self):
        """Yield operation ports, operands, output and handshake signals."""
        yield from self.op.ports()
        yield self.a
        yield self.b
        yield self.o
        yield self.p.valid_i
        yield self.p.ready_o
        yield self.n.valid_o
        yield self.n.ready_i

    def ports(self):
        """Return the signals produced by ``__iter__`` as a list."""
        return list(self)
 292
 293
 294 class BranchOp(Elaboratable):
 295     def __init__(self, width, op):
 296         self.a = Signal(width)
 297         self.b = Signal(width)
 298         self.o = Signal(width)
 299         self.op = op
 300
 301     def elaborate(self, platform):
 302         m = Module()
 303         m.d.comb += self.o.eq(Mux(self.op(self.a, self.b), 1, 0))
 304         return m
 305
 306
class BranchALU(Elaboratable):
    """Test branch-compare ALU with a fixed artificial delay.

    The 2-bit ``op`` selects one of four comparisons of ``a`` against
    ``b``: 0 = greater-than, 1 = less-than, 2 = equal, 3 = not-equal.
    The comparison result is registered when the input is accepted and
    flagged valid once a countdown, loaded with 5, reaches 1.
    """

    def __init__(self, width):
        self.p = Dummy()  # make look like nmutil pipeline API
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()  # make look like nmutil pipeline API
        self.n.data_o = Dummy()
        # input-side handshake
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        # output-side handshake
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        self.counter = Signal(4)
        # comparison selector (see class docstring for the encoding)
        self.op = Signal(2)
        # two operand inputs, also accessible by index through self.i
        i = []
        i.append(Signal(width, name="i1"))
        i.append(Signal(width, name="i2"))
        self.i = Array(i)
        self.a, self.b = i[0], i[1]
        # single output, also accessible by index through self.out
        self.out = Array([Signal(width)])
        self.o = self.out[0]
        self.width = width

    def elaborate(self, platform):
        """Instantiate the four comparators and the handshake logic.

        NOTE: several blocks below assign the same registers; the order
        of the statements decides which assignment wins when more than
        one condition is true, so do not reorder them casually.
        """
        m = Module()
        bgt = BranchOp(self.width, operator.gt)
        blt = BranchOp(self.width, operator.lt)
        beq = BranchOp(self.width, operator.eq)
        bne = BranchOp(self.width, operator.ne)

        m.submodules.bgt = bgt
        m.submodules.blt = blt
        m.submodules.beq = beq
        m.submodules.bne = bne
        # every comparator sees both operands all of the time
        for mod in [bgt, blt, beq, bne]:
            m.d.comb += [
                mod.a.eq(self.a),
                mod.b.eq(self.b),
            ]

        # never driven here (its only assignment is commented out below),
        # so it stays at its default value
        go_now = Signal(reset_less=True)  # testing no-delay ALU
        with m.If(self.p.valid_i):
            # input is valid. next check, if we already said "ready" or not
            with m.If(~self.p.ready_o):
                # we didn't say "ready" yet, so say so and initialise
                m.d.sync += self.p.ready_o.eq(1)

                # as this is a "fake" pipeline, just grab the output right now
                # (the case index matches the position in the list below)
                with m.Switch(self.op):
                    for i, mod in enumerate([bgt, blt, beq, bne]):
                        with m.Case(i):
                            m.d.sync += self.o.eq(mod.o)
                # branch to take 5 cycles (fake)
                m.d.sync += self.counter.eq(5)
                #m.d.comb += go_now.eq(1)
        with m.Else():
            # input says no longer valid, so drop ready as well.
            # a "proper" ALU would have had to sync in the opcode and a/b ops
            m.d.sync += self.p.ready_o.eq(0)

        # ok so the counter's running: when it gets to 1, fire the output
        with m.If((self.counter == 1) | go_now):
            # flag the output as valid (this does not look at ready_i)
            m.d.sync += self.n.valid_o.eq(1)
        with m.If(self.n.ready_i & self.n.valid_o):
            m.d.sync += self.n.valid_o.eq(0)
            # recipient said it was ready: reset back to known-good.
            m.d.sync += self.counter.eq(0)  # reset the counter
            m.d.sync += self.o.eq(0)  # clear the output for tidiness sake

        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
        with m.If(self.counter > 1):
            m.d.sync += self.counter.eq(self.counter - 1)

        return m

    def __iter__(self):
        """Yield the selector, the two operands and the output."""
        yield self.op
        yield self.a
        yield self.b
        yield self.o

    def ports(self):
        """Return the signals produced by ``__iter__`` as a list."""
        return list(self)
 390
 391
 392 def run_op(dut, a, b, op, inv_a=0):
 393     yield dut.a.eq(a)
 394     yield dut.b.eq(b)
 395     yield dut.op.insn_type.eq(op)
 396     yield dut.op.invert_in.eq(inv_a)
 397     yield dut.n.ready_i.eq(0)
 398     yield dut.p.valid_i.eq(1)
 399     yield dut.n.ready_i.eq(1)
 400     yield
 401
 402     # wait for the ALU to accept our input data
 403     while not (yield dut.p.ready_o):
 404         yield
 405
 406     yield dut.p.valid_i.eq(0)
 407     yield dut.a.eq(0)
 408     yield dut.b.eq(0)
 409     yield dut.op.insn_type.eq(0)
 410     yield dut.op.invert_in.eq(0)
 411
 412     # wait for the ALU to present the output data
 413     while not (yield dut.n.valid_o):
 414         yield
 415
 416     # latch the result and lower read_i
 417     result = yield dut.o
 418     yield dut.n.ready_i.eq(0)
 419
 420     return result
 421
 422
 423 def alu_sim(dut):
 424     result = yield from run_op(dut, 5, 3, MicrOp.OP_ADD)
 425     print("alu_sim add", result)
 426     assert (result == 8)
 427
 428     result = yield from run_op(dut, 2, 3, MicrOp.OP_MUL_L64)
 429     print("alu_sim mul", result)
 430     assert (result == 6)
 431
 432     result = yield from run_op(dut, 5, 3, MicrOp.OP_ADD, inv_a=1)
 433     print("alu_sim add-inv", result)
 434     assert (result == 65533)
 435
 436     # test zero-delay ALU
 437     # don't have OP_SUB, so use any other
 438     result = yield from run_op(dut, 5, 3, MicrOp.OP_NOP)
 439     print("alu_sim sub", result)
 440     assert (result == 2)
 441
 442     result = yield from run_op(dut, 13, 2, MicrOp.OP_SHR)
 443     print("alu_sim shr", result)
 444     assert (result == 3)
 445
 446
 447 def test_alu():
 448     alu = ALU(width=16)
 449     run_simulation(alu, {"sync": alu_sim(alu)}, vcd_name='test_alusim.vcd')
 450
 451     vl = rtlil.convert(alu, ports=alu.ports())
 452     with open("test_alu.il", "w") as f:
 453         f.write(vl)
 454
 455
def test_alu_parallel():
    """Test the ALU with independent producer and consumer processes.

    The producer feeds operations into the input side while the consumer
    collects and checks results on the output side; each side inserts
    its own wait states.  A VCD/GTKW waveform is written alongside.
    """
    # Compare with the sequential test implementation, above.
    m = Module()
    m.submodules.alu = dut = ALU(width=16)
    sim = Simulator(m)
    sim.add_clock(1e-6)

    def send(a, b, op, inv_a=0):
        """Present one operation and wait until the ALU accepts it."""
        # present input data and assert valid_i
        yield dut.a.eq(a)
        yield dut.b.eq(b)
        yield dut.op.insn_type.eq(op)
        yield dut.op.invert_in.eq(inv_a)
        yield dut.p.valid_i.eq(1)
        yield
        # wait for ready_o to be asserted
        while not (yield dut.p.ready_o):
            yield
        # clear input data and negate valid_i
        # if send is called again immediately afterwards, there will be no
        # visible transition (they will not be negated, after all)
        yield dut.p.valid_i.eq(0)
        yield dut.a.eq(0)
        yield dut.b.eq(0)
        yield dut.op.insn_type.eq(0)
        yield dut.op.invert_in.eq(0)

    def receive():
        """Wait for one result and return the value read from ``dut.o``."""
        # signal readiness to receive data
        yield dut.n.ready_i.eq(1)
        yield
        # wait for valid_o to be asserted
        while not (yield dut.n.valid_o):
            yield
        # read result
        result = yield dut.o
        # negate ready_i
        # if receive is called again immediately afterwards, there will be no
        # visible transition (it will not be negated, after all)
        yield dut.n.ready_i.eq(0)
        return result

    def producer():
        """Input-side process: issue the test operations in order."""
        # send a few test cases, interspersed with wait states
        # note that, for this test, we do not wait for the result to be ready,
        # before presenting the next input
        # 5 + 3
        yield from send(5, 3, MicrOp.OP_ADD)
        yield
        yield
        # 2 * 3
        yield from send(2, 3, MicrOp.OP_MUL_L64)
        # (~5) + 3: invert_in is a bitwise inversion of a, not a negation
        yield from send(5, 3, MicrOp.OP_ADD, inv_a=1)
        yield
        # 5 - 3
        # note that this is a zero-delay operation
        yield from send(5, 3, MicrOp.OP_NOP)
        yield
        yield
        # 13 >> 2
        yield from send(13, 2, MicrOp.OP_SHR)

    def consumer():
        """Output-side process: collect the results and check each one."""
        # receive and check results, interspersed with wait states
        # the consumer is not in step with the producer, but the
        # order of the results are preserved
        yield
        # 5 + 3 = 8
        result = yield from receive()
        assert (result == 8)
        # 2 * 3 = 6
        result = yield from receive()
        assert (result == 6)
        yield
        yield
        # (~5) + 3 = -3
        result = yield from receive()
        assert (result == 65533)  # 16-bit unsigned equivalent to -3
        # 5 - 3 = 2
        # note that this is a zero-delay operation
        # this, and the previous result, will be received back-to-back
        # (check the output waveform to see this)
        result = yield from receive()
        assert (result == 2)
        yield
        yield
        # 13 >> 2 = 3
        result = yield from receive()
        assert (result == 3)

    sim.add_sync_process(producer)
    sim.add_sync_process(consumer)
    sim_writer = sim.write_vcd(
        "test_alu_parallel.vcd",
        "test_alu_parallel.gtkw",
        traces=dut.ports()
    )
    with sim_writer:
        sim.run()
 556
 557
if __name__ == "__main__":
    # when run as a script, execute both the sequential and parallel tests
    test_alu()
    test_alu_parallel()

    # disabled: RTLIL dump of the branch ALU, kept for reference
    # alu = BranchALU(width=16)
    # vl = rtlil.convert(alu, ports=alu.ports())
    # with open("test_branch_alu.il", "w") as f:
    #     f.write(vl)