src/soc/experiment/alu_hier.py

   1 """*Experimental* ALU: based on nmigen alu_hier.py, includes branch-compare ALU
   2
   3 This ALU is *deliberately* designed to add in (unnecessary) delays into
   4 different operations so as to be able to test the 6600-style matrices
   5 and the CompUnits.  Countdown timers wait for (defined) periods before
   6 indicating that the output is valid
   7
   8 A "real" integer ALU would place the answers onto the output bus after
   9 only one cycle (sync)
  10 """
  11
  12 from nmigen import Elaboratable, Signal, Module, Const, Mux, Array
  13 from nmigen.hdl.rec import Record, Layout
  14 from nmigen.cli import main
  15 from nmigen.cli import verilog, rtlil
  16 from nmigen.compat.sim import run_simulation
  17 from nmutil.gtkw import write_gtkw
  18
  19 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
  20 # Also, check out the cxxsim nmigen branch, and latest yosys from git
  21 from nmutil.sim_tmp_alternative import (Simulator, nmigen_sim_top_module,
  22                                         is_engine_pysim)
  23
  24 from soc.decoder.power_enums import MicrOp, Function, CryIn
  25
  26 from soc.fu.alu.alu_input_record import CompALUOpSubset
  27 from soc.fu.cr.cr_input_record import CompCROpSubset
  28
  29 import operator
  30
  31
  32 class Adder(Elaboratable):
  33     def __init__(self, width):
  34         self.invert_in = Signal()
  35         self.a = Signal(width)
  36         self.b = Signal(width)
  37         self.o = Signal(width, name="add_o")
  38
  39     def elaborate(self, platform):
  40         m = Module()
  41         with m.If(self.invert_in):
  42             m.d.comb += self.o.eq((~self.a) + self.b)
  43         with m.Else():
  44             m.d.comb += self.o.eq(self.a + self.b)
  45         return m
  46
  47
  48 class Subtractor(Elaboratable):
  49     def __init__(self, width):
  50         self.a = Signal(width)
  51         self.b = Signal(width)
  52         self.o = Signal(width, name="sub_o")
  53
  54     def elaborate(self, platform):
  55         m = Module()
  56         m.d.comb += self.o.eq(self.a - self.b)
  57         return m
  58
  59
  60 class Multiplier(Elaboratable):
  61     def __init__(self, width):
  62         self.a = Signal(width)
  63         self.b = Signal(width)
  64         self.o = Signal(width, name="mul_o")
  65
  66     def elaborate(self, platform):
  67         m = Module()
  68         m.d.comb += self.o.eq(self.a * self.b)
  69         return m
  70
  71
  72 class Shifter(Elaboratable):
  73     def __init__(self, width):
  74         self.width = width
  75         self.a = Signal(width)
  76         self.b = Signal(width)
  77         self.o = Signal(width, name="shf_o")
  78
  79     def elaborate(self, platform):
  80         m = Module()
  81         btrunc = Signal(self.width)
  82         m.d.comb += btrunc.eq(self.b & Const((1 << self.width)-1))
  83         m.d.comb += self.o.eq(self.a >> btrunc)
  84         return m
  85
  86
  87 class Dummy:
  88     pass
  89
  90
  91 class DummyALU(Elaboratable):
  92     def __init__(self, width):
  93         self.p = Dummy()  # make look like nmutil pipeline API
  94         self.p.data_i = Dummy()
  95         self.p.data_i.ctx = Dummy()
  96         self.n = Dummy()  # make look like nmutil pipeline API
  97         self.n.data_o = Dummy()
  98         self.p.valid_i = Signal()
  99         self.p.ready_o = Signal()
 100         self.n.ready_i = Signal()
 101         self.n.valid_o = Signal()
 102         self.counter = Signal(4)
 103         self.op = CompCROpSubset()
 104         i = []
 105         i.append(Signal(width, name="i1"))
 106         i.append(Signal(width, name="i2"))
 107         i.append(Signal(width, name="i3"))
 108         self.i = Array(i)
 109         self.a, self.b, self.c = i[0], i[1], i[2]
 110         self.out = Array([Signal(width, name="alu_o")])
 111         self.o = self.out[0]
 112         self.width = width
 113         # more "look like nmutil pipeline API"
 114         self.p.data_i.ctx.op = self.op
 115         self.p.data_i.a = self.a
 116         self.p.data_i.b = self.b
 117         self.p.data_i.c = self.c
 118         self.n.data_o.o = self.o
 119
 120     def elaborate(self, platform):
 121         m = Module()
 122
 123         go_now = Signal(reset_less=True)  # testing no-delay ALU
 124
 125         with m.If(self.p.valid_i):
 126             # input is valid. next check, if we already said "ready" or not
 127             with m.If(~self.p.ready_o):
 128                 # we didn't say "ready" yet, so say so and initialise
 129                 m.d.sync += self.p.ready_o.eq(1)
 130
 131                 m.d.sync += self.o.eq(self.a)
 132                 m.d.comb += go_now.eq(1)
 133                 m.d.sync += self.counter.eq(1)
 134
 135         with m.Else():
 136             # input says no longer valid, so drop ready as well.
 137             # a "proper" ALU would have had to sync in the opcode and a/b ops
 138             m.d.sync += self.p.ready_o.eq(0)
 139
 140         # ok so the counter's running: when it gets to 1, fire the output
 141         with m.If((self.counter == 1) | go_now):
 142             # set the output as valid if the recipient is ready for it
 143             m.d.sync += self.n.valid_o.eq(1)
 144         with m.If(self.n.ready_i & self.n.valid_o):
 145             m.d.sync += self.n.valid_o.eq(0)
 146             # recipient said it was ready: reset back to known-good.
 147             m.d.sync += self.counter.eq(0)  # reset the counter
 148             m.d.sync += self.o.eq(0)  # clear the output for tidiness sake
 149
 150         # countdown to 1 (transition from 1 to 0 only on acknowledgement)
 151         with m.If(self.counter > 1):
 152             m.d.sync += self.counter.eq(self.counter - 1)
 153
 154         return m
 155
 156     def __iter__(self):
 157         yield from self.op.ports()
 158         yield self.a
 159         yield self.b
 160         yield self.c
 161         yield self.o
 162
 163     def ports(self):
 164         return list(self)
 165
 166
 167 class ALU(Elaboratable):
 168     def __init__(self, width):
 169         self.p = Dummy()  # make look like nmutil pipeline API
 170         self.p.data_i = Dummy()
 171         self.p.data_i.ctx = Dummy()
 172         self.n = Dummy()  # make look like nmutil pipeline API
 173         self.n.data_o = Dummy()
 174         self.p.valid_i = Signal()
 175         self.p.ready_o = Signal()
 176         self.n.ready_i = Signal()
 177         self.n.valid_o = Signal()
 178         self.counter = Signal(4)
 179         self.op = CompALUOpSubset(name="op")
 180         i = []
 181         i.append(Signal(width, name="i1"))
 182         i.append(Signal(width, name="i2"))
 183         self.i = Array(i)
 184         self.a, self.b = i[0], i[1]
 185         self.out = Array([Signal(width, name="alu_o")])
 186         self.o = self.out[0]
 187         self.width = width
 188         # more "look like nmutil pipeline API"
 189         self.p.data_i.ctx.op = self.op
 190         self.p.data_i.a = self.a
 191         self.p.data_i.b = self.b
 192         self.n.data_o.o = self.o
 193
 194     def elaborate(self, platform):
 195         m = Module()
 196         add = Adder(self.width)
 197         mul = Multiplier(self.width)
 198         shf = Shifter(self.width)
 199         sub = Subtractor(self.width)
 200
 201         m.submodules.add = add
 202         m.submodules.mul = mul
 203         m.submodules.shf = shf
 204         m.submodules.sub = sub
 205
 206         # really should not activate absolutely all ALU inputs like this
 207         for mod in [add, mul, shf, sub]:
 208             m.d.comb += [
 209                 mod.a.eq(self.a),
 210                 mod.b.eq(self.b),
 211             ]
 212
 213         # pass invert (and carry later)
 214         m.d.comb += add.invert_in.eq(self.op.invert_in)
 215
 216         go_now = Signal(reset_less=True)  # testing no-delay ALU
 217
 218         # ALU sequencer is idle when the count is zero
 219         alu_idle = Signal(reset_less=True)
 220         m.d.comb += alu_idle.eq(self.counter == 0)
 221
 222         # ALU sequencer is done when the count is one
 223         alu_done = Signal(reset_less=True)
 224         m.d.comb += alu_done.eq(self.counter == 1)
 225
 226         # select handshake handling according to ALU type
 227         with m.If(go_now):
 228             # with a combinatorial, no-delay ALU, just pass through
 229             # the handshake signals to the other side
 230             m.d.comb += self.p.ready_o.eq(self.n.ready_i)
 231             m.d.comb += self.n.valid_o.eq(self.p.valid_i)
 232         with m.Else():
 233             # sequential ALU handshake:
 234             # ready_o responds to valid_i, but only if the ALU is idle
 235             m.d.comb += self.p.ready_o.eq(alu_idle)
 236             # select the internally generated valid_o, above
 237             m.d.comb += self.n.valid_o.eq(alu_done)
 238
 239         # hold the ALU result until ready_o is asserted
 240         alu_r = Signal(self.width)
 241
 242         with m.If(alu_idle):
 243             with m.If(self.p.valid_i):
 244
 245                 # as this is a "fake" pipeline, just grab the output right now
 246                 with m.If(self.op.insn_type == MicrOp.OP_ADD):
 247                     m.d.sync += alu_r.eq(add.o)
 248                 with m.Elif(self.op.insn_type == MicrOp.OP_MUL_L64):
 249                     m.d.sync += alu_r.eq(mul.o)
 250                 with m.Elif(self.op.insn_type == MicrOp.OP_SHR):
 251                     m.d.sync += alu_r.eq(shf.o)
 252                 # SUB is zero-delay, no need to register
 253
 254                 # NOTE: all of these are fake, just something to test
 255
 256                 # MUL, to take 5 instructions
 257                 with m.If(self.op.insn_type == MicrOp.OP_MUL_L64):
 258                     m.d.sync += self.counter.eq(5)
 259                 # SHIFT to take 1, straight away
 260                 with m.Elif(self.op.insn_type == MicrOp.OP_SHR):
 261                     m.d.sync += self.counter.eq(1)
 262                 # ADD/SUB to take 3
 263                 with m.Elif(self.op.insn_type == MicrOp.OP_ADD):
 264                     m.d.sync += self.counter.eq(3)
 265                 # others to take no delay
 266                 with m.Else():
 267                     m.d.comb += go_now.eq(1)
 268
 269         with m.Elif(~alu_done | self.n.ready_i):
 270             # decrement the counter while the ALU is neither idle nor finished
 271             m.d.sync += self.counter.eq(self.counter - 1)
 272
 273         # choose between zero-delay output, or registered
 274         with m.If(go_now):
 275             m.d.comb += self.o.eq(sub.o)
 276         # only present the result at the last computation cycle
 277         with m.Elif(alu_done):
 278             m.d.comb += self.o.eq(alu_r)
 279
 280         return m
 281
 282     def __iter__(self):
 283         yield from self.op.ports()
 284         yield self.a
 285         yield self.b
 286         yield self.o
 287         yield self.p.valid_i
 288         yield self.p.ready_o
 289         yield self.n.valid_o
 290         yield self.n.ready_i
 291
 292     def ports(self):
 293         return list(self)
 294
 295
 296 class BranchOp(Elaboratable):
 297     def __init__(self, width, op):
 298         self.a = Signal(width)
 299         self.b = Signal(width)
 300         self.o = Signal(width)
 301         self.op = op
 302
 303     def elaborate(self, platform):
 304         m = Module()
 305         m.d.comb += self.o.eq(Mux(self.op(self.a, self.b), 1, 0))
 306         return m
 307
 308
 309 class BranchALU(Elaboratable):
 310     def __init__(self, width):
 311         self.p = Dummy()  # make look like nmutil pipeline API
 312         self.p.data_i = Dummy()
 313         self.p.data_i.ctx = Dummy()
 314         self.n = Dummy()  # make look like nmutil pipeline API
 315         self.n.data_o = Dummy()
 316         self.p.valid_i = Signal()
 317         self.p.ready_o = Signal()
 318         self.n.ready_i = Signal()
 319         self.n.valid_o = Signal()
 320         self.counter = Signal(4)
 321         self.op = Signal(2)
 322         i = []
 323         i.append(Signal(width, name="i1"))
 324         i.append(Signal(width, name="i2"))
 325         self.i = Array(i)
 326         self.a, self.b = i[0], i[1]
 327         self.out = Array([Signal(width)])
 328         self.o = self.out[0]
 329         self.width = width
 330
 331     def elaborate(self, platform):
 332         m = Module()
 333         bgt = BranchOp(self.width, operator.gt)
 334         blt = BranchOp(self.width, operator.lt)
 335         beq = BranchOp(self.width, operator.eq)
 336         bne = BranchOp(self.width, operator.ne)
 337
 338         m.submodules.bgt = bgt
 339         m.submodules.blt = blt
 340         m.submodules.beq = beq
 341         m.submodules.bne = bne
 342         for mod in [bgt, blt, beq, bne]:
 343             m.d.comb += [
 344                 mod.a.eq(self.a),
 345                 mod.b.eq(self.b),
 346             ]
 347
 348         go_now = Signal(reset_less=True)  # testing no-delay ALU
 349         with m.If(self.p.valid_i):
 350             # input is valid. next check, if we already said "ready" or not
 351             with m.If(~self.p.ready_o):
 352                 # we didn't say "ready" yet, so say so and initialise
 353                 m.d.sync += self.p.ready_o.eq(1)
 354
 355                 # as this is a "fake" pipeline, just grab the output right now
 356                 with m.Switch(self.op):
 357                     for i, mod in enumerate([bgt, blt, beq, bne]):
 358                         with m.Case(i):
 359                             m.d.sync += self.o.eq(mod.o)
 360                 # branch to take 5 cycles (fake)
 361                 m.d.sync += self.counter.eq(5)
 362                 #m.d.comb += go_now.eq(1)
 363         with m.Else():
 364             # input says no longer valid, so drop ready as well.
 365             # a "proper" ALU would have had to sync in the opcode and a/b ops
 366             m.d.sync += self.p.ready_o.eq(0)
 367
 368         # ok so the counter's running: when it gets to 1, fire the output
 369         with m.If((self.counter == 1) | go_now):
 370             # set the output as valid if the recipient is ready for it
 371             m.d.sync += self.n.valid_o.eq(1)
 372         with m.If(self.n.ready_i & self.n.valid_o):
 373             m.d.sync += self.n.valid_o.eq(0)
 374             # recipient said it was ready: reset back to known-good.
 375             m.d.sync += self.counter.eq(0)  # reset the counter
 376             m.d.sync += self.o.eq(0)  # clear the output for tidiness sake
 377
 378         # countdown to 1 (transition from 1 to 0 only on acknowledgement)
 379         with m.If(self.counter > 1):
 380             m.d.sync += self.counter.eq(self.counter - 1)
 381
 382         return m
 383
 384     def __iter__(self):
 385         yield self.op
 386         yield self.a
 387         yield self.b
 388         yield self.o
 389
 390     def ports(self):
 391         return list(self)
 392
 393
 394 def run_op(dut, a, b, op, inv_a=0):
 395     yield dut.a.eq(a)
 396     yield dut.b.eq(b)
 397     yield dut.op.insn_type.eq(op)
 398     yield dut.op.invert_in.eq(inv_a)
 399     yield dut.n.ready_i.eq(0)
 400     yield dut.p.valid_i.eq(1)
 401     yield dut.n.ready_i.eq(1)
 402     yield
 403
 404     # wait for the ALU to accept our input data
 405     while not (yield dut.p.ready_o):
 406         yield
 407
 408     yield dut.p.valid_i.eq(0)
 409     yield dut.a.eq(0)
 410     yield dut.b.eq(0)
 411     yield dut.op.insn_type.eq(0)
 412     yield dut.op.invert_in.eq(0)
 413
 414     # wait for the ALU to present the output data
 415     while not (yield dut.n.valid_o):
 416         yield
 417
 418     # latch the result and lower read_i
 419     result = yield dut.o
 420     yield dut.n.ready_i.eq(0)
 421
 422     return result
 423
 424
 425 def alu_sim(dut):
 426     result = yield from run_op(dut, 5, 3, MicrOp.OP_ADD)
 427     print("alu_sim add", result)
 428     assert (result == 8)
 429
 430     result = yield from run_op(dut, 2, 3, MicrOp.OP_MUL_L64)
 431     print("alu_sim mul", result)
 432     assert (result == 6)
 433
 434     result = yield from run_op(dut, 5, 3, MicrOp.OP_ADD, inv_a=1)
 435     print("alu_sim add-inv", result)
 436     assert (result == 65533)
 437
 438     # test zero-delay ALU
 439     # don't have OP_SUB, so use any other
 440     result = yield from run_op(dut, 5, 3, MicrOp.OP_NOP)
 441     print("alu_sim sub", result)
 442     assert (result == 2)
 443
 444     result = yield from run_op(dut, 13, 2, MicrOp.OP_SHR)
 445     print("alu_sim shr", result)
 446     assert (result == 3)
 447
 448
 449 def test_alu():
 450     alu = ALU(width=16)
 451     write_alu_gtkw("test_alusim.gtkw", clk_period=10e-9)
 452     run_simulation(alu, {"sync": alu_sim(alu)}, vcd_name='test_alusim.vcd')
 453
 454     vl = rtlil.convert(alu, ports=alu.ports())
 455     with open("test_alu.il", "w") as f:
 456         f.write(vl)
 457
 458
 459 def test_alu_parallel():
 460     # Compare with the sequential test implementation, above.
 461     m = Module()
 462     m.submodules.alu = dut = ALU(width=16)
 463     write_alu_gtkw("test_alu_parallel.gtkw", sub_module='alu',
 464                    pysim=is_engine_pysim())
 465
 466     sim = Simulator(m)
 467     sim.add_clock(1e-6)
 468
 469     def send(a, b, op, inv_a=0):
 470         # present input data and assert valid_i
 471         yield dut.a.eq(a)
 472         yield dut.b.eq(b)
 473         yield dut.op.insn_type.eq(op)
 474         yield dut.op.invert_in.eq(inv_a)
 475         yield dut.p.valid_i.eq(1)
 476         yield
 477         # wait for ready_o to be asserted
 478         while not (yield dut.p.ready_o):
 479             yield
 480         # clear input data and negate valid_i
 481         # if send is called again immediately afterwards, there will be no
 482         # visible transition (they will not be negated, after all)
 483         yield dut.p.valid_i.eq(0)
 484         yield dut.a.eq(0)
 485         yield dut.b.eq(0)
 486         yield dut.op.insn_type.eq(0)
 487         yield dut.op.invert_in.eq(0)
 488
 489     def receive():
 490         # signal readiness to receive data
 491         yield dut.n.ready_i.eq(1)
 492         yield
 493         # wait for valid_o to be asserted
 494         while not (yield dut.n.valid_o):
 495             yield
 496         # read result
 497         result = yield dut.o
 498         # negate ready_i
 499         # if receive is called again immediately afterwards, there will be no
 500         # visible transition (it will not be negated, after all)
 501         yield dut.n.ready_i.eq(0)
 502         return result
 503
 504     def producer():
 505         # send a few test cases, interspersed with wait states
 506         # note that, for this test, we do not wait for the result to be ready,
 507         # before presenting the next input
 508         # 5 + 3
 509         yield from send(5, 3, MicrOp.OP_ADD)
 510         yield
 511         yield
 512         # 2 * 3
 513         yield from send(2, 3, MicrOp.OP_MUL_L64)
 514         # (-5) + 3
 515         yield from send(5, 3, MicrOp.OP_ADD, inv_a=1)
 516         yield
 517         # 5 - 3
 518         # note that this is a zero-delay operation
 519         yield from send(5, 3, MicrOp.OP_NOP)
 520         yield
 521         yield
 522         # 13 >> 2
 523         yield from send(13, 2, MicrOp.OP_SHR)
 524
 525     def consumer():
 526         # receive and check results, interspersed with wait states
 527         # the consumer is not in step with the producer, but the
 528         # order of the results are preserved
 529         yield
 530         # 5 + 3 = 8
 531         result = yield from receive()
 532         assert (result == 8)
 533         # 2 * 3 = 6
 534         result = yield from receive()
 535         assert (result == 6)
 536         yield
 537         yield
 538         # (-5) + 3 = -2
 539         result = yield from receive()
 540         assert (result == 65533)  # unsigned equivalent to -2
 541         # 5 - 3 = 2
 542         # note that this is a zero-delay operation
 543         # this, and the previous result, will be received back-to-back
 544         # (check the output waveform to see this)
 545         result = yield from receive()
 546         assert (result == 2)
 547         yield
 548         yield
 549         # 13 >> 2 = 3
 550         result = yield from receive()
 551         assert (result == 3)
 552
 553     sim.add_sync_process(producer)
 554     sim.add_sync_process(consumer)
 555     sim_writer = sim.write_vcd("test_alu_parallel.vcd")
 556     with sim_writer:
 557         sim.run()
 558
 559
 560 def write_alu_gtkw(gtkw_name, clk_period=1e-6, sub_module=None,
 561                    pysim=True):
 562     """Common function to write the GTKWave documents for this module"""
 563     gtkwave_desc = [
 564         'clk',
 565         'i1[15:0]',
 566         'i2[15:0]',
 567         'op__insn_type' if pysim else 'op__insn_type[6:0]',
 568         'op__invert_in',
 569         'valid_i',
 570         'ready_o',
 571         'valid_o',
 572         'ready_i',
 573         'alu_o[15:0]',
 574     ]
 575     # determine the module name of the DUT
 576     module = 'top'
 577     if sub_module is not None:
 578         module = nmigen_sim_top_module + sub_module
 579     vcd_name = gtkw_name.replace('.gtkw', '.vcd')
 580     write_gtkw(gtkw_name, vcd_name, gtkwave_desc, module=module,
 581                loc=__file__, clk_period=clk_period, base='signed')
 582
 583
if __name__ == "__main__":
    # when run as a script, execute the sequential simulation first and
    # then the producer/consumer simulation
    test_alu()
    test_alu_parallel()

    # disabled: RTLIL dump of the branch ALU into test_branch_alu.il
    # alu = BranchALU(width=16)
    # vl = rtlil.convert(alu, ports=alu.ports())
    # with open("test_branch_alu.il", "w") as f:
    #     f.write(vl)