src/soc/experiment/alu_hier.py

   1 """*Experimental* ALU: based on nmigen alu_hier.py, includes branch-compare ALU
   2
   3 This ALU is *deliberately* designed to add in (unnecessary) delays into
   4 different operations so as to be able to test the 6600-style matrices
   5 and the CompUnits.  Countdown timers wait for (defined) periods before
   6 indicating that the output is valid
   7
   8 A "real" integer ALU would place the answers onto the output bus after
   9 only one cycle (sync)
  10 """
  11
  12 from nmigen import Elaboratable, Signal, Module, Const, Mux, Array
  13 from nmigen.hdl.rec import Record, Layout
  14 from nmigen.cli import main
  15 from nmigen.cli import verilog, rtlil
  16 from nmigen.compat.sim import run_simulation
  17 from nmutil.extend import exts
  18 from nmutil.gtkw import write_gtkw
  19
  20 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
  21 # Also, check out the cxxsim nmigen branch, and latest yosys from git
  22 from nmutil.sim_tmp_alternative import (Simulator, nmigen_sim_top_module,
  23                                         is_engine_pysim)
  24
  25 from soc.decoder.power_enums import MicrOp, Function, CryIn
  26
  27 from soc.fu.alu.alu_input_record import CompALUOpSubset
  28 from soc.fu.cr.cr_input_record import CompCROpSubset
  29
  30 import operator
  31
  32
  33 class Adder(Elaboratable):
  34     def __init__(self, width):
  35         self.invert_in = Signal()
  36         self.a = Signal(width)
  37         self.b = Signal(width)
  38         self.o = Signal(width, name="add_o")
  39
  40     def elaborate(self, platform):
  41         m = Module()
  42         with m.If(self.invert_in):
  43             m.d.comb += self.o.eq((~self.a) + self.b)
  44         with m.Else():
  45             m.d.comb += self.o.eq(self.a + self.b)
  46         return m
  47
  48
  49 class Subtractor(Elaboratable):
  50     def __init__(self, width):
  51         self.a = Signal(width)
  52         self.b = Signal(width)
  53         self.o = Signal(width, name="sub_o")
  54
  55     def elaborate(self, platform):
  56         m = Module()
  57         m.d.comb += self.o.eq(self.a - self.b)
  58         return m
  59
  60
  61 class Multiplier(Elaboratable):
  62     def __init__(self, width):
  63         self.a = Signal(width)
  64         self.b = Signal(width)
  65         self.o = Signal(width, name="mul_o")
  66
  67     def elaborate(self, platform):
  68         m = Module()
  69         m.d.comb += self.o.eq(self.a * self.b)
  70         return m
  71
  72
  73 class Shifter(Elaboratable):
  74     def __init__(self, width):
  75         self.width = width
  76         self.a = Signal(width)
  77         self.b = Signal(width)
  78         self.o = Signal(width, name="shf_o")
  79
  80     def elaborate(self, platform):
  81         m = Module()
  82         btrunc = Signal(self.width)
  83         m.d.comb += btrunc.eq(self.b & Const((1 << self.width)-1))
  84         m.d.comb += self.o.eq(self.a >> btrunc)
  85         return m
  86
  87
  88 class SignExtend(Elaboratable):
  89     def __init__(self, width):
  90         self.width = width
  91         self.a = Signal(width)
  92         self.o = Signal(width, name="exts_o")
  93
  94     def elaborate(self, platform):
  95         m = Module()
  96         m.d.comb += self.o.eq(exts(self.a, 8, self.width))
  97         return m
  98
  99
 100 class Dummy:
 101     pass
 102
 103
class DummyALU(Elaboratable):
    """Three-input test ALU that simply copies input ``a`` to the output.

    The handshake signals are registered (``sync``): when ``p.valid_i`` is
    seen and ``p.ready_o`` is not yet set, the ALU raises ``p.ready_o``,
    latches ``a`` into ``o`` and requests ``n.valid_o``.  The output is
    cleared again once ``n.ready_i`` and ``n.valid_o`` are both set.

    Inputs are ``a``, ``b``, ``c`` (``width`` bits each, also available as
    ``self.i``); the output is ``o`` (also ``self.out[0]``).  ``b`` and ``c``
    are not read by ``elaborate``.
    """

    def __init__(self, width):
        self.p = Dummy()  # make look like nmutil pipeline API
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()  # make look like nmutil pipeline API
        self.n.data_o = Dummy()
        # handshake: p.* faces the producer, n.* faces the consumer
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        self.counter = Signal(4)
        self.op = CompCROpSubset()
        # the three operands, named i1..i3, also exposed as an Array
        i = []
        i.append(Signal(width, name="i1"))
        i.append(Signal(width, name="i2"))
        i.append(Signal(width, name="i3"))
        self.i = Array(i)
        self.a, self.b, self.c = i[0], i[1], i[2]
        # single output, exposed both as out[0] and as o
        self.out = Array([Signal(width, name="alu_o")])
        self.o = self.out[0]
        self.width = width
        # more "look like nmutil pipeline API"
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.p.data_i.c = self.c
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        m = Module()

        go_now = Signal(reset_less=True)  # testing no-delay ALU

        with m.If(self.p.valid_i):
            # input is valid. next check, if we already said "ready" or not
            with m.If(~self.p.ready_o):
                # we didn't say "ready" yet, so say so and initialise
                m.d.sync += self.p.ready_o.eq(1)

                # the "operation": copy input a to the output register
                m.d.sync += self.o.eq(self.a)
                # request valid_o straight away (see the check below)
                m.d.comb += go_now.eq(1)
                m.d.sync += self.counter.eq(1)

        with m.Else():
            # input says no longer valid, so drop ready as well.
            # a "proper" ALU would have had to sync in the opcode and a/b ops
            m.d.sync += self.p.ready_o.eq(0)

        # ok so the counter's running: when it gets to 1, fire the output
        with m.If((self.counter == 1) | go_now):
            # set the output as valid if the recipient is ready for it
            m.d.sync += self.n.valid_o.eq(1)
        with m.If(self.n.ready_i & self.n.valid_o):
            m.d.sync += self.n.valid_o.eq(0)
            # recipient said it was ready: reset back to known-good.
            m.d.sync += self.counter.eq(0)  # reset the counter
            m.d.sync += self.o.eq(0)  # clear the output for tidiness sake

        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
        # NOTE(review): in this class the counter is only ever loaded with
        # 1 or 0, so this decrement looks like it can never trigger.
        with m.If(self.counter > 1):
            m.d.sync += self.counter.eq(self.counter - 1)

        return m

    def __iter__(self):
        # operation record ports first, then the operands and the result
        yield from self.op.ports()
        yield self.a
        yield self.b
        yield self.c
        yield self.o

    def ports(self):
        """Return the signals yielded by ``__iter__`` as a list."""
        return list(self)
 178
 179
class ALU(Elaboratable):
    """Two-input test ALU with deliberately varied per-operation delays.

    All arithmetic submodules run combinatorially on ``a`` and ``b``; a
    countdown in ``self.counter`` then decides when the latched result is
    presented on ``o``:

    * ``OP_MUL_L64``                      -- counter loaded with 5
    * ``OP_ADD``                          -- counter loaded with 3
    * ``OP_SHR``, ``OP_EXTS``, ``OP_EXTSWSLI`` -- counter loaded with 1
    * any other ``insn_type``             -- zero delay: the subtractor
      output is passed straight through and the handshake signals are
      forwarded combinatorially

    ``n.valid_o`` is raised while the counter equals 1, and the counter only
    steps from 1 to 0 when ``n.ready_i`` is set.
    """

    def __init__(self, width):
        self.p = Dummy()  # make look like nmutil pipeline API
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()  # make look like nmutil pipeline API
        self.n.data_o = Dummy()
        # handshake: p.* faces the producer, n.* faces the consumer
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        self.counter = Signal(4)
        self.op = CompALUOpSubset(name="op")
        # the two operands, named i1 and i2, also exposed as an Array
        i = []
        i.append(Signal(width, name="i1"))
        i.append(Signal(width, name="i2"))
        self.i = Array(i)
        self.a, self.b = i[0], i[1]
        # single output, exposed both as out[0] and as o
        self.out = Array([Signal(width, name="alu_o")])
        self.o = self.out[0]
        self.width = width
        # more "look like nmutil pipeline API"
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        m = Module()
        add = Adder(self.width)
        mul = Multiplier(self.width)
        shf = Shifter(self.width)
        sub = Subtractor(self.width)
        ext_sign = SignExtend(self.width)

        m.submodules.add = add
        m.submodules.mul = mul
        m.submodules.shf = shf
        m.submodules.sub = sub
        m.submodules.ext_sign = ext_sign

        # really should not activate absolutely all ALU inputs like this
        for mod in [add, mul, shf, sub]:
            m.d.comb += [
                mod.a.eq(self.a),
                mod.b.eq(self.b),
            ]
        # EXTS sign extends the first input
        with m.If(self.op.insn_type == MicrOp.OP_EXTS):
            m.d.comb += ext_sign.a.eq(self.a)
        # EXTSWSLI sign extends the second input
        with m.Elif(self.op.insn_type == MicrOp.OP_EXTSWSLI):
            m.d.comb += ext_sign.a.eq(self.b)

        # pass invert (and carry later)
        m.d.comb += add.invert_in.eq(self.op.invert_in)

        go_now = Signal(reset_less=True)  # testing no-delay ALU

        # ALU sequencer is idle when the count is zero
        alu_idle = Signal(reset_less=True)
        m.d.comb += alu_idle.eq(self.counter == 0)

        # ALU sequencer is done when the count is one
        alu_done = Signal(reset_less=True)
        m.d.comb += alu_done.eq(self.counter == 1)

        # select handshake handling according to ALU type
        with m.If(go_now):
            # with a combinatorial, no-delay ALU, just pass through
            # the handshake signals to the other side
            m.d.comb += self.p.ready_o.eq(self.n.ready_i)
            m.d.comb += self.n.valid_o.eq(self.p.valid_i)
        with m.Else():
            # sequential ALU handshake:
            # ready_o is asserted whenever the ALU is idle
            m.d.comb += self.p.ready_o.eq(alu_idle)
            # valid_o is asserted while the countdown sits at one
            m.d.comb += self.n.valid_o.eq(alu_done)

        # register holding the latched result; it is driven onto self.o
        # while alu_done is set (see the end of this method)
        alu_r = Signal(self.width)

        with m.If(alu_idle):
            with m.If(self.p.valid_i):

                # as this is a "fake" pipeline, just grab the output right now
                with m.If(self.op.insn_type == MicrOp.OP_ADD):
                    m.d.sync += alu_r.eq(add.o)
                with m.Elif(self.op.insn_type == MicrOp.OP_MUL_L64):
                    m.d.sync += alu_r.eq(mul.o)
                with m.Elif(self.op.insn_type == MicrOp.OP_SHR):
                    m.d.sync += alu_r.eq(shf.o)
                with m.Elif(self.op.insn_type == MicrOp.OP_EXTS):
                    m.d.sync += alu_r.eq(ext_sign.o)
                with m.Elif(self.op.insn_type == MicrOp.OP_EXTSWSLI):
                    m.d.sync += alu_r.eq(ext_sign.o)
                # SUB is zero-delay, no need to register

                # NOTE: all of these are fake, just something to test

                # MUL: load the countdown with 5
                with m.If(self.op.insn_type == MicrOp.OP_MUL_L64):
                    m.d.sync += self.counter.eq(5)
                # SHIFT: load the countdown with 1 (done straight away)
                with m.Elif(self.op.insn_type == MicrOp.OP_SHR):
                    m.d.sync += self.counter.eq(1)
                # ADD: load the countdown with 3
                with m.Elif(self.op.insn_type == MicrOp.OP_ADD):
                    m.d.sync += self.counter.eq(3)
                # EXTS: load the countdown with 1
                with m.Elif(self.op.insn_type == MicrOp.OP_EXTS):
                    m.d.sync += self.counter.eq(1)
                # EXTSWSLI: load the countdown with 1
                with m.Elif(self.op.insn_type == MicrOp.OP_EXTSWSLI):
                    m.d.sync += self.counter.eq(1)
                # others to take no delay
                with m.Else():
                    m.d.comb += go_now.eq(1)

        with m.Elif(~alu_done | self.n.ready_i):
            # not idle: keep counting down until the count reaches one,
            # then take the final step (one to zero) only when the
            # consumer asserts ready_i
            m.d.sync += self.counter.eq(self.counter - 1)

        # choose between zero-delay output, or registered
        with m.If(go_now):
            m.d.comb += self.o.eq(sub.o)
        # only present the result at the last computation cycle
        with m.Elif(alu_done):
            m.d.comb += self.o.eq(alu_r)

        return m

    def __iter__(self):
        # operation record ports, operands, result, then the handshake
        yield from self.op.ports()
        yield self.a
        yield self.b
        yield self.o
        yield self.p.valid_i
        yield self.p.ready_o
        yield self.n.valid_o
        yield self.n.ready_i

    def ports(self):
        """Return the signals yielded by ``__iter__`` as a list."""
        return list(self)
 325
 326
 327 class BranchOp(Elaboratable):
 328     def __init__(self, width, op):
 329         self.a = Signal(width)
 330         self.b = Signal(width)
 331         self.o = Signal(width)
 332         self.op = op
 333
 334     def elaborate(self, platform):
 335         m = Module()
 336         m.d.comb += self.o.eq(Mux(self.op(self.a, self.b), 1, 0))
 337         return m
 338
 339
 340 class BranchALU(Elaboratable):
 341     def __init__(self, width):
 342         self.p = Dummy()  # make look like nmutil pipeline API
 343         self.p.data_i = Dummy()
 344         self.p.data_i.ctx = Dummy()
 345         self.n = Dummy()  # make look like nmutil pipeline API
 346         self.n.data_o = Dummy()
 347         self.p.valid_i = Signal()
 348         self.p.ready_o = Signal()
 349         self.n.ready_i = Signal()
 350         self.n.valid_o = Signal()
 351         self.counter = Signal(4)
 352         self.op = Signal(2)
 353         i = []
 354         i.append(Signal(width, name="i1"))
 355         i.append(Signal(width, name="i2"))
 356         self.i = Array(i)
 357         self.a, self.b = i[0], i[1]
 358         self.out = Array([Signal(width)])
 359         self.o = self.out[0]
 360         self.width = width
 361
 362     def elaborate(self, platform):
 363         m = Module()
 364         bgt = BranchOp(self.width, operator.gt)
 365         blt = BranchOp(self.width, operator.lt)
 366         beq = BranchOp(self.width, operator.eq)
 367         bne = BranchOp(self.width, operator.ne)
 368
 369         m.submodules.bgt = bgt
 370         m.submodules.blt = blt
 371         m.submodules.beq = beq
 372         m.submodules.bne = bne
 373         for mod in [bgt, blt, beq, bne]:
 374             m.d.comb += [
 375                 mod.a.eq(self.a),
 376                 mod.b.eq(self.b),
 377             ]
 378
 379         go_now = Signal(reset_less=True)  # testing no-delay ALU
 380         with m.If(self.p.valid_i):
 381             # input is valid. next check, if we already said "ready" or not
 382             with m.If(~self.p.ready_o):
 383                 # we didn't say "ready" yet, so say so and initialise
 384                 m.d.sync += self.p.ready_o.eq(1)
 385
 386                 # as this is a "fake" pipeline, just grab the output right now
 387                 with m.Switch(self.op):
 388                     for i, mod in enumerate([bgt, blt, beq, bne]):
 389                         with m.Case(i):
 390                             m.d.sync += self.o.eq(mod.o)
 391                 # branch to take 5 cycles (fake)
 392                 m.d.sync += self.counter.eq(5)
 393                 #m.d.comb += go_now.eq(1)
 394         with m.Else():
 395             # input says no longer valid, so drop ready as well.
 396             # a "proper" ALU would have had to sync in the opcode and a/b ops
 397             m.d.sync += self.p.ready_o.eq(0)
 398
 399         # ok so the counter's running: when it gets to 1, fire the output
 400         with m.If((self.counter == 1) | go_now):
 401             # set the output as valid if the recipient is ready for it
 402             m.d.sync += self.n.valid_o.eq(1)
 403         with m.If(self.n.ready_i & self.n.valid_o):
 404             m.d.sync += self.n.valid_o.eq(0)
 405             # recipient said it was ready: reset back to known-good.
 406             m.d.sync += self.counter.eq(0)  # reset the counter
 407             m.d.sync += self.o.eq(0)  # clear the output for tidiness sake
 408
 409         # countdown to 1 (transition from 1 to 0 only on acknowledgement)
 410         with m.If(self.counter > 1):
 411             m.d.sync += self.counter.eq(self.counter - 1)
 412
 413         return m
 414
 415     def __iter__(self):
 416         yield self.op
 417         yield self.a
 418         yield self.b
 419         yield self.o
 420
 421     def ports(self):
 422         return list(self)
 423
 424
 425 def run_op(dut, a, b, op, inv_a=0):
 426     yield dut.a.eq(a)
 427     yield dut.b.eq(b)
 428     yield dut.op.insn_type.eq(op)
 429     yield dut.op.invert_in.eq(inv_a)
 430     yield dut.n.ready_i.eq(0)
 431     yield dut.p.valid_i.eq(1)
 432     yield dut.n.ready_i.eq(1)
 433     yield
 434
 435     # wait for the ALU to accept our input data
 436     while not (yield dut.p.ready_o):
 437         yield
 438
 439     yield dut.p.valid_i.eq(0)
 440     yield dut.a.eq(0)
 441     yield dut.b.eq(0)
 442     yield dut.op.insn_type.eq(0)
 443     yield dut.op.invert_in.eq(0)
 444
 445     # wait for the ALU to present the output data
 446     while not (yield dut.n.valid_o):
 447         yield
 448
 449     # latch the result and lower read_i
 450     result = yield dut.o
 451     yield dut.n.ready_i.eq(0)
 452
 453     return result
 454
 455
 456 def alu_sim(dut):
 457     result = yield from run_op(dut, 5, 3, MicrOp.OP_ADD)
 458     print("alu_sim add", result)
 459     assert (result == 8)
 460
 461     result = yield from run_op(dut, 2, 3, MicrOp.OP_MUL_L64)
 462     print("alu_sim mul", result)
 463     assert (result == 6)
 464
 465     result = yield from run_op(dut, 5, 3, MicrOp.OP_ADD, inv_a=1)
 466     print("alu_sim add-inv", result)
 467     assert (result == 65533)
 468
 469     # test zero-delay ALU
 470     # don't have OP_SUB, so use any other
 471     result = yield from run_op(dut, 5, 3, MicrOp.OP_CMP)
 472     print("alu_sim sub", result)
 473     assert (result == 2)
 474
 475     result = yield from run_op(dut, 13, 2, MicrOp.OP_SHR)
 476     print("alu_sim shr", result)
 477     assert (result == 3)
 478
 479
 480 def test_alu():
 481     alu = ALU(width=16)
 482     write_alu_gtkw("test_alusim.gtkw", clk_period=10e-9)
 483     run_simulation(alu, {"sync": alu_sim(alu)}, vcd_name='test_alusim.vcd')
 484
 485     vl = rtlil.convert(alu, ports=alu.ports())
 486     with open("test_alu.il", "w") as f:
 487         f.write(vl)
 488
 489
def test_alu_parallel():
    """Drive a 16-bit ALU from independent producer and consumer processes.

    The producer sends operations without waiting for their results; the
    consumer collects and checks the results in order.  Writes
    ``test_alu_parallel.gtkw`` and ``test_alu_parallel.vcd``.
    """
    # Compare with the sequential test implementation, above.
    m = Module()
    m.submodules.alu = dut = ALU(width=16)
    write_alu_gtkw("test_alu_parallel.gtkw", sub_module='alu',
                   pysim=is_engine_pysim())

    sim = Simulator(m)
    sim.add_clock(1e-6)

    def send(a, b, op, inv_a=0):
        # present input data and assert valid_i
        yield dut.a.eq(a)
        yield dut.b.eq(b)
        yield dut.op.insn_type.eq(op)
        yield dut.op.invert_in.eq(inv_a)
        yield dut.p.valid_i.eq(1)
        yield
        # wait for ready_o to be asserted
        while not (yield dut.p.ready_o):
            yield
        # clear input data and negate valid_i
        # if send is called again immediately afterwards, there will be no
        # visible transition (they will not be negated, after all)
        yield dut.p.valid_i.eq(0)
        yield dut.a.eq(0)
        yield dut.b.eq(0)
        yield dut.op.insn_type.eq(0)
        yield dut.op.invert_in.eq(0)

    def receive():
        # signal readiness to receive data
        yield dut.n.ready_i.eq(1)
        yield
        # wait for valid_o to be asserted
        while not (yield dut.n.valid_o):
            yield
        # read result
        result = yield dut.o
        # negate ready_i
        # if receive is called again immediately afterwards, there will be no
        # visible transition (it will not be negated, after all)
        yield dut.n.ready_i.eq(0)
        return result

    def producer():
        # send a few test cases, interspersed with wait states
        # note that, for this test, we do not wait for the result to be ready,
        # before presenting the next input
        # 5 + 3
        yield from send(5, 3, MicrOp.OP_ADD)
        yield
        yield
        # 2 * 3
        yield from send(2, 3, MicrOp.OP_MUL_L64)
        # (~5) + 3: first operand is bitwise-inverted by the adder
        yield from send(5, 3, MicrOp.OP_ADD, inv_a=1)
        yield
        # 5 - 3
        # note that this is a zero-delay operation
        yield from send(5, 3, MicrOp.OP_NOP)
        yield
        yield
        # 13 >> 2
        yield from send(13, 2, MicrOp.OP_SHR)
        # sign extend 13 (first operand)
        yield from send(13, 2, MicrOp.OP_EXTS)
        # sign extend -128 (8 bits), taken from the first operand
        yield from send(0x80, 2, MicrOp.OP_EXTS)
        # sign extend -128 (8 bits), taken from the second operand
        yield from send(2, 0x80, MicrOp.OP_EXTSWSLI)

    def consumer():
        # receive and check results, interspersed with wait states
        # the consumer is not in step with the producer, but the
        # order of the results is preserved
        yield
        # 5 + 3 = 8
        result = yield from receive()
        assert (result == 8)
        # 2 * 3 = 6
        result = yield from receive()
        assert (result == 6)
        yield
        yield
        # (~5) + 3 = 65533 in 16 bits
        result = yield from receive()
        assert (result == 65533)  # 16-bit result of (~5) + 3
        # 5 - 3 = 2
        # note that this is a zero-delay operation
        # this, and the previous result, will be received back-to-back
        # (check the output waveform to see this)
        result = yield from receive()
        assert (result == 2)
        yield
        yield
        # 13 >> 2 = 3
        result = yield from receive()
        assert (result == 3)
        # sign extend 13 = 13
        result = yield from receive()
        assert (result == 13)
        # sign extend -128 (8 bits) = -128 (16 bits)
        result = yield from receive()
        assert (result == 0xFF80)
        # sign extend -128 (8 bits) = -128 (16 bits)
        result = yield from receive()
        assert (result == 0xFF80)

    # both processes run against the same clock
    sim.add_sync_process(producer)
    sim.add_sync_process(consumer)
    sim_writer = sim.write_vcd("test_alu_parallel.vcd")
    with sim_writer:
        sim.run()
 604
 605
 606 def write_alu_gtkw(gtkw_name, clk_period=1e-6, sub_module=None,
 607                    pysim=True):
 608     """Common function to write the GTKWave documents for this module"""
 609     gtkwave_desc = [
 610         'clk',
 611         'i1[15:0]',
 612         'i2[15:0]',
 613         'op__insn_type' if pysim else 'op__insn_type[6:0]',
 614         'op__invert_in',
 615         'valid_i',
 616         'ready_o',
 617         'valid_o',
 618         'ready_i',
 619         'alu_o[15:0]',
 620     ]
 621     # determine the module name of the DUT
 622     module = 'top'
 623     if sub_module is not None:
 624         module = nmigen_sim_top_module + sub_module
 625     vcd_name = gtkw_name.replace('.gtkw', '.vcd')
 626     write_gtkw(gtkw_name, vcd_name, gtkwave_desc, module=module,
 627                loc=__file__, clk_period=clk_period, base='signed')
 628
 629
# when run as a script, execute both ALU simulations in turn
if __name__ == "__main__":
    test_alu()
    test_alu_parallel()

    # disabled: RTLIL dump of the branch ALU
    # alu = BranchALU(width=16)
    # vl = rtlil.convert(alu, ports=alu.ports())
    # with open("test_branch_alu.il", "w") as f:
    #     f.write(vl)