src/soc/experiment/alu_hier.py

   1 """*Experimental* ALU: based on nmigen alu_hier.py, includes branch-compare ALU
   2
   3 This ALU is *deliberately* designed to add in (unnecessary) delays into
   4 different operations so as to be able to test the 6600-style matrices
   5 and the CompUnits.  Countdown timers wait for (defined) periods before
   6 indicating that the output is valid
   7
   8 A "real" integer ALU would place the answers onto the output bus after
   9 only one cycle (sync)
  10 """
  11
  12 from nmigen import Elaboratable, Signal, Module, Const, Mux, Array
  13 from nmigen.hdl.rec import Record, Layout
  14 from nmigen.cli import main
  15 from nmigen.cli import verilog, rtlil
  16 from nmigen.compat.sim import run_simulation
  17 from nmigen.back.pysim import Simulator, Settle
  18
  19 from soc.decoder.power_enums import InternalOp, Function, CryIn
  20
  21 from soc.fu.alu.alu_input_record import CompALUOpSubset
  22 from soc.fu.cr.cr_input_record import CompCROpSubset
  23
  24 import operator
  25
  26
  27
  28
  29 class Adder(Elaboratable):
  30     def __init__(self, width):
  31         self.invert_a = Signal()
  32         self.a   = Signal(width)
  33         self.b   = Signal(width)
  34         self.o   = Signal(width, name="add_o")
  35
  36     def elaborate(self, platform):
  37         m = Module()
  38         with m.If(self.invert_a):
  39             m.d.comb += self.o.eq((~self.a) + self.b)
  40         with m.Else():
  41             m.d.comb += self.o.eq(self.a + self.b)
  42         return m
  43
  44
  45 class Subtractor(Elaboratable):
  46     def __init__(self, width):
  47         self.a   = Signal(width)
  48         self.b   = Signal(width)
  49         self.o   = Signal(width, name="sub_o")
  50
  51     def elaborate(self, platform):
  52         m = Module()
  53         m.d.comb += self.o.eq(self.a - self.b)
  54         return m
  55
  56
  57 class Multiplier(Elaboratable):
  58     def __init__(self, width):
  59         self.a   = Signal(width)
  60         self.b   = Signal(width)
  61         self.o   = Signal(width, name="mul_o")
  62
  63     def elaborate(self, platform):
  64         m = Module()
  65         m.d.comb += self.o.eq(self.a * self.b)
  66         return m
  67
  68
  69 class Shifter(Elaboratable):
  70     def __init__(self, width):
  71         self.width = width
  72         self.a   = Signal(width)
  73         self.b   = Signal(width)
  74         self.o   = Signal(width, name="shf_o")
  75
  76     def elaborate(self, platform):
  77         m = Module()
  78         btrunc = Signal(self.width)
  79         m.d.comb += btrunc.eq(self.b & Const((1<<self.width)-1))
  80         m.d.comb += self.o.eq(self.a >> btrunc)
  81         return m
  82
  83 class Dummy:
  84     pass
  85
  86
  87 class DummyALU(Elaboratable):
  88     def __init__(self, width):
  89         self.p = Dummy() # make look like nmutil pipeline API
  90         self.p.data_i = Dummy()
  91         self.p.data_i.ctx = Dummy()
  92         self.n = Dummy() # make look like nmutil pipeline API
  93         self.n.data_o = Dummy()
  94         self.p.valid_i = Signal()
  95         self.p.ready_o = Signal()
  96         self.n.ready_i = Signal()
  97         self.n.valid_o = Signal()
  98         self.counter   = Signal(4)
  99         self.op  = CompCROpSubset()
 100         i = []
 101         i.append(Signal(width, name="i1"))
 102         i.append(Signal(width, name="i2"))
 103         i.append(Signal(width, name="i3"))
 104         self.i = Array(i)
 105         self.a, self.b, self.c = i[0], i[1], i[2]
 106         self.out = Array([Signal(width, name="alu_o")])
 107         self.o = self.out[0]
 108         self.width = width
 109         # more "look like nmutil pipeline API"
 110         self.p.data_i.ctx.op = self.op
 111         self.p.data_i.a = self.a
 112         self.p.data_i.b = self.b
 113         self.p.data_i.c = self.c
 114         self.n.data_o.o = self.o
 115
 116     def elaborate(self, platform):
 117         m = Module()
 118
 119         go_now = Signal(reset_less=True) # testing no-delay ALU
 120
 121         with m.If(self.p.valid_i):
 122             # input is valid. next check, if we already said "ready" or not
 123             with m.If(~self.p.ready_o):
 124                 # we didn't say "ready" yet, so say so and initialise
 125                 m.d.sync += self.p.ready_o.eq(1)
 126
 127                 m.d.sync += self.o.eq(self.a)
 128                 m.d.comb += go_now.eq(1)
 129                 m.d.sync += self.counter.eq(1)
 130
 131         with m.Else():
 132             # input says no longer valid, so drop ready as well.
 133             # a "proper" ALU would have had to sync in the opcode and a/b ops
 134             m.d.sync += self.p.ready_o.eq(0)
 135
 136         # ok so the counter's running: when it gets to 1, fire the output
 137         with m.If((self.counter == 1) | go_now):
 138             # set the output as valid if the recipient is ready for it
 139             m.d.sync += self.n.valid_o.eq(1)
 140         with m.If(self.n.ready_i & self.n.valid_o):
 141             m.d.sync += self.n.valid_o.eq(0)
 142             # recipient said it was ready: reset back to known-good.
 143             m.d.sync += self.counter.eq(0) # reset the counter
 144             m.d.sync += self.o.eq(0) # clear the output for tidiness sake
 145
 146         # countdown to 1 (transition from 1 to 0 only on acknowledgement)
 147         with m.If(self.counter > 1):
 148             m.d.sync += self.counter.eq(self.counter - 1)
 149
 150         return m
 151
 152     def __iter__(self):
 153         yield from self.op.ports()
 154         yield self.a
 155         yield self.b
 156         yield self.c
 157         yield self.o
 158
 159     def ports(self):
 160         return list(self)
 161
 162
 163 class ALU(Elaboratable):
 164     def __init__(self, width):
 165         self.p = Dummy() # make look like nmutil pipeline API
 166         self.p.data_i = Dummy()
 167         self.p.data_i.ctx = Dummy()
 168         self.n = Dummy() # make look like nmutil pipeline API
 169         self.n.data_o = Dummy()
 170         self.p.valid_i = Signal()
 171         self.p.ready_o = Signal()
 172         self.n.ready_i = Signal()
 173         self.n.valid_o = Signal()
 174         self.counter   = Signal(4)
 175         self.op = CompALUOpSubset(name="op")
 176         i = []
 177         i.append(Signal(width, name="i1"))
 178         i.append(Signal(width, name="i2"))
 179         self.i = Array(i)
 180         self.a, self.b = i[0], i[1]
 181         self.out = Array([Signal(width, name="alu_o")])
 182         self.o = self.out[0]
 183         self.width = width
 184         # more "look like nmutil pipeline API"
 185         self.p.data_i.ctx.op = self.op
 186         self.p.data_i.a = self.a
 187         self.p.data_i.b = self.b
 188         self.n.data_o.o = self.o
 189
 190     def elaborate(self, platform):
 191         m = Module()
 192         add = Adder(self.width)
 193         mul = Multiplier(self.width)
 194         shf = Shifter(self.width)
 195         sub = Subtractor(self.width)
 196
 197         m.submodules.add = add
 198         m.submodules.mul = mul
 199         m.submodules.shf = shf
 200         m.submodules.sub = sub
 201
 202         # really should not activate absolutely all ALU inputs like this
 203         for mod in [add, mul, shf, sub]:
 204             m.d.comb += [
 205                 mod.a.eq(self.a),
 206                 mod.b.eq(self.b),
 207             ]
 208
 209         # pass invert (and carry later)
 210         m.d.comb += add.invert_a.eq(self.op.invert_a)
 211
 212         go_now = Signal(reset_less=True) # testing no-delay ALU
 213
 214         # ALU sequencer is idle when the count is zero
 215         alu_idle = Signal(reset_less=True)
 216         m.d.comb += alu_idle.eq(self.counter == 0)
 217
 218         # ALU sequencer is done when the count is one
 219         alu_done = Signal(reset_less=True)
 220         m.d.comb += alu_done.eq(self.counter == 1)
 221
 222         # select handshake handling according to ALU type
 223         with m.If(go_now):
 224             # with a combinatorial, no-delay ALU, just pass through
 225             # the handshake signals to the other side
 226             m.d.comb += self.p.ready_o.eq(self.n.ready_i)
 227             m.d.comb += self.n.valid_o.eq(self.p.valid_i)
 228         with m.Else():
 229             # sequential ALU handshake:
 230             # ready_o responds to valid_i, but only if the ALU is idle
 231             m.d.comb += self.p.ready_o.eq(alu_idle)
 232             # select the internally generated valid_o, above
 233             m.d.comb += self.n.valid_o.eq(alu_done)
 234
 235         # hold the ALU result until ready_o is asserted
 236         alu_r = Signal(self.width)
 237
 238         with m.If(alu_idle):
 239             with m.If(self.p.valid_i):
 240
 241                 # as this is a "fake" pipeline, just grab the output right now
 242                 with m.If(self.op.insn_type == InternalOp.OP_ADD):
 243                     m.d.sync += alu_r.eq(add.o)
 244                 with m.Elif(self.op.insn_type == InternalOp.OP_MUL_L64):
 245                     m.d.sync += alu_r.eq(mul.o)
 246                 with m.Elif(self.op.insn_type == InternalOp.OP_SHR):
 247                     m.d.sync += alu_r.eq(shf.o)
 248                 # SUB is zero-delay, no need to register
 249
 250                 # NOTE: all of these are fake, just something to test
 251
 252                 # MUL, to take 5 instructions
 253                 with m.If(self.op.insn_type == InternalOp.OP_MUL_L64):
 254                     m.d.sync += self.counter.eq(5)
 255                 # SHIFT to take 1, straight away
 256                 with m.Elif(self.op.insn_type == InternalOp.OP_SHR):
 257                     m.d.sync += self.counter.eq(1)
 258                 # ADD/SUB to take 3
 259                 with m.Elif(self.op.insn_type == InternalOp.OP_ADD):
 260                     m.d.sync += self.counter.eq(3)
 261                 # others to take no delay
 262                 with m.Else():
 263                     m.d.comb += go_now.eq(1)
 264
 265         with m.Elif(~alu_done | self.n.ready_i):
 266             # decrement the counter while the ALU is neither idle nor finished
 267             m.d.sync += self.counter.eq(self.counter - 1)
 268
 269         # choose between zero-delay output, or registered
 270         with m.If(go_now):
 271             m.d.comb += self.o.eq(sub.o)
 272         # only present the result at the last computation cycle
 273         with m.Elif(alu_done):
 274             m.d.comb += self.o.eq(alu_r)
 275
 276         return m
 277
 278     def __iter__(self):
 279         yield from self.op.ports()
 280         yield self.a
 281         yield self.b
 282         yield self.o
 283         yield self.p.valid_i
 284         yield self.p.ready_o
 285         yield self.n.valid_o
 286         yield self.n.ready_i
 287
 288     def ports(self):
 289         return list(self)
 290
 291
 292 class BranchOp(Elaboratable):
 293     def __init__(self, width, op):
 294         self.a   = Signal(width)
 295         self.b   = Signal(width)
 296         self.o   = Signal(width)
 297         self.op = op
 298
 299     def elaborate(self, platform):
 300         m = Module()
 301         m.d.comb += self.o.eq(Mux(self.op(self.a, self.b), 1, 0))
 302         return m
 303
 304
 305 class BranchALU(Elaboratable):
 306     def __init__(self, width):
 307         self.p = Dummy() # make look like nmutil pipeline API
 308         self.p.data_i = Dummy()
 309         self.p.data_i.ctx = Dummy()
 310         self.n = Dummy() # make look like nmutil pipeline API
 311         self.n.data_o = Dummy()
 312         self.p.valid_i = Signal()
 313         self.p.ready_o = Signal()
 314         self.n.ready_i = Signal()
 315         self.n.valid_o = Signal()
 316         self.counter   = Signal(4)
 317         self.op  = Signal(2)
 318         i = []
 319         i.append(Signal(width, name="i1"))
 320         i.append(Signal(width, name="i2"))
 321         self.i = Array(i)
 322         self.a, self.b = i[0], i[1]
 323         self.out = Array([Signal(width)])
 324         self.o = self.out[0]
 325         self.width = width
 326
 327     def elaborate(self, platform):
 328         m = Module()
 329         bgt = BranchOp(self.width, operator.gt)
 330         blt = BranchOp(self.width, operator.lt)
 331         beq = BranchOp(self.width, operator.eq)
 332         bne = BranchOp(self.width, operator.ne)
 333
 334         m.submodules.bgt = bgt
 335         m.submodules.blt = blt
 336         m.submodules.beq = beq
 337         m.submodules.bne = bne
 338         for mod in [bgt, blt, beq, bne]:
 339             m.d.comb += [
 340                 mod.a.eq(self.a),
 341                 mod.b.eq(self.b),
 342             ]
 343
 344         go_now = Signal(reset_less=True) # testing no-delay ALU
 345         with m.If(self.p.valid_i):
 346             # input is valid. next check, if we already said "ready" or not
 347             with m.If(~self.p.ready_o):
 348                 # we didn't say "ready" yet, so say so and initialise
 349                 m.d.sync += self.p.ready_o.eq(1)
 350
 351                 # as this is a "fake" pipeline, just grab the output right now
 352                 with m.Switch(self.op):
 353                     for i, mod in enumerate([bgt, blt, beq, bne]):
 354                         with m.Case(i):
 355                             m.d.sync += self.o.eq(mod.o)
 356                 m.d.sync += self.counter.eq(5) # branch to take 5 cycles (fake)
 357                 #m.d.comb += go_now.eq(1)
 358         with m.Else():
 359             # input says no longer valid, so drop ready as well.
 360             # a "proper" ALU would have had to sync in the opcode and a/b ops
 361             m.d.sync += self.p.ready_o.eq(0)
 362
 363         # ok so the counter's running: when it gets to 1, fire the output
 364         with m.If((self.counter == 1) | go_now):
 365             # set the output as valid if the recipient is ready for it
 366             m.d.sync += self.n.valid_o.eq(1)
 367         with m.If(self.n.ready_i & self.n.valid_o):
 368             m.d.sync += self.n.valid_o.eq(0)
 369             # recipient said it was ready: reset back to known-good.
 370             m.d.sync += self.counter.eq(0) # reset the counter
 371             m.d.sync += self.o.eq(0) # clear the output for tidiness sake
 372
 373         # countdown to 1 (transition from 1 to 0 only on acknowledgement)
 374         with m.If(self.counter > 1):
 375             m.d.sync += self.counter.eq(self.counter - 1)
 376
 377         return m
 378
 379     def __iter__(self):
 380         yield self.op
 381         yield self.a
 382         yield self.b
 383         yield self.o
 384
 385     def ports(self):
 386         return list(self)
 387
 388 def run_op(dut, a, b, op, inv_a=0):
 389     yield dut.a.eq(a)
 390     yield dut.b.eq(b)
 391     yield dut.op.insn_type.eq(op)
 392     yield dut.op.invert_a.eq(inv_a)
 393     yield dut.n.ready_i.eq(0)
 394     yield dut.p.valid_i.eq(1)
 395
 396     # if valid_o rose on the very first cycle, it is a
 397     # zero-delay ALU
 398     yield Settle()
 399     vld = yield dut.n.valid_o
 400     if vld:
 401         # special case for zero-delay ALU
 402         # we must raise ready_i first, since the combinatorial ALU doesn't
 403         # have any storage, and doesn't dare to assert ready_o back to us
 404         # until we accepted the output data
 405         yield dut.n.ready_i.eq(1)
 406         result = yield dut.o
 407         yield
 408         yield dut.p.valid_i.eq(0)
 409         yield dut.n.ready_i.eq(0)
 410         yield
 411         return result
 412
 413     yield
 414
 415     # wait for the ALU to accept our input data
 416     while not (yield dut.p.ready_o):
 417         yield
 418
 419     yield dut.p.valid_i.eq(0)
 420
 421     # wait for the ALU to present the output data
 422     while not (yield dut.n.valid_o):
 423         yield
 424
 425     # latch the result and lower read_i
 426     yield dut.n.ready_i.eq(1)
 427     result = yield dut.o
 428     yield
 429     yield dut.n.ready_i.eq(0)
 430     yield
 431
 432     return result
 433
 434
 435 def alu_sim(dut):
 436     result = yield from run_op(dut, 5, 3, InternalOp.OP_ADD)
 437     print ("alu_sim add", result)
 438     assert (result == 8)
 439
 440     result = yield from run_op(dut, 2, 3, InternalOp.OP_MUL_L64)
 441     print ("alu_sim mul", result)
 442     assert (result == 6)
 443
 444     result = yield from run_op(dut, 5, 3, InternalOp.OP_ADD, inv_a=1)
 445     print ("alu_sim add-inv", result)
 446     assert (result == 65533)
 447
 448     # test zero-delay ALU
 449     # don't have OP_SUB, so use any other
 450     result = yield from run_op(dut, 5, 3, InternalOp.OP_NOP)
 451     print ("alu_sim sub", result)
 452     assert (result == 2)
 453
 454     result = yield from run_op(dut, 13, 2, InternalOp.OP_SHR)
 455     print ("alu_sim shr", result)
 456     assert (result == 3)
 457
 458
 459 def test_alu():
 460     alu = ALU(width=16)
 461     run_simulation(alu, {"sync": alu_sim(alu)}, vcd_name='test_alusim.vcd')
 462
 463     vl = rtlil.convert(alu, ports=alu.ports())
 464     with open("test_alu.il", "w") as f:
 465         f.write(vl)
 466
 467
 468 def test_alu_parallel():
 469     # Compare with the sequential test implementation, above.
 470     m = Module()
 471     m.submodules.alu = dut = ALU(width=16)
 472     sim = Simulator(m)
 473     sim.add_clock(1e-6)
 474
 475     def send(a, b, op, inv_a=0):
 476         # present input data and assert valid_i
 477         yield dut.a.eq(a)
 478         yield dut.b.eq(b)
 479         yield dut.op.insn_type.eq(op)
 480         yield dut.op.invert_a.eq(inv_a)
 481         yield dut.p.valid_i.eq(1)
 482         yield
 483         # wait for ready_o to be asserted
 484         while not (yield dut.p.ready_o):
 485             yield
 486         # clear input data and negate valid_i
 487         # if send is called again immediately afterwards, there will be no
 488         # visible transition (they will not be negated, after all)
 489         yield dut.p.valid_i.eq(0)
 490         yield dut.a.eq(0)
 491         yield dut.b.eq(0)
 492         yield dut.op.insn_type.eq(0)
 493         yield dut.op.invert_a.eq(0)
 494
 495     def receive():
 496         # signal readiness to receive data
 497         yield dut.n.ready_i.eq(1)
 498         yield
 499         # wait for valid_o to be asserted
 500         while not (yield dut.n.valid_o):
 501             yield
 502         # read result
 503         result = yield dut.o
 504         # negate ready_i
 505         # if receive is called again immediately afterwards, there will be no
 506         # visible transition (it will not be negated, after all)
 507         yield dut.n.ready_i.eq(0)
 508         return result
 509
 510     def producer():
 511         # send a few test cases, interspersed with wait states
 512         # note that, for this test, we do not wait for the result to be ready,
 513         # before presenting the next input
 514         # 5 + 3
 515         yield from send(5, 3, InternalOp.OP_ADD)
 516         yield
 517         yield
 518         # 2 * 3
 519         yield from send(2, 3, InternalOp.OP_MUL_L64)
 520         # (-5) + 3
 521         yield from send(5, 3, InternalOp.OP_ADD, inv_a=1)
 522         yield
 523         # 5 - 3
 524         # note that this is a zero-delay operation
 525         yield from send(5, 3, InternalOp.OP_NOP)
 526         yield
 527         yield
 528         # 13 >> 2
 529         yield from send(13, 2, InternalOp.OP_SHR)
 530
 531     def consumer():
 532         # receive and check results, interspersed with wait states
 533         # the consumer is not in step with the producer, but the
 534         # order of the results are preserved
 535         yield
 536         # 5 + 3 = 8
 537         result = yield from receive()
 538         assert (result == 8)
 539         # 2 * 3 = 6
 540         result = yield from receive()
 541         assert (result == 6)
 542         yield
 543         yield
 544         # (-5) + 3 = -2
 545         result = yield from receive()
 546         assert (result == 65533)  # unsigned equivalent to -2
 547         yield
 548         # 5 - 3 = 2
 549         # note that this is a zero-delay operation
 550         result = yield from receive()
 551         assert (result == 2)
 552         yield
 553         yield
 554         # 13 >> 2 = 3
 555         result = yield from receive()
 556         assert (result == 3)
 557
 558     sim.add_sync_process(producer)
 559     sim.add_sync_process(consumer)
 560     sim_writer = sim.write_vcd(
 561         "test_alu_parallel.vcd",
 562         "test_alu_parallel.gtkw",
 563         traces=dut.ports()
 564     )
 565     with sim_writer:
 566         sim.run()
 567
 568
 569 if __name__ == "__main__":
 570     test_alu()
 571     test_alu_parallel()
 572
 573     # alu = BranchALU(width=16)
 574     # vl = rtlil.convert(alu, ports=alu.ports())
 575     # with open("test_branch_alu.il", "w") as f:
 576     #     f.write(vl)
 577